libswscale/x86/rgb2rgb_template.c
Go to the documentation of this file.
00001 /*
00002  * software RGB to RGB converter
00003  * pluralize by software PAL8 to RGB converter
00004  *              software YUV to YUV converter
00005  *              software YUV to RGB converter
00006  * Written by Nick Kurshev.
00007  * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
00008  * lot of big-endian byte order fixes by Alex Beregszaszi
00009  *
00010  * This file is part of FFmpeg.
00011  *
00012  * FFmpeg is free software; you can redistribute it and/or
00013  * modify it under the terms of the GNU Lesser General Public
00014  * License as published by the Free Software Foundation; either
00015  * version 2.1 of the License, or (at your option) any later version.
00016  *
00017  * FFmpeg is distributed in the hope that it will be useful,
00018  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00019  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00020  * Lesser General Public License for more details.
00021  *
00022  * You should have received a copy of the GNU Lesser General Public
00023  * License along with FFmpeg; if not, write to the Free Software
00024  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00025  */
00026 
00027 #include <stddef.h>
00028 
/*
 * Instruction-selection macros for the conversion routines below.
 *
 * Each macro is first undefined and then redefined from the
 * COMPILE_TEMPLATE_* switches, which are set outside this section
 * (presumably so that the file can be compiled once per CPU flavour).
 * The values are string literals meant to be pasted into asm templates.
 */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef PAVGB

/* Prefetch and packed byte-average mnemonics.  When neither switch is set,
 * PREFETCH becomes an assembler comment (no instruction is emitted) and
 * PAVGB is left undefined. */
#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PAVGB     "pavgusb"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#define PAVGB     "pavgb"
#else
#define PREFETCH  " # nop"
#endif

/* Instruction used to leave MMX mode after each routine. */
#if COMPILE_TEMPLATE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Store instruction and matching fence.  With MMX2 the stores are
 * non-temporal (movntq) and are followed by sfence; otherwise a plain movq
 * is used and SFENCE becomes an assembler comment. */
#if COMPILE_TEMPLATE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
00059 
00060 #if !COMPILE_TEMPLATE_SSE2
00061 
00062 #if !COMPILE_TEMPLATE_AMD3DNOW
00063 
00064 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
00065 {
00066     uint8_t *dest = dst;
00067     const uint8_t *s = src;
00068     const uint8_t *end;
00069     const uint8_t *mm_end;
00070     end = s + src_size;
00071     __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
00072     mm_end = end - 23;
00073     __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
00074     while (s < mm_end) {
00075         __asm__ volatile(
00076             PREFETCH"    32%1           \n\t"
00077             "movd          %1, %%mm0    \n\t"
00078             "punpckldq    3%1, %%mm0    \n\t"
00079             "movd         6%1, %%mm1    \n\t"
00080             "punpckldq    9%1, %%mm1    \n\t"
00081             "movd        12%1, %%mm2    \n\t"
00082             "punpckldq   15%1, %%mm2    \n\t"
00083             "movd        18%1, %%mm3    \n\t"
00084             "punpckldq   21%1, %%mm3    \n\t"
00085             "por        %%mm7, %%mm0    \n\t"
00086             "por        %%mm7, %%mm1    \n\t"
00087             "por        %%mm7, %%mm2    \n\t"
00088             "por        %%mm7, %%mm3    \n\t"
00089             MOVNTQ"     %%mm0,   %0     \n\t"
00090             MOVNTQ"     %%mm1,  8%0     \n\t"
00091             MOVNTQ"     %%mm2, 16%0     \n\t"
00092             MOVNTQ"     %%mm3, 24%0"
00093             :"=m"(*dest)
00094             :"m"(*s)
00095             :"memory");
00096         dest += 32;
00097         s += 24;
00098     }
00099     __asm__ volatile(SFENCE:::"memory");
00100     __asm__ volatile(EMMS:::"memory");
00101     while (s < end) {
00102         *dest++ = *s++;
00103         *dest++ = *s++;
00104         *dest++ = *s++;
00105         *dest++ = 255;
00106     }
00107 }
00108 
/*
 * Asm fragment that packs 32 bytes held in MMX registers into 24 bytes and
 * stores them; used by rgb32tobgr24.
 *
 * Entry state: mm0, mm1, mm4 and mm5 each hold 8 source bytes, and mm2, mm3,
 * mm6 and mm7 hold copies of them (in that order).
 *
 * Step 1: each copy is shifted right by 8 bits and masked with mask24h, each
 *         original is masked with mask24l, and the pair is ORed together.
 *         NOTE(review): the mask constants are defined elsewhere; presumably
 *         this removes the fourth byte of both pixels so that each register
 *         holds 6 packed bytes - confirm.
 * Step 2: the four partial results are shifted and merged into mm0, mm1 and
 *         mm4.
 * Step 3: the three registers are written to %0, 8%0 and 16%0 with MOVNTQ.
 *
 * All eight MMX registers are overwritten.  The masks are referenced by
 * symbol through MANGLE() rather than passed as asm operands.  The last
 * string has no trailing "\n\t", so the fragment must come last in the asm
 * template that uses it.
 */
#define STORE_BGR24_MMX \
            "psrlq         $8, %%mm2    \n\t" \
            "psrlq         $8, %%mm3    \n\t" \
            "psrlq         $8, %%mm6    \n\t" \
            "psrlq         $8, %%mm7    \n\t" \
            "pand "MANGLE(mask24l)", %%mm0\n\t" \
            "pand "MANGLE(mask24l)", %%mm1\n\t" \
            "pand "MANGLE(mask24l)", %%mm4\n\t" \
            "pand "MANGLE(mask24l)", %%mm5\n\t" \
            "pand "MANGLE(mask24h)", %%mm2\n\t" \
            "pand "MANGLE(mask24h)", %%mm3\n\t" \
            "pand "MANGLE(mask24h)", %%mm6\n\t" \
            "pand "MANGLE(mask24h)", %%mm7\n\t" \
            "por        %%mm2, %%mm0    \n\t" \
            "por        %%mm3, %%mm1    \n\t" \
            "por        %%mm6, %%mm4    \n\t" \
            "por        %%mm7, %%mm5    \n\t" \
 \
            "movq       %%mm1, %%mm2    \n\t" \
            "movq       %%mm4, %%mm3    \n\t" \
            "psllq        $48, %%mm2    \n\t" \
            "psllq        $32, %%mm3    \n\t" \
            "pand "MANGLE(mask24hh)", %%mm2\n\t" \
            "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
            "por        %%mm2, %%mm0    \n\t" \
            "psrlq        $16, %%mm1    \n\t" \
            "psrlq        $32, %%mm4    \n\t" \
            "psllq        $16, %%mm5    \n\t" \
            "por        %%mm3, %%mm1    \n\t" \
            "pand  "MANGLE(mask24hhhh)", %%mm5\n\t" \
            "por        %%mm5, %%mm4    \n\t" \
 \
            MOVNTQ"     %%mm0,   %0     \n\t" \
            MOVNTQ"     %%mm1,  8%0     \n\t" \
            MOVNTQ"     %%mm4, 16%0"
00144 
00145 
00146 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
00147 {
00148     uint8_t *dest = dst;
00149     const uint8_t *s = src;
00150     const uint8_t *end;
00151     const uint8_t *mm_end;
00152     end = s + src_size;
00153     __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
00154     mm_end = end - 31;
00155     while (s < mm_end) {
00156         __asm__ volatile(
00157             PREFETCH"    32%1           \n\t"
00158             "movq          %1, %%mm0    \n\t"
00159             "movq         8%1, %%mm1    \n\t"
00160             "movq        16%1, %%mm4    \n\t"
00161             "movq        24%1, %%mm5    \n\t"
00162             "movq       %%mm0, %%mm2    \n\t"
00163             "movq       %%mm1, %%mm3    \n\t"
00164             "movq       %%mm4, %%mm6    \n\t"
00165             "movq       %%mm5, %%mm7    \n\t"
00166             STORE_BGR24_MMX
00167             :"=m"(*dest)
00168             :"m"(*s)
00169             :"memory");
00170         dest += 24;
00171         s += 32;
00172     }
00173     __asm__ volatile(SFENCE:::"memory");
00174     __asm__ volatile(EMMS:::"memory");
00175     while (s < end) {
00176         *dest++ = *s++;
00177         *dest++ = *s++;
00178         *dest++ = *s++;
00179         s++;
00180     }
00181 }
00182 
00183 /*
00184  original by Strepto/Astral
00185  ported to gcc & bugfixed: A'rpi
00186  MMX2, 3DNOW optimization by Nick Kurshev
00187  32-bit C version, and and&add trick by Michael Niedermayer
00188 */
00189 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
00190 {
00191     register const uint8_t* s=src;
00192     register uint8_t* d=dst;
00193     register const uint8_t *end;
00194     const uint8_t *mm_end;
00195     end = s + src_size;
00196     __asm__ volatile(PREFETCH"    %0"::"m"(*s));
00197     __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
00198     mm_end = end - 15;
00199     while (s<mm_end) {
00200         __asm__ volatile(
00201             PREFETCH"  32%1         \n\t"
00202             "movq        %1, %%mm0  \n\t"
00203             "movq       8%1, %%mm2  \n\t"
00204             "movq     %%mm0, %%mm1  \n\t"
00205             "movq     %%mm2, %%mm3  \n\t"
00206             "pand     %%mm4, %%mm0  \n\t"
00207             "pand     %%mm4, %%mm2  \n\t"
00208             "paddw    %%mm1, %%mm0  \n\t"
00209             "paddw    %%mm3, %%mm2  \n\t"
00210             MOVNTQ"   %%mm0,  %0    \n\t"
00211             MOVNTQ"   %%mm2, 8%0"
00212             :"=m"(*d)
00213             :"m"(*s)
00214         );
00215         d+=16;
00216         s+=16;
00217     }
00218     __asm__ volatile(SFENCE:::"memory");
00219     __asm__ volatile(EMMS:::"memory");
00220     mm_end = end - 3;
00221     while (s < mm_end) {
00222         register unsigned x= *((const uint32_t *)s);
00223         *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
00224         d+=4;
00225         s+=4;
00226     }
00227     if (s < end) {
00228         register unsigned short x= *((const uint16_t *)s);
00229         *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
00230     }
00231 }
00232 
00233 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
00234 {
00235     register const uint8_t* s=src;
00236     register uint8_t* d=dst;
00237     register const uint8_t *end;
00238     const uint8_t *mm_end;
00239     end = s + src_size;
00240     __asm__ volatile(PREFETCH"    %0"::"m"(*s));
00241     __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
00242     __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
00243     mm_end = end - 15;
00244     while (s<mm_end) {
00245         __asm__ volatile(
00246             PREFETCH"  32%1         \n\t"
00247             "movq        %1, %%mm0  \n\t"
00248             "movq       8%1, %%mm2  \n\t"
00249             "movq     %%mm0, %%mm1  \n\t"
00250             "movq     %%mm2, %%mm3  \n\t"
00251             "psrlq       $1, %%mm0  \n\t"
00252             "psrlq       $1, %%mm2  \n\t"
00253             "pand     %%mm7, %%mm0  \n\t"
00254             "pand     %%mm7, %%mm2  \n\t"
00255             "pand     %%mm6, %%mm1  \n\t"
00256             "pand     %%mm6, %%mm3  \n\t"
00257             "por      %%mm1, %%mm0  \n\t"
00258             "por      %%mm3, %%mm2  \n\t"
00259             MOVNTQ"   %%mm0,  %0    \n\t"
00260             MOVNTQ"   %%mm2, 8%0"
00261             :"=m"(*d)
00262             :"m"(*s)
00263         );
00264         d+=16;
00265         s+=16;
00266     }
00267     __asm__ volatile(SFENCE:::"memory");
00268     __asm__ volatile(EMMS:::"memory");
00269     mm_end = end - 3;
00270     while (s < mm_end) {
00271         register uint32_t x= *((const uint32_t*)s);
00272         *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
00273         s+=4;
00274         d+=4;
00275     }
00276     if (s < end) {
00277         register uint16_t x= *((const uint16_t*)s);
00278         *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
00279     }
00280 }
00281 
00282 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
00283 {
00284     const uint8_t *s = src;
00285     const uint8_t *end;
00286     const uint8_t *mm_end;
00287     uint16_t *d = (uint16_t *)dst;
00288     end = s + src_size;
00289     mm_end = end - 15;
00290 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
00291     __asm__ volatile(
00292         "movq           %3, %%mm5   \n\t"
00293         "movq           %4, %%mm6   \n\t"
00294         "movq           %5, %%mm7   \n\t"
00295         "jmp 2f                     \n\t"
00296         ".p2align        4          \n\t"
00297         "1:                         \n\t"
00298         PREFETCH"   32(%1)          \n\t"
00299         "movd         (%1), %%mm0   \n\t"
00300         "movd        4(%1), %%mm3   \n\t"
00301         "punpckldq   8(%1), %%mm0   \n\t"
00302         "punpckldq  12(%1), %%mm3   \n\t"
00303         "movq        %%mm0, %%mm1   \n\t"
00304         "movq        %%mm3, %%mm4   \n\t"
00305         "pand        %%mm6, %%mm0   \n\t"
00306         "pand        %%mm6, %%mm3   \n\t"
00307         "pmaddwd     %%mm7, %%mm0   \n\t"
00308         "pmaddwd     %%mm7, %%mm3   \n\t"
00309         "pand        %%mm5, %%mm1   \n\t"
00310         "pand        %%mm5, %%mm4   \n\t"
00311         "por         %%mm1, %%mm0   \n\t"
00312         "por         %%mm4, %%mm3   \n\t"
00313         "psrld          $5, %%mm0   \n\t"
00314         "pslld         $11, %%mm3   \n\t"
00315         "por         %%mm3, %%mm0   \n\t"
00316         MOVNTQ"      %%mm0, (%0)    \n\t"
00317         "add           $16,  %1     \n\t"
00318         "add            $8,  %0     \n\t"
00319         "2:                         \n\t"
00320         "cmp            %2,  %1     \n\t"
00321         " jb            1b          \n\t"
00322         : "+r" (d), "+r"(s)
00323         : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
00324     );
00325 #else
00326     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
00327     __asm__ volatile(
00328         "movq    %0, %%mm7    \n\t"
00329         "movq    %1, %%mm6    \n\t"
00330         ::"m"(red_16mask),"m"(green_16mask));
00331     while (s < mm_end) {
00332         __asm__ volatile(
00333             PREFETCH"    32%1           \n\t"
00334             "movd          %1, %%mm0    \n\t"
00335             "movd         4%1, %%mm3    \n\t"
00336             "punpckldq    8%1, %%mm0    \n\t"
00337             "punpckldq   12%1, %%mm3    \n\t"
00338             "movq       %%mm0, %%mm1    \n\t"
00339             "movq       %%mm0, %%mm2    \n\t"
00340             "movq       %%mm3, %%mm4    \n\t"
00341             "movq       %%mm3, %%mm5    \n\t"
00342             "psrlq         $3, %%mm0    \n\t"
00343             "psrlq         $3, %%mm3    \n\t"
00344             "pand          %2, %%mm0    \n\t"
00345             "pand          %2, %%mm3    \n\t"
00346             "psrlq         $5, %%mm1    \n\t"
00347             "psrlq         $5, %%mm4    \n\t"
00348             "pand       %%mm6, %%mm1    \n\t"
00349             "pand       %%mm6, %%mm4    \n\t"
00350             "psrlq         $8, %%mm2    \n\t"
00351             "psrlq         $8, %%mm5    \n\t"
00352             "pand       %%mm7, %%mm2    \n\t"
00353             "pand       %%mm7, %%mm5    \n\t"
00354             "por        %%mm1, %%mm0    \n\t"
00355             "por        %%mm4, %%mm3    \n\t"
00356             "por        %%mm2, %%mm0    \n\t"
00357             "por        %%mm5, %%mm3    \n\t"
00358             "psllq        $16, %%mm3    \n\t"
00359             "por        %%mm3, %%mm0    \n\t"
00360             MOVNTQ"     %%mm0, %0       \n\t"
00361             :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
00362         d += 4;
00363         s += 16;
00364     }
00365 #endif
00366     __asm__ volatile(SFENCE:::"memory");
00367     __asm__ volatile(EMMS:::"memory");
00368     while (s < end) {
00369         register int rgb = *(const uint32_t*)s; s += 4;
00370         *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
00371     }
00372 }
00373 
00374 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
00375 {
00376     const uint8_t *s = src;
00377     const uint8_t *end;
00378     const uint8_t *mm_end;
00379     uint16_t *d = (uint16_t *)dst;
00380     end = s + src_size;
00381     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
00382     __asm__ volatile(
00383         "movq          %0, %%mm7    \n\t"
00384         "movq          %1, %%mm6    \n\t"
00385         ::"m"(red_16mask),"m"(green_16mask));
00386     mm_end = end - 15;
00387     while (s < mm_end) {
00388         __asm__ volatile(
00389             PREFETCH"    32%1           \n\t"
00390             "movd          %1, %%mm0    \n\t"
00391             "movd         4%1, %%mm3    \n\t"
00392             "punpckldq    8%1, %%mm0    \n\t"
00393             "punpckldq   12%1, %%mm3    \n\t"
00394             "movq       %%mm0, %%mm1    \n\t"
00395             "movq       %%mm0, %%mm2    \n\t"
00396             "movq       %%mm3, %%mm4    \n\t"
00397             "movq       %%mm3, %%mm5    \n\t"
00398             "psllq         $8, %%mm0    \n\t"
00399             "psllq         $8, %%mm3    \n\t"
00400             "pand       %%mm7, %%mm0    \n\t"
00401             "pand       %%mm7, %%mm3    \n\t"
00402             "psrlq         $5, %%mm1    \n\t"
00403             "psrlq         $5, %%mm4    \n\t"
00404             "pand       %%mm6, %%mm1    \n\t"
00405             "pand       %%mm6, %%mm4    \n\t"
00406             "psrlq        $19, %%mm2    \n\t"
00407             "psrlq        $19, %%mm5    \n\t"
00408             "pand          %2, %%mm2    \n\t"
00409             "pand          %2, %%mm5    \n\t"
00410             "por        %%mm1, %%mm0    \n\t"
00411             "por        %%mm4, %%mm3    \n\t"
00412             "por        %%mm2, %%mm0    \n\t"
00413             "por        %%mm5, %%mm3    \n\t"
00414             "psllq        $16, %%mm3    \n\t"
00415             "por        %%mm3, %%mm0    \n\t"
00416             MOVNTQ"     %%mm0, %0       \n\t"
00417             :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
00418         d += 4;
00419         s += 16;
00420     }
00421     __asm__ volatile(SFENCE:::"memory");
00422     __asm__ volatile(EMMS:::"memory");
00423     while (s < end) {
00424         register int rgb = *(const uint32_t*)s; s += 4;
00425         *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
00426     }
00427 }
00428 
00429 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
00430 {
00431     const uint8_t *s = src;
00432     const uint8_t *end;
00433     const uint8_t *mm_end;
00434     uint16_t *d = (uint16_t *)dst;
00435     end = s + src_size;
00436     mm_end = end - 15;
00437 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
00438     __asm__ volatile(
00439         "movq           %3, %%mm5   \n\t"
00440         "movq           %4, %%mm6   \n\t"
00441         "movq           %5, %%mm7   \n\t"
00442         "jmp            2f          \n\t"
00443         ".p2align        4          \n\t"
00444         "1:                         \n\t"
00445         PREFETCH"   32(%1)          \n\t"
00446         "movd         (%1), %%mm0   \n\t"
00447         "movd        4(%1), %%mm3   \n\t"
00448         "punpckldq   8(%1), %%mm0   \n\t"
00449         "punpckldq  12(%1), %%mm3   \n\t"
00450         "movq        %%mm0, %%mm1   \n\t"
00451         "movq        %%mm3, %%mm4   \n\t"
00452         "pand        %%mm6, %%mm0   \n\t"
00453         "pand        %%mm6, %%mm3   \n\t"
00454         "pmaddwd     %%mm7, %%mm0   \n\t"
00455         "pmaddwd     %%mm7, %%mm3   \n\t"
00456         "pand        %%mm5, %%mm1   \n\t"
00457         "pand        %%mm5, %%mm4   \n\t"
00458         "por         %%mm1, %%mm0   \n\t"
00459         "por         %%mm4, %%mm3   \n\t"
00460         "psrld          $6, %%mm0   \n\t"
00461         "pslld         $10, %%mm3   \n\t"
00462         "por         %%mm3, %%mm0   \n\t"
00463         MOVNTQ"      %%mm0, (%0)    \n\t"
00464         "add           $16,  %1     \n\t"
00465         "add            $8,  %0     \n\t"
00466         "2:                         \n\t"
00467         "cmp            %2,  %1     \n\t"
00468         " jb            1b          \n\t"
00469         : "+r" (d), "+r"(s)
00470         : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
00471     );
00472 #else
00473     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
00474     __asm__ volatile(
00475         "movq          %0, %%mm7    \n\t"
00476         "movq          %1, %%mm6    \n\t"
00477         ::"m"(red_15mask),"m"(green_15mask));
00478     while (s < mm_end) {
00479         __asm__ volatile(
00480             PREFETCH"    32%1           \n\t"
00481             "movd          %1, %%mm0    \n\t"
00482             "movd         4%1, %%mm3    \n\t"
00483             "punpckldq    8%1, %%mm0    \n\t"
00484             "punpckldq   12%1, %%mm3    \n\t"
00485             "movq       %%mm0, %%mm1    \n\t"
00486             "movq       %%mm0, %%mm2    \n\t"
00487             "movq       %%mm3, %%mm4    \n\t"
00488             "movq       %%mm3, %%mm5    \n\t"
00489             "psrlq         $3, %%mm0    \n\t"
00490             "psrlq         $3, %%mm3    \n\t"
00491             "pand          %2, %%mm0    \n\t"
00492             "pand          %2, %%mm3    \n\t"
00493             "psrlq         $6, %%mm1    \n\t"
00494             "psrlq         $6, %%mm4    \n\t"
00495             "pand       %%mm6, %%mm1    \n\t"
00496             "pand       %%mm6, %%mm4    \n\t"
00497             "psrlq         $9, %%mm2    \n\t"
00498             "psrlq         $9, %%mm5    \n\t"
00499             "pand       %%mm7, %%mm2    \n\t"
00500             "pand       %%mm7, %%mm5    \n\t"
00501             "por        %%mm1, %%mm0    \n\t"
00502             "por        %%mm4, %%mm3    \n\t"
00503             "por        %%mm2, %%mm0    \n\t"
00504             "por        %%mm5, %%mm3    \n\t"
00505             "psllq        $16, %%mm3    \n\t"
00506             "por        %%mm3, %%mm0    \n\t"
00507             MOVNTQ"     %%mm0, %0       \n\t"
00508             :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
00509         d += 4;
00510         s += 16;
00511     }
00512 #endif
00513     __asm__ volatile(SFENCE:::"memory");
00514     __asm__ volatile(EMMS:::"memory");
00515     while (s < end) {
00516         register int rgb = *(const uint32_t*)s; s += 4;
00517         *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
00518     }
00519 }
00520 
00521 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
00522 {
00523     const uint8_t *s = src;
00524     const uint8_t *end;
00525     const uint8_t *mm_end;
00526     uint16_t *d = (uint16_t *)dst;
00527     end = s + src_size;
00528     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
00529     __asm__ volatile(
00530         "movq          %0, %%mm7    \n\t"
00531         "movq          %1, %%mm6    \n\t"
00532         ::"m"(red_15mask),"m"(green_15mask));
00533     mm_end = end - 15;
00534     while (s < mm_end) {
00535         __asm__ volatile(
00536             PREFETCH"    32%1           \n\t"
00537             "movd          %1, %%mm0    \n\t"
00538             "movd         4%1, %%mm3    \n\t"
00539             "punpckldq    8%1, %%mm0    \n\t"
00540             "punpckldq   12%1, %%mm3    \n\t"
00541             "movq       %%mm0, %%mm1    \n\t"
00542             "movq       %%mm0, %%mm2    \n\t"
00543             "movq       %%mm3, %%mm4    \n\t"
00544             "movq       %%mm3, %%mm5    \n\t"
00545             "psllq         $7, %%mm0    \n\t"
00546             "psllq         $7, %%mm3    \n\t"
00547             "pand       %%mm7, %%mm0    \n\t"
00548             "pand       %%mm7, %%mm3    \n\t"
00549             "psrlq         $6, %%mm1    \n\t"
00550             "psrlq         $6, %%mm4    \n\t"
00551             "pand       %%mm6, %%mm1    \n\t"
00552             "pand       %%mm6, %%mm4    \n\t"
00553             "psrlq        $19, %%mm2    \n\t"
00554             "psrlq        $19, %%mm5    \n\t"
00555             "pand          %2, %%mm2    \n\t"
00556             "pand          %2, %%mm5    \n\t"
00557             "por        %%mm1, %%mm0    \n\t"
00558             "por        %%mm4, %%mm3    \n\t"
00559             "por        %%mm2, %%mm0    \n\t"
00560             "por        %%mm5, %%mm3    \n\t"
00561             "psllq        $16, %%mm3    \n\t"
00562             "por        %%mm3, %%mm0    \n\t"
00563             MOVNTQ"     %%mm0, %0       \n\t"
00564             :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
00565         d += 4;
00566         s += 16;
00567     }
00568     __asm__ volatile(SFENCE:::"memory");
00569     __asm__ volatile(EMMS:::"memory");
00570     while (s < end) {
00571         register int rgb = *(const uint32_t*)s; s += 4;
00572         *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
00573     }
00574 }
00575 
00576 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
00577 {
00578     const uint8_t *s = src;
00579     const uint8_t *end;
00580     const uint8_t *mm_end;
00581     uint16_t *d = (uint16_t *)dst;
00582     end = s + src_size;
00583     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
00584     __asm__ volatile(
00585         "movq         %0, %%mm7     \n\t"
00586         "movq         %1, %%mm6     \n\t"
00587         ::"m"(red_16mask),"m"(green_16mask));
00588     mm_end = end - 11;
00589     while (s < mm_end) {
00590         __asm__ volatile(
00591             PREFETCH"    32%1           \n\t"
00592             "movd          %1, %%mm0    \n\t"
00593             "movd         3%1, %%mm3    \n\t"
00594             "punpckldq    6%1, %%mm0    \n\t"
00595             "punpckldq    9%1, %%mm3    \n\t"
00596             "movq       %%mm0, %%mm1    \n\t"
00597             "movq       %%mm0, %%mm2    \n\t"
00598             "movq       %%mm3, %%mm4    \n\t"
00599             "movq       %%mm3, %%mm5    \n\t"
00600             "psrlq         $3, %%mm0    \n\t"
00601             "psrlq         $3, %%mm3    \n\t"
00602             "pand          %2, %%mm0    \n\t"
00603             "pand          %2, %%mm3    \n\t"
00604             "psrlq         $5, %%mm1    \n\t"
00605             "psrlq         $5, %%mm4    \n\t"
00606             "pand       %%mm6, %%mm1    \n\t"
00607             "pand       %%mm6, %%mm4    \n\t"
00608             "psrlq         $8, %%mm2    \n\t"
00609             "psrlq         $8, %%mm5    \n\t"
00610             "pand       %%mm7, %%mm2    \n\t"
00611             "pand       %%mm7, %%mm5    \n\t"
00612             "por        %%mm1, %%mm0    \n\t"
00613             "por        %%mm4, %%mm3    \n\t"
00614             "por        %%mm2, %%mm0    \n\t"
00615             "por        %%mm5, %%mm3    \n\t"
00616             "psllq        $16, %%mm3    \n\t"
00617             "por        %%mm3, %%mm0    \n\t"
00618             MOVNTQ"     %%mm0, %0       \n\t"
00619             :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
00620         d += 4;
00621         s += 12;
00622     }
00623     __asm__ volatile(SFENCE:::"memory");
00624     __asm__ volatile(EMMS:::"memory");
00625     while (s < end) {
00626         const int b = *s++;
00627         const int g = *s++;
00628         const int r = *s++;
00629         *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
00630     }
00631 }
00632 
00633 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
00634 {
00635     const uint8_t *s = src;
00636     const uint8_t *end;
00637     const uint8_t *mm_end;
00638     uint16_t *d = (uint16_t *)dst;
00639     end = s + src_size;
00640     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
00641     __asm__ volatile(
00642         "movq         %0, %%mm7     \n\t"
00643         "movq         %1, %%mm6     \n\t"
00644         ::"m"(red_16mask),"m"(green_16mask));
00645     mm_end = end - 15;
00646     while (s < mm_end) {
00647         __asm__ volatile(
00648             PREFETCH"    32%1           \n\t"
00649             "movd          %1, %%mm0    \n\t"
00650             "movd         3%1, %%mm3    \n\t"
00651             "punpckldq    6%1, %%mm0    \n\t"
00652             "punpckldq    9%1, %%mm3    \n\t"
00653             "movq       %%mm0, %%mm1    \n\t"
00654             "movq       %%mm0, %%mm2    \n\t"
00655             "movq       %%mm3, %%mm4    \n\t"
00656             "movq       %%mm3, %%mm5    \n\t"
00657             "psllq         $8, %%mm0    \n\t"
00658             "psllq         $8, %%mm3    \n\t"
00659             "pand       %%mm7, %%mm0    \n\t"
00660             "pand       %%mm7, %%mm3    \n\t"
00661             "psrlq         $5, %%mm1    \n\t"
00662             "psrlq         $5, %%mm4    \n\t"
00663             "pand       %%mm6, %%mm1    \n\t"
00664             "pand       %%mm6, %%mm4    \n\t"
00665             "psrlq        $19, %%mm2    \n\t"
00666             "psrlq        $19, %%mm5    \n\t"
00667             "pand          %2, %%mm2    \n\t"
00668             "pand          %2, %%mm5    \n\t"
00669             "por        %%mm1, %%mm0    \n\t"
00670             "por        %%mm4, %%mm3    \n\t"
00671             "por        %%mm2, %%mm0    \n\t"
00672             "por        %%mm5, %%mm3    \n\t"
00673             "psllq        $16, %%mm3    \n\t"
00674             "por        %%mm3, %%mm0    \n\t"
00675             MOVNTQ"     %%mm0, %0       \n\t"
00676             :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
00677         d += 4;
00678         s += 12;
00679     }
00680     __asm__ volatile(SFENCE:::"memory");
00681     __asm__ volatile(EMMS:::"memory");
00682     while (s < end) {
00683         const int r = *s++;
00684         const int g = *s++;
00685         const int b = *s++;
00686         *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
00687     }
00688 }
00689 
00690 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
00691 {
00692     const uint8_t *s = src;
00693     const uint8_t *end;
00694     const uint8_t *mm_end;
00695     uint16_t *d = (uint16_t *)dst;
00696     end = s + src_size;
00697     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
00698     __asm__ volatile(
00699         "movq          %0, %%mm7    \n\t"
00700         "movq          %1, %%mm6    \n\t"
00701         ::"m"(red_15mask),"m"(green_15mask));
00702     mm_end = end - 11;
00703     while (s < mm_end) {
00704         __asm__ volatile(
00705             PREFETCH"    32%1           \n\t"
00706             "movd          %1, %%mm0    \n\t"
00707             "movd         3%1, %%mm3    \n\t"
00708             "punpckldq    6%1, %%mm0    \n\t"
00709             "punpckldq    9%1, %%mm3    \n\t"
00710             "movq       %%mm0, %%mm1    \n\t"
00711             "movq       %%mm0, %%mm2    \n\t"
00712             "movq       %%mm3, %%mm4    \n\t"
00713             "movq       %%mm3, %%mm5    \n\t"
00714             "psrlq         $3, %%mm0    \n\t"
00715             "psrlq         $3, %%mm3    \n\t"
00716             "pand          %2, %%mm0    \n\t"
00717             "pand          %2, %%mm3    \n\t"
00718             "psrlq         $6, %%mm1    \n\t"
00719             "psrlq         $6, %%mm4    \n\t"
00720             "pand       %%mm6, %%mm1    \n\t"
00721             "pand       %%mm6, %%mm4    \n\t"
00722             "psrlq         $9, %%mm2    \n\t"
00723             "psrlq         $9, %%mm5    \n\t"
00724             "pand       %%mm7, %%mm2    \n\t"
00725             "pand       %%mm7, %%mm5    \n\t"
00726             "por        %%mm1, %%mm0    \n\t"
00727             "por        %%mm4, %%mm3    \n\t"
00728             "por        %%mm2, %%mm0    \n\t"
00729             "por        %%mm5, %%mm3    \n\t"
00730             "psllq        $16, %%mm3    \n\t"
00731             "por        %%mm3, %%mm0    \n\t"
00732             MOVNTQ"     %%mm0, %0       \n\t"
00733             :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
00734         d += 4;
00735         s += 12;
00736     }
00737     __asm__ volatile(SFENCE:::"memory");
00738     __asm__ volatile(EMMS:::"memory");
00739     while (s < end) {
00740         const int b = *s++;
00741         const int g = *s++;
00742         const int r = *s++;
00743         *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
00744     }
00745 }
00746 
/**
 * Convert packed 24-bit pixels (3 bytes each) into 16-bit words carrying
 * 5 bits per component.
 *
 * Per pixel, the scalar tail loop puts the top 5 bits of source byte 2 into
 * bits 0-4, of source byte 1 into bits 5-9 and of source byte 0 into
 * bits 10-14; bit 15 stays clear. The SIMD loop applies the matching shifts
 * (<<7, >>6, >>19) to four pixels per iteration.
 *
 * red_15mask, green_15mask and blue_15mask are defined outside this block;
 * presumably they select bits 10-14, 5-9 and 0-4 of the low word of each
 * dword -- confirm against their definitions.
 *
 * @param src      source pixels, 3 bytes per pixel
 * @param dst      destination, written as uint16_t (2 bytes per pixel)
 * @param src_size size of the source in bytes
 */
00747 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
00748 {
00749     const uint8_t *s = src;
00750     const uint8_t *end;
00751     const uint8_t *mm_end;
00752     uint16_t *d = (uint16_t *)dst;
00753     end = s + src_size;
00754     __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
00755     __asm__ volatile(
00756         "movq         %0, %%mm7     \n\t"
00757         "movq         %1, %%mm6     \n\t"
00758         ::"m"(red_15mask),"m"(green_15mask)); /* masks stay in mm7/mm6 for the whole SIMD loop */
00759     mm_end = end - 15; /* SIMD loop stops 15 bytes early; the scalar loop below finishes the rest */
00760     while (s < mm_end) {
00761         __asm__ volatile(
00762             PREFETCH"   32%1            \n\t"
00763             "movd         %1, %%mm0     \n\t" /* pixel 0 -> low dword of mm0 */
00764             "movd        3%1, %%mm3     \n\t" /* pixel 1 -> low dword of mm3 */
00765             "punpckldq   6%1, %%mm0     \n\t" /* pixel 2 -> high dword of mm0 */
00766             "punpckldq   9%1, %%mm3     \n\t" /* pixel 3 -> high dword of mm3 */
00767             "movq      %%mm0, %%mm1     \n\t"
00768             "movq      %%mm0, %%mm2     \n\t"
00769             "movq      %%mm3, %%mm4     \n\t"
00770             "movq      %%mm3, %%mm5     \n\t"
00771             "psllq        $7, %%mm0     \n\t" /* source byte 0 moved up by 7, then masked with mm7 */
00772             "psllq        $7, %%mm3     \n\t"
00773             "pand      %%mm7, %%mm0     \n\t"
00774             "pand      %%mm7, %%mm3     \n\t"
00775             "psrlq        $6, %%mm1     \n\t" /* source byte 1 moved down by 6, then masked with mm6 */
00776             "psrlq        $6, %%mm4     \n\t"
00777             "pand      %%mm6, %%mm1     \n\t"
00778             "pand      %%mm6, %%mm4     \n\t"
00779             "psrlq       $19, %%mm2     \n\t" /* source byte 2 moved down by 19, then masked with %2 */
00780             "psrlq       $19, %%mm5     \n\t"
00781             "pand         %2, %%mm2     \n\t"
00782             "pand         %2, %%mm5     \n\t"
00783             "por       %%mm1, %%mm0     \n\t"
00784             "por       %%mm4, %%mm3     \n\t"
00785             "por       %%mm2, %%mm0     \n\t"
00786             "por       %%mm5, %%mm3     \n\t"
00787             "psllq       $16, %%mm3     \n\t" /* move pixels 1 and 3 into the odd words */
00788             "por       %%mm3, %%mm0     \n\t" /* mm0 = four 16-bit results in pixel order */
00789             MOVNTQ"    %%mm0, %0        \n\t"
00790             :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
00791         d += 4;  /* 4 output words ... */
00792         s += 12; /* ... from 4 source pixels (12 bytes) */
00793     }
00794     __asm__ volatile(SFENCE:::"memory");
00795     __asm__ volatile(EMMS:::"memory");
00796     while (s < end) { /* scalar tail: one pixel per iteration */
00797         const int r = *s++;
00798         const int g = *s++;
00799         const int b = *s++;
00800         *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
00801     }
00802 }
00803 
/**
 * Expand 16-bit pixels holding three 5-bit components (bits 0-4, 5-9, 10-14)
 * into packed 3-byte pixels.
 *
 * Output byte order per pixel, as written by the scalar tail loop: the
 * component from bits 0-4 first, then bits 5-9, then bits 10-14. Each 5-bit
 * value is widened to 8 bits by replicating its top bits into the low bits.
 *
 * The SIMD loop handles 8 pixels per iteration using two asm statements:
 * the first expands the pixels into one-pixel-per-dword form, the second
 * rearranges the registers and stores through STORE_BGR24_MMX (a macro
 * defined outside this block; presumably it packs eight dword pixels into
 * 24 bytes at %0 -- confirm against its definition).
 *
 * @param src      source pixels, read as uint16_t
 * @param dst      destination, 3 bytes per pixel
 * @param src_size size of the source in bytes (src_size/2 pixels)
 */
00804 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
00805 {
00806     const uint16_t *end;
00807     const uint16_t *mm_end;
00808     uint8_t *d = dst;
00809     const uint16_t *s = (const uint16_t*)src;
00810     end = s + src_size/2;
00811     __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
00812     mm_end = end - 7; /* SIMD loop needs 8 whole pixels left */
00813     while (s < mm_end) {
00814         __asm__ volatile(
00815             PREFETCH"    32%1           \n\t"
00816             "movq          %1, %%mm0    \n\t" /* pixels 0-3, one copy per component */
00817             "movq          %1, %%mm1    \n\t"
00818             "movq          %1, %%mm2    \n\t"
00819             "pand          %2, %%mm0    \n\t" /* mask15b */
00820             "pand          %3, %%mm1    \n\t" /* mask15g */
00821             "pand          %4, %%mm2    \n\t" /* mask15r */
00822             "psllq         $5, %%mm0    \n\t"
00823             "pmulhw        %6, %%mm0    \n\t" /* mul15_mid */
00824             "pmulhw        %6, %%mm1    \n\t" /* mul15_mid */
00825             "pmulhw        %7, %%mm2    \n\t" /* mul15_hi */
00826             "movq       %%mm0, %%mm3    \n\t"
00827             "movq       %%mm1, %%mm4    \n\t"
00828             "movq       %%mm2, %%mm5    \n\t"
00829             "punpcklwd     %5, %%mm0    \n\t" /* widen words to dwords with mmx_null: pixels 0-1 */
00830             "punpcklwd     %5, %%mm1    \n\t"
00831             "punpcklwd     %5, %%mm2    \n\t"
00832             "punpckhwd     %5, %%mm3    \n\t" /* same for pixels 2-3 */
00833             "punpckhwd     %5, %%mm4    \n\t"
00834             "punpckhwd     %5, %%mm5    \n\t"
00835             "psllq         $8, %%mm1    \n\t"
00836             "psllq        $16, %%mm2    \n\t"
00837             "por        %%mm1, %%mm0    \n\t"
00838             "por        %%mm2, %%mm0    \n\t"
00839             "psllq         $8, %%mm4    \n\t"
00840             "psllq        $16, %%mm5    \n\t"
00841             "por        %%mm4, %%mm3    \n\t"
00842             "por        %%mm5, %%mm3    \n\t"
00843 
00844             "movq       %%mm0, %%mm6    \n\t" /* park pixels 0-1 in mm6 */
00845             "movq       %%mm3, %%mm7    \n\t" /* park pixels 2-3 in mm7 */
00846 
00847             "movq         8%1, %%mm0    \n\t" /* repeat the same sequence for pixels 4-7 */
00848             "movq         8%1, %%mm1    \n\t"
00849             "movq         8%1, %%mm2    \n\t"
00850             "pand          %2, %%mm0    \n\t"
00851             "pand          %3, %%mm1    \n\t"
00852             "pand          %4, %%mm2    \n\t"
00853             "psllq         $5, %%mm0    \n\t"
00854             "pmulhw        %6, %%mm0    \n\t"
00855             "pmulhw        %6, %%mm1    \n\t"
00856             "pmulhw        %7, %%mm2    \n\t"
00857             "movq       %%mm0, %%mm3    \n\t"
00858             "movq       %%mm1, %%mm4    \n\t"
00859             "movq       %%mm2, %%mm5    \n\t"
00860             "punpcklwd     %5, %%mm0    \n\t"
00861             "punpcklwd     %5, %%mm1    \n\t"
00862             "punpcklwd     %5, %%mm2    \n\t"
00863             "punpckhwd     %5, %%mm3    \n\t"
00864             "punpckhwd     %5, %%mm4    \n\t"
00865             "punpckhwd     %5, %%mm5    \n\t"
00866             "psllq         $8, %%mm1    \n\t"
00867             "psllq        $16, %%mm2    \n\t"
00868             "por        %%mm1, %%mm0    \n\t"
00869             "por        %%mm2, %%mm0    \n\t"
00870             "psllq         $8, %%mm4    \n\t"
00871             "psllq        $16, %%mm5    \n\t"
00872             "por        %%mm4, %%mm3    \n\t"
00873             "por        %%mm5, %%mm3    \n\t"
00874 
00875             :"=m"(*d)
00876             :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r),"m"(mmx_null),"m"(mul15_mid),"m"(mul15_hi)
00877             :"memory");
00878         /* borrowed 32 to 24 */
00879         __asm__ volatile( /* relies on mm0/mm3/mm6/mm7 left by the previous asm statement */
00880             "movq       %%mm0, %%mm4    \n\t" /* pixels 4-5 */
00881             "movq       %%mm3, %%mm5    \n\t" /* pixels 6-7 */
00882             "movq       %%mm6, %%mm0    \n\t" /* pixels 0-1 */
00883             "movq       %%mm7, %%mm1    \n\t" /* pixels 2-3 */
00884 
00885             "movq       %%mm4, %%mm6    \n\t" /* second copy of each register for the store macro */
00886             "movq       %%mm5, %%mm7    \n\t"
00887             "movq       %%mm0, %%mm2    \n\t"
00888             "movq       %%mm1, %%mm3    \n\t"
00889 
00890             STORE_BGR24_MMX
00891 
00892             :"=m"(*d)
00893             :"m"(*s)
00894             :"memory");
00895         d += 24; /* 8 pixels * 3 bytes */
00896         s += 8;
00897     }
00898     __asm__ volatile(SFENCE:::"memory");
00899     __asm__ volatile(EMMS:::"memory");
00900     while (s < end) { /* scalar tail: one pixel per iteration */
00901         register uint16_t bgr;
00902         bgr = *s++;
00903         *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
00904         *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
00905         *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
00906     }
00907 }
00908 
/**
 * Expand 16-bit 5-6-5 pixels (bits 0-4, 5-10, 11-15) into packed 3-byte
 * pixels.
 *
 * Output byte order per pixel, as written by the scalar tail loop: the
 * component from bits 0-4 first, then the 6-bit component from bits 5-10,
 * then bits 11-15. Each component is widened to 8 bits by replicating its
 * top bits into the low bits.
 *
 * The SIMD loop handles 8 pixels per iteration using two asm statements:
 * the first expands the pixels into one-pixel-per-dword form, the second
 * rearranges the registers and stores through STORE_BGR24_MMX (a macro
 * defined outside this block; presumably it packs eight dword pixels into
 * 24 bytes at %0 -- confirm against its definition).
 *
 * @param src      source pixels, read as uint16_t
 * @param dst      destination, 3 bytes per pixel
 * @param src_size size of the source in bytes (src_size/2 pixels)
 */
00909 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
00910 {
00911     const uint16_t *end;
00912     const uint16_t *mm_end;
00913     uint8_t *d = (uint8_t *)dst;
00914     const uint16_t *s = (const uint16_t *)src;
00915     end = s + src_size/2;
00916     __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
00917     mm_end = end - 7; /* SIMD loop needs 8 whole pixels left */
00918     while (s < mm_end) {
00919         __asm__ volatile(
00920             PREFETCH"    32%1           \n\t"
00921             "movq          %1, %%mm0    \n\t" /* pixels 0-3, one copy per component */
00922             "movq          %1, %%mm1    \n\t"
00923             "movq          %1, %%mm2    \n\t"
00924             "pand          %2, %%mm0    \n\t" /* mask16b */
00925             "pand          %3, %%mm1    \n\t" /* mask16g */
00926             "pand          %4, %%mm2    \n\t" /* mask16r */
00927             "psllq         $5, %%mm0    \n\t"
00928             "psrlq         $1, %%mm2    \n\t" /* extra shift compared with the 15-bit variant */
00929             "pmulhw        %6, %%mm0    \n\t" /* mul15_mid */
00930             "pmulhw        %8, %%mm1    \n\t" /* mul16_mid */
00931             "pmulhw        %7, %%mm2    \n\t" /* mul15_hi */
00932             "movq       %%mm0, %%mm3    \n\t"
00933             "movq       %%mm1, %%mm4    \n\t"
00934             "movq       %%mm2, %%mm5    \n\t"
00935             "punpcklwd     %5, %%mm0    \n\t" /* widen words to dwords with mmx_null: pixels 0-1 */
00936             "punpcklwd     %5, %%mm1    \n\t"
00937             "punpcklwd     %5, %%mm2    \n\t"
00938             "punpckhwd     %5, %%mm3    \n\t" /* same for pixels 2-3 */
00939             "punpckhwd     %5, %%mm4    \n\t"
00940             "punpckhwd     %5, %%mm5    \n\t"
00941             "psllq         $8, %%mm1    \n\t"
00942             "psllq        $16, %%mm2    \n\t"
00943             "por        %%mm1, %%mm0    \n\t"
00944             "por        %%mm2, %%mm0    \n\t"
00945             "psllq         $8, %%mm4    \n\t"
00946             "psllq        $16, %%mm5    \n\t"
00947             "por        %%mm4, %%mm3    \n\t"
00948             "por        %%mm5, %%mm3    \n\t"
00949 
00950             "movq       %%mm0, %%mm6    \n\t" /* park pixels 0-1 in mm6 */
00951             "movq       %%mm3, %%mm7    \n\t" /* park pixels 2-3 in mm7 */
00952 
00953             "movq         8%1, %%mm0    \n\t" /* repeat the same sequence for pixels 4-7 */
00954             "movq         8%1, %%mm1    \n\t"
00955             "movq         8%1, %%mm2    \n\t"
00956             "pand          %2, %%mm0    \n\t"
00957             "pand          %3, %%mm1    \n\t"
00958             "pand          %4, %%mm2    \n\t"
00959             "psllq         $5, %%mm0    \n\t"
00960             "psrlq         $1, %%mm2    \n\t"
00961             "pmulhw        %6, %%mm0    \n\t"
00962             "pmulhw        %8, %%mm1    \n\t"
00963             "pmulhw        %7, %%mm2    \n\t"
00964             "movq       %%mm0, %%mm3    \n\t"
00965             "movq       %%mm1, %%mm4    \n\t"
00966             "movq       %%mm2, %%mm5    \n\t"
00967             "punpcklwd     %5, %%mm0    \n\t"
00968             "punpcklwd     %5, %%mm1    \n\t"
00969             "punpcklwd     %5, %%mm2    \n\t"
00970             "punpckhwd     %5, %%mm3    \n\t"
00971             "punpckhwd     %5, %%mm4    \n\t"
00972             "punpckhwd     %5, %%mm5    \n\t"
00973             "psllq         $8, %%mm1    \n\t"
00974             "psllq        $16, %%mm2    \n\t"
00975             "por        %%mm1, %%mm0    \n\t"
00976             "por        %%mm2, %%mm0    \n\t"
00977             "psllq         $8, %%mm4    \n\t"
00978             "psllq        $16, %%mm5    \n\t"
00979             "por        %%mm4, %%mm3    \n\t"
00980             "por        %%mm5, %%mm3    \n\t"
00981             :"=m"(*d)
00982             :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null),"m"(mul15_mid),"m"(mul15_hi),"m"(mul16_mid)
00983             :"memory");
00984         /* borrowed 32 to 24 */
00985         __asm__ volatile( /* relies on mm0/mm3/mm6/mm7 left by the previous asm statement */
00986             "movq       %%mm0, %%mm4    \n\t" /* pixels 4-5 */
00987             "movq       %%mm3, %%mm5    \n\t" /* pixels 6-7 */
00988             "movq       %%mm6, %%mm0    \n\t" /* pixels 0-1 */
00989             "movq       %%mm7, %%mm1    \n\t" /* pixels 2-3 */
00990 
00991             "movq       %%mm4, %%mm6    \n\t" /* second copy of each register for the store macro */
00992             "movq       %%mm5, %%mm7    \n\t"
00993             "movq       %%mm0, %%mm2    \n\t"
00994             "movq       %%mm1, %%mm3    \n\t"
00995 
00996             STORE_BGR24_MMX
00997 
00998             :"=m"(*d)
00999             :"m"(*s)
01000             :"memory");
01001         d += 24; /* 8 pixels * 3 bytes */
01002         s += 8;
01003     }
01004     __asm__ volatile(SFENCE:::"memory");
01005     __asm__ volatile(EMMS:::"memory");
01006     while (s < end) { /* scalar tail: one pixel per iteration */
01007         register uint16_t bgr;
01008         bgr = *s++;
01009         *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
01010         *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
01011         *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
01012     }
01013 }
01014 
01015 /*
 * PACK_RGB32: asm fragment that turns four pixels held as one 16-bit word
 * per component (three registers) into four 4-byte pixels and stores the
 * resulting 16 bytes to operand %0 with MOVNTQ. The fourth byte of every
 * output pixel comes from mm6 (0xFF). mm3 is used as scratch.
 *
 * Expected register contents on entry:
01016  * mm0 = 00 B3 00 B2 00 B1 00 B0
01017  * mm1 = 00 G3 00 G2 00 G1 00 G0
01018  * mm2 = 00 R3 00 R2 00 R1 00 R0
01019  * mm6 = FF FF FF FF FF FF FF FF
01020  * mm7 = 00 00 00 00 00 00 00 00
01021  */
01022 #define PACK_RGB32 \
01023     "packuswb   %%mm7, %%mm0    \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
01024     "packuswb   %%mm7, %%mm1    \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
01025     "packuswb   %%mm7, %%mm2    \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
01026     "punpcklbw  %%mm1, %%mm0    \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
01027     "punpcklbw  %%mm6, %%mm2    \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
01028     "movq       %%mm0, %%mm3    \n\t"                               \
01029     "punpcklwd  %%mm2, %%mm0    \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
01030     "punpckhwd  %%mm2, %%mm3    \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
01031     MOVNTQ"     %%mm0,  %0      \n\t"                               \
01032     MOVNTQ"     %%mm3, 8%0      \n\t"                               \
01033 
/**
 * Expand 16-bit pixels holding three 5-bit components (bits 0-4, 5-9, 10-14)
 * into 4-byte pixels whose fourth byte is 255.
 *
 * Output byte order per pixel, as written by the scalar tail loop: the
 * component from bits 0-4, then bits 5-9, then bits 10-14, then 255. Each
 * 5-bit value is widened to 8 bits by replicating its top bits into the low
 * bits. The SIMD loop converts 4 pixels per iteration and stores them via
 * PACK_RGB32.
 *
 * @param src      source pixels, read as uint16_t
 * @param dst      destination, 4 bytes per pixel
 * @param src_size size of the source in bytes (src_size/2 pixels)
 */
01034 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
01035 {
01036     const uint16_t *end;
01037     const uint16_t *mm_end;
01038     uint8_t *d = dst;
01039     const uint16_t *s = (const uint16_t *)src;
01040     end = s + src_size/2;
01041     __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
01042     __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory"); /* mm7 = all zero, as PACK_RGB32 expects */
01043     __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory"); /* mm6 = all ones, source of the 0xFF byte */
01044     mm_end = end - 3; /* SIMD loop needs 4 whole pixels left */
01045     while (s < mm_end) {
01046         __asm__ volatile(
01047             PREFETCH"    32%1           \n\t"
01048             "movq          %1, %%mm0    \n\t" /* four pixels, one copy per component */
01049             "movq          %1, %%mm1    \n\t"
01050             "movq          %1, %%mm2    \n\t"
01051             "pand          %2, %%mm0    \n\t" /* mask15b */
01052             "pand          %3, %%mm1    \n\t" /* mask15g */
01053             "pand          %4, %%mm2    \n\t" /* mask15r */
01054             "psllq         $5, %%mm0    \n\t"
01055             "pmulhw        %5, %%mm0    \n\t" /* mul15_mid */
01056             "pmulhw        %5, %%mm1    \n\t" /* mul15_mid */
01057             "pmulhw        %6, %%mm2    \n\t" /* mul15_hi */
01058             PACK_RGB32
01059             :"=m"(*d)
01060             :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r),"m"(mul15_mid),"m"(mul15_hi)
01061             :"memory");
01062         d += 16; /* 4 pixels * 4 bytes */
01063         s += 4;
01064     }
01065     __asm__ volatile(SFENCE:::"memory");
01066     __asm__ volatile(EMMS:::"memory");
01067     while (s < end) { /* scalar tail: one pixel per iteration */
01068         register uint16_t bgr;
01069         bgr = *s++;
01070         *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
01071         *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
01072         *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
01073         *d++ = 255;
01074     }
01075 }
01076 
/**
 * Expand 16-bit 5-6-5 pixels (bits 0-4, 5-10, 11-15) into 4-byte pixels
 * whose fourth byte is 255.
 *
 * Output byte order per pixel, as written by the scalar tail loop: the
 * component from bits 0-4, then the 6-bit component from bits 5-10, then
 * bits 11-15, then 255. Each component is widened to 8 bits by replicating
 * its top bits into the low bits. The SIMD loop converts 4 pixels per
 * iteration and stores them via PACK_RGB32.
 *
 * @param src      source pixels, read as uint16_t
 * @param dst      destination, 4 bytes per pixel
 * @param src_size size of the source in bytes (src_size/2 pixels)
 */
01077 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
01078 {
01079     const uint16_t *end;
01080     const uint16_t *mm_end;
01081     uint8_t *d = dst;
01082     const uint16_t *s = (const uint16_t*)src;
01083     end = s + src_size/2;
01084     __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
01085     __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory"); /* mm7 = all zero, as PACK_RGB32 expects */
01086     __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory"); /* mm6 = all ones, source of the 0xFF byte */
01087     mm_end = end - 3; /* SIMD loop needs 4 whole pixels left */
01088     while (s < mm_end) {
01089         __asm__ volatile(
01090             PREFETCH"    32%1           \n\t"
01091             "movq          %1, %%mm0    \n\t" /* four pixels, one copy per component */
01092             "movq          %1, %%mm1    \n\t"
01093             "movq          %1, %%mm2    \n\t"
01094             "pand          %2, %%mm0    \n\t" /* mask16b */
01095             "pand          %3, %%mm1    \n\t" /* mask16g */
01096             "pand          %4, %%mm2    \n\t" /* mask16r */
01097             "psllq         $5, %%mm0    \n\t"
01098             "psrlq         $1, %%mm2    \n\t" /* extra shift compared with the 15-bit variant */
01099             "pmulhw        %5, %%mm0    \n\t" /* mul15_mid */
01100             "pmulhw        %7, %%mm1    \n\t" /* mul16_mid */
01101             "pmulhw        %6, %%mm2    \n\t" /* mul15_hi */
01102             PACK_RGB32
01103             :"=m"(*d)
01104             :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid),"m"(mul15_hi),"m"(mul16_mid)
01105             :"memory");
01106         d += 16; /* 4 pixels * 4 bytes */
01107         s += 4;
01108     }
01109     __asm__ volatile(SFENCE:::"memory");
01110     __asm__ volatile(EMMS:::"memory");
01111     while (s < end) { /* scalar tail: one pixel per iteration */
01112         register uint16_t bgr;
01113         bgr = *s++;
01114         *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
01115         *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
01116         *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
01117         *d++ = 255;
01118     }
01119 }
01120 
/**
 * Swap bytes 0 and 2 of every 4-byte group, leaving bytes 1 and 3 in place
 * (output byte order 2,1,0,3).
 *
 * idx starts at 15 - src_size and counts upwards; s and d are biased by
 * -idx so that s[idx] / d[idx] always address the current position. The
 * SIMD loop runs while idx is negative and handles 16 bytes per iteration;
 * the scalar loop then handles the remaining bytes 4 at a time while
 * idx < 15.
 *
 * The scalar loop works on uint32_t values: the lanes are shifted left by
 * 16 with a value as large as 0x00ff00ff, which would overflow a signed
 * int (undefined behaviour), whereas unsigned arithmetic gives the intended
 * wrap-free bit pattern.
 *
 * mask32b, mask32r and mmx_one are defined outside this block; presumably
 * mm6 ends up selecting bytes 0 and 2 of each dword and mm7 bytes 1 and 3
 * -- confirm against their definitions.
 *
 * @param src      source bytes, processed in 4-byte groups
 * @param dst      destination, same size as the source
 * @param src_size size of the source in bytes
 */
01121 static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
01122 {
01123     x86_reg idx = 15 - src_size;
01124     const uint8_t *s = src-idx;
01125     uint8_t *d = dst-idx;
01126     __asm__ volatile(
01127         "test          %0, %0           \n\t"
01128         "jns           2f               \n\t" /* fewer than 16 bytes: skip the SIMD loop entirely */
01129         PREFETCH"       (%1, %0)        \n\t"
01130         "movq          %3, %%mm7        \n\t"
01131         "pxor          %4, %%mm7        \n\t"
01132         "movq       %%mm7, %%mm6        \n\t" /* mm6 = mask32b ^ mask32r */
01133         "pxor          %5, %%mm7        \n\t" /* mm7 = mm6 ^ mmx_one */
01134         ".p2align       4               \n\t"
01135         "1:                             \n\t"
01136         PREFETCH"     32(%1, %0)        \n\t"
01137         "movq           (%1, %0), %%mm0 \n\t"
01138         "movq          8(%1, %0), %%mm1 \n\t"
01139 # if COMPILE_TEMPLATE_MMX2
01140         "pshufw      $177, %%mm0, %%mm3 \n\t" /* 177 = 0b10110001: swap the two words of each dword */
01141         "pshufw      $177, %%mm1, %%mm5 \n\t"
01142         "pand       %%mm7, %%mm0        \n\t" /* bytes that stay in place */
01143         "pand       %%mm6, %%mm3        \n\t" /* bytes that trade places, taken from the swapped copy */
01144         "pand       %%mm7, %%mm1        \n\t"
01145         "pand       %%mm6, %%mm5        \n\t"
01146         "por        %%mm3, %%mm0        \n\t"
01147         "por        %%mm5, %%mm1        \n\t"
01148 # else
01149         "movq       %%mm0, %%mm2        \n\t"
01150         "movq       %%mm1, %%mm4        \n\t"
01151         "pand       %%mm7, %%mm0        \n\t" /* bytes that stay in place */
01152         "pand       %%mm6, %%mm2        \n\t" /* bytes that trade places */
01153         "pand       %%mm7, %%mm1        \n\t"
01154         "pand       %%mm6, %%mm4        \n\t"
01155         "movq       %%mm2, %%mm3        \n\t"
01156         "movq       %%mm4, %%mm5        \n\t"
01157         "pslld        $16, %%mm2        \n\t" /* same <<16 / >>16 exchange as the scalar loop below */
01158         "psrld        $16, %%mm3        \n\t"
01159         "pslld        $16, %%mm4        \n\t"
01160         "psrld        $16, %%mm5        \n\t"
01161         "por        %%mm2, %%mm0        \n\t"
01162         "por        %%mm4, %%mm1        \n\t"
01163         "por        %%mm3, %%mm0        \n\t"
01164         "por        %%mm5, %%mm1        \n\t"
01165 # endif
01166         MOVNTQ"     %%mm0,  (%2, %0)    \n\t"
01167         MOVNTQ"     %%mm1, 8(%2, %0)    \n\t"
01168         "add          $16, %0           \n\t"
01169         "js            1b               \n\t"
01170         SFENCE"                         \n\t"
01171         EMMS"                           \n\t"
01172         "2:                             \n\t"
01173         : "+&r"(idx)
01174         : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
01175         : "memory");
01176     for (; idx<15; idx+=4) { /* scalar tail: one 4-byte group per iteration */
01177         register uint32_t v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00; /* g: bytes 1 and 3, kept as-is */
01178         v &= 0xff00ff; /* v: bytes 0 and 2 */
01179         *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16); /* unsigned shift: byte 0 moves to byte 2 and vice versa */
01180     }
01181 }
01182 
/**
 * Swap the first and third byte of every packed 3-byte pixel.
 *
 * mmx_size starts at 23 - src_size and is used as a negative byte index
 * (in REG_a) against src-mmx_size / dst-mmx_size. The SIMD loop runs while
 * the index is negative and converts 24 bytes (8 pixels) per iteration;
 * whatever is left is converted by the scalar loop at the end.
 *
 * NOTE(review): the main asm statement stores through %2 but declares no
 * clobbers (neither "memory" nor the mm registers) -- confirm this is
 * intentional.
 *
 * @param src      source pixels, 3 bytes per pixel
 * @param dst      destination, 3 bytes per pixel
 * @param src_size size of the source in bytes
 */
01183 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
01184 {
01185     unsigned i;
01186     x86_reg mmx_size= 23 - src_size;
01187     __asm__ volatile (
01188         "test             %%"REG_a", %%"REG_a"          \n\t"
01189         "jns                     2f                     \n\t" // fewer than 24 bytes: skip the SIMD loop
01190         "movq     "MANGLE(mask24r)", %%mm5              \n\t"
01191         "movq     "MANGLE(mask24g)", %%mm6              \n\t"
01192         "movq     "MANGLE(mask24b)", %%mm7              \n\t"
01193         ".p2align                 4                     \n\t"
01194         "1:                                             \n\t"
01195         PREFETCH" 32(%1, %%"REG_a")                     \n\t"
01196         "movq       (%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
01197         "movq       (%1, %%"REG_a"), %%mm1              \n\t" // BGR BGR BG
01198         "movq      2(%1, %%"REG_a"), %%mm2              \n\t" // R BGR BGR B
01199         "psllq                  $16, %%mm0              \n\t" // 00 BGR BGR
01200         "pand                 %%mm5, %%mm0              \n\t"
01201         "pand                 %%mm6, %%mm1              \n\t"
01202         "pand                 %%mm7, %%mm2              \n\t"
01203         "por                  %%mm0, %%mm1              \n\t"
01204         "por                  %%mm2, %%mm1              \n\t"
01205         "movq      6(%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
01206         MOVNTQ"               %%mm1,   (%2, %%"REG_a")  \n\t" // RGB RGB RG
01207         "movq      8(%1, %%"REG_a"), %%mm1              \n\t" // R BGR BGR B
01208         "movq     10(%1, %%"REG_a"), %%mm2              \n\t" // GR BGR BGR
01209         "pand                 %%mm7, %%mm0              \n\t"
01210         "pand                 %%mm5, %%mm1              \n\t"
01211         "pand                 %%mm6, %%mm2              \n\t"
01212         "por                  %%mm0, %%mm1              \n\t"
01213         "por                  %%mm2, %%mm1              \n\t"
01214         "movq     14(%1, %%"REG_a"), %%mm0              \n\t" // R BGR BGR B
01215         MOVNTQ"               %%mm1,  8(%2, %%"REG_a")  \n\t" // B RGB RGB R
01216         "movq     16(%1, %%"REG_a"), %%mm1              \n\t" // GR BGR BGR
01217         "movq     18(%1, %%"REG_a"), %%mm2              \n\t" // BGR BGR BG
01218         "pand                 %%mm6, %%mm0              \n\t"
01219         "pand                 %%mm7, %%mm1              \n\t"
01220         "pand                 %%mm5, %%mm2              \n\t"
01221         "por                  %%mm0, %%mm1              \n\t"
01222         "por                  %%mm2, %%mm1              \n\t"
01223         MOVNTQ"               %%mm1, 16(%2, %%"REG_a")  \n\t"
01224         "add                    $24, %%"REG_a"          \n\t" // advance by 8 pixels
01225         " js                     1b                     \n\t"
01226         "2:                                             \n\t"
01227         : "+a" (mmx_size)
01228         : "r" (src-mmx_size), "r"(dst-mmx_size)
01229     );
01230 
01231     __asm__ volatile(SFENCE:::"memory");
01232     __asm__ volatile(EMMS:::"memory");
01233 
01234     if (mmx_size==23) return; // finished: no bytes left, i.e. the pixel count was a multiple of 8 (src_size a multiple of 24)
01235 
01236     src+= src_size; // point just past the end of both buffers ...
01237     dst+= src_size;
01238     src_size= 23-mmx_size; // ... compute the number of bytes the SIMD loop did not handle ...
01239     src-= src_size; // ... and step back to the first unconverted byte
01240     dst-= src_size;
01241     for (i=0; i<src_size; i+=3) {
01242         register uint8_t x;
01243         x          = src[i + 2]; // read byte 2 before dst[i + 2] is written, so src == dst also works
01244         dst[i + 1] = src[i + 1];
01245         dst[i + 2] = src[i + 0];
01246         dst[i + 0] = x;
01247     }
01248 }
01249 
/**
 * Interleave separate Y, U and V planes into one packed plane with the byte
 * order Y U Y V.
 *
 * For every output row the asm loop consumes 8 U bytes, 8 V bytes and
 * 16 Y bytes per iteration and stores 32 packed bytes. REG_a counts chroma
 * samples in steps of 8 and is compared with chromWidth (width>>1) at the
 * bottom of the loop, so the body always executes at least once per row
 * and always handles whole groups of 16 pixels.
 *
 * @param ysrc             luma plane
 * @param usrc             U plane
 * @param vsrc             V plane
 * @param dst              packed output
 * @param width            row width in pixels
 * @param height           number of rows to convert
 * @param lumStride        added to ysrc after every row
 * @param chromStride      added to usrc/vsrc when the chroma row advances
 * @param dstStride        added to dst after every row
 * @param vertLumPerChroma number of luma rows that share one chroma row;
 *                         the row test masks with vertLumPerChroma-1, so it
 *                         acts as a modulo only for powers of two
 */
01250 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01251                                            int width, int height,
01252                                            int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
01253 {
01254     int y;
01255     const x86_reg chromWidth= width>>1;
01256     for (y=0; y<height; y++) {
01257         //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
01258         __asm__ volatile(
01259             "xor                 %%"REG_a", %%"REG_a"   \n\t" // chroma sample index = 0
01260             ".p2align                    4              \n\t"
01261             "1:                                         \n\t"
01262             PREFETCH"    32(%1, %%"REG_a", 2)           \n\t"
01263             PREFETCH"    32(%2, %%"REG_a")              \n\t"
01264             PREFETCH"    32(%3, %%"REG_a")              \n\t"
01265             "movq          (%2, %%"REG_a"), %%mm0       \n\t" // U(0)
01266             "movq                    %%mm0, %%mm2       \n\t" // U(0)
01267             "movq          (%3, %%"REG_a"), %%mm1       \n\t" // V(0)
01268             "punpcklbw               %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
01269             "punpckhbw               %%mm1, %%mm2       \n\t" // UVUV UVUV(8)
01270 
01271             "movq        (%1, %%"REG_a",2), %%mm3       \n\t" // Y(0)
01272             "movq       8(%1, %%"REG_a",2), %%mm5       \n\t" // Y(8)
01273             "movq                    %%mm3, %%mm4       \n\t" // Y(0)
01274             "movq                    %%mm5, %%mm6       \n\t" // Y(8)
01275             "punpcklbw               %%mm0, %%mm3       \n\t" // YUYV YUYV(0)
01276             "punpckhbw               %%mm0, %%mm4       \n\t" // YUYV YUYV(4)
01277             "punpcklbw               %%mm2, %%mm5       \n\t" // YUYV YUYV(8)
01278             "punpckhbw               %%mm2, %%mm6       \n\t" // YUYV YUYV(12)
01279 
01280             MOVNTQ"                  %%mm3,   (%0, %%"REG_a", 4)    \n\t"
01281             MOVNTQ"                  %%mm4,  8(%0, %%"REG_a", 4)    \n\t"
01282             MOVNTQ"                  %%mm5, 16(%0, %%"REG_a", 4)    \n\t"
01283             MOVNTQ"                  %%mm6, 24(%0, %%"REG_a", 4)    \n\t"
01284 
01285             "add                        $8, %%"REG_a"   \n\t" // 8 chroma samples = 16 pixels done
01286             "cmp                        %4, %%"REG_a"   \n\t"
01287             " jb                        1b              \n\t" // loop while index < chromWidth (unsigned compare)
01288             ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
01289             : "%"REG_a
01290         );
01291         if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) { // last luma row of the current chroma row
01292             usrc += chromStride;
01293             vsrc += chromStride;
01294         }
01295         ysrc += lumStride;
01296         dst  += dstStride;
01297     }
01298     __asm__(EMMS"       \n\t"
01299             SFENCE"     \n\t"
01300             :::"memory");
01301 }
01302 
/**
 * Pack planar Y/U/V into Y U Y V order with one chroma row shared by two
 * luma rows: forwards to yuvPlanartoyuy2 with vertLumPerChroma = 2, so the
 * chroma pointers advance after every second output row. Chroma rows are
 * reused as-is, not interpolated (see the FIXME below).
 *
 * All other parameters are passed through unchanged.
 */
01307 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01308                                       int width, int height,
01309                                       int lumStride, int chromStride, int dstStride)
01310 {
01311     //FIXME interpolate chroma
01312     RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
01313 }
01314 
/**
 * Interleave separate Y, U and V planes into one packed plane with the byte
 * order U Y V Y.
 *
 * For every output row the asm loop consumes 8 U bytes, 8 V bytes and
 * 16 Y bytes per iteration and stores 32 packed bytes. REG_a counts chroma
 * samples in steps of 8 and is compared with chromWidth (width>>1) at the
 * bottom of the loop, so the body always executes at least once per row
 * and always handles whole groups of 16 pixels.
 *
 * @param ysrc             luma plane
 * @param usrc             U plane
 * @param vsrc             V plane
 * @param dst              packed output
 * @param width            row width in pixels
 * @param height           number of rows to convert
 * @param lumStride        added to ysrc after every row
 * @param chromStride      added to usrc/vsrc when the chroma row advances
 * @param dstStride        added to dst after every row
 * @param vertLumPerChroma number of luma rows that share one chroma row;
 *                         the row test masks with vertLumPerChroma-1, so it
 *                         acts as a modulo only for powers of two
 */
01315 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01316                                            int width, int height,
01317                                            int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
01318 {
01319     int y;
01320     const x86_reg chromWidth= width>>1;
01321     for (y=0; y<height; y++) {
01322         //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
01323         __asm__ volatile(
01324             "xor                %%"REG_a", %%"REG_a"    \n\t" // chroma sample index = 0
01325             ".p2align                   4               \n\t"
01326             "1:                                         \n\t"
01327             PREFETCH"   32(%1, %%"REG_a", 2)            \n\t"
01328             PREFETCH"   32(%2, %%"REG_a")               \n\t"
01329             PREFETCH"   32(%3, %%"REG_a")               \n\t"
01330             "movq         (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
01331             "movq                   %%mm0, %%mm2        \n\t" // U(0)
01332             "movq         (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
01333             "punpcklbw              %%mm1, %%mm0        \n\t" // UVUV UVUV(0)
01334             "punpckhbw              %%mm1, %%mm2        \n\t" // UVUV UVUV(8)
01335 
01336             "movq       (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
01337             "movq      8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
01338             "movq                   %%mm0, %%mm4        \n\t" // UVUV UVUV(0)
01339             "movq                   %%mm2, %%mm6        \n\t" // UVUV UVUV(8)
01340             "punpcklbw              %%mm3, %%mm0        \n\t" // UYVY UYVY(0)
01341             "punpckhbw              %%mm3, %%mm4        \n\t" // UYVY UYVY(4)
01342             "punpcklbw              %%mm5, %%mm2        \n\t" // UYVY UYVY(8)
01343             "punpckhbw              %%mm5, %%mm6        \n\t" // UYVY UYVY(12)
01344 
01345             MOVNTQ"                 %%mm0,   (%0, %%"REG_a", 4)     \n\t"
01346             MOVNTQ"                 %%mm4,  8(%0, %%"REG_a", 4)     \n\t"
01347             MOVNTQ"                 %%mm2, 16(%0, %%"REG_a", 4)     \n\t"
01348             MOVNTQ"                 %%mm6, 24(%0, %%"REG_a", 4)     \n\t"
01349 
01350             "add                       $8, %%"REG_a"    \n\t" // 8 chroma samples = 16 pixels done
01351             "cmp                       %4, %%"REG_a"    \n\t"
01352             " jb                       1b               \n\t" // loop while index < chromWidth (unsigned compare)
01353             ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
01354             : "%"REG_a
01355         );
01356         if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) { // last luma row of the current chroma row
01357             usrc += chromStride;
01358             vsrc += chromStride;
01359         }
01360         ysrc += lumStride;
01361         dst += dstStride;
01362     }
01363     __asm__(EMMS"       \n\t"
01364             SFENCE"     \n\t"
01365             :::"memory");
01366 }
01367 
/**
 * Pack planar Y/U/V into U Y V Y order with one chroma row shared by two
 * luma rows: forwards to yuvPlanartouyvy with vertLumPerChroma = 2, so the
 * chroma pointers advance after every second output row. Chroma rows are
 * reused as-is, not interpolated (see the FIXME below).
 *
 * All other parameters are passed through unchanged.
 */
01372 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01373                                       int width, int height,
01374                                       int lumStride, int chromStride, int dstStride)
01375 {
01376     //FIXME interpolate chroma
01377     RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
01378 }
01379 
/**
 * Pack planar Y/U/V into U Y V Y order with one chroma row per luma row:
 * forwards to yuvPlanartouyvy with vertLumPerChroma = 1, so the chroma
 * pointers advance after every output row.
 *
 * All other parameters are passed through unchanged.
 */
01383 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01384                                          int width, int height,
01385                                          int lumStride, int chromStride, int dstStride)
01386 {
01387     RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
01388 }
01389 
/**
 * Pack planar Y/U/V into Y U Y V order with one chroma row per luma row:
 * forwards to yuvPlanartoyuy2 with vertLumPerChroma = 1, so the chroma
 * pointers advance after every output row.
 *
 * All other parameters are passed through unchanged.
 */
01393 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01394                                          int width, int height,
01395                                          int lumStride, int chromStride, int dstStride)
01396 {
01397     RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
01398 }
01399 
/**
 * Split packed YUYV data into separate Y, U and V planes.
 *
 * Source lines are consumed in pairs. Luma is written for both lines of a
 * pair; U and V are extracted from the first line only (the chroma bytes of
 * the second line are masked away and never stored).
 *
 * Both inner loops execute at least once and advance 8 chroma samples
 * (32 source bytes, 16 luma bytes) per iteration with no scalar tail, and
 * the outer loop always processes two lines per pass.
 * NOTE(review): this presumably requires width to be a multiple of 16 and
 * height to be a multiple of 2 -- confirm with the callers.
 *
 * @param src         packed source data
 * @param ydst        luma destination, advanced by lumStride per line
 * @param udst        U destination, advanced by chromStride per line pair
 * @param vdst        V destination, advanced by chromStride per line pair
 * @param width       used only to derive chromWidth = width >> 1
 * @param height      number of source lines to process
 * @param lumStride   byte distance between luma destination lines
 * @param chromStride byte distance between chroma destination lines
 * @param srcStride   byte distance between source lines
 */
01404 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
01405                                       int width, int height,
01406                                       int lumStride, int chromStride, int srcStride)
01407 {
01408     int y;
01409     const x86_reg chromWidth= width>>1;
01410     for (y=0; y<height; y+=2) {
          /* First line of the pair: de-interleave Y, U and V.
           * REG_a counts chroma samples, so the source is indexed with scale 4,
           * the luma plane with scale 2 and the chroma planes with scale 1. */
01411         __asm__ volatile(
01412             "xor                 %%"REG_a", %%"REG_a"   \n\t"
01413             "pcmpeqw                 %%mm7, %%mm7       \n\t"
01414             "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00...
01415             ".p2align                    4              \n\t"
01416             "1:                \n\t"
01417             PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
01418             "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
01419             "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
01420             "movq                    %%mm0, %%mm2       \n\t" // YUYV YUYV(0)
01421             "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(4)
01422             "psrlw                      $8, %%mm0       \n\t" // U0V0 U0V0(0)
01423             "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(4)
01424             "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
01425             "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
01426             "packuswb                %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
01427             "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(0)
01428 
01429             MOVNTQ"                  %%mm2, (%1, %%"REG_a", 2)  \n\t"
01430 
01431             "movq     16(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
01432             "movq     24(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
01433             "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(8)
01434             "movq                    %%mm2, %%mm4       \n\t" // YUYV YUYV(12)
01435             "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(8)
01436             "psrlw                      $8, %%mm2       \n\t" // U0V0 U0V0(12)
01437             "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(8)
01438             "pand                    %%mm7, %%mm4       \n\t" // Y0Y0 Y0Y0(12)
01439             "packuswb                %%mm2, %%mm1       \n\t" // UVUV UVUV(8)
01440             "packuswb                %%mm4, %%mm3       \n\t" // YYYY YYYY(8)
01441 
01442             MOVNTQ"                  %%mm3, 8(%1, %%"REG_a", 2) \n\t"
01443 
              /* Second stage: split the packed UV bytes into a V and a U vector. */
01444             "movq                    %%mm0, %%mm2       \n\t" // UVUV UVUV(0)
01445             "movq                    %%mm1, %%mm3       \n\t" // UVUV UVUV(8)
01446             "psrlw                      $8, %%mm0       \n\t" // V0V0 V0V0(0)
01447             "psrlw                      $8, %%mm1       \n\t" // V0V0 V0V0(8)
01448             "pand                    %%mm7, %%mm2       \n\t" // U0U0 U0U0(0)
01449             "pand                    %%mm7, %%mm3       \n\t" // U0U0 U0U0(8)
01450             "packuswb                %%mm1, %%mm0       \n\t" // VVVV VVVV(0)
01451             "packuswb                %%mm3, %%mm2       \n\t" // UUUU UUUU(0)
01452 
01453             MOVNTQ"                  %%mm0, (%3, %%"REG_a")     \n\t"
01454             MOVNTQ"                  %%mm2, (%2, %%"REG_a")     \n\t"
01455 
01456             "add                        $8, %%"REG_a"   \n\t"
01457             "cmp                        %4, %%"REG_a"   \n\t"
01458             " jb                        1b              \n\t" // unsigned: loop while REG_a < chromWidth
01459             ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
01460             : "memory", "%"REG_a
01461         );
01462 
01463         ydst += lumStride;
01464         src  += srcStride;
01465 
          /* Second line of the pair: luma only.
           * mm7 is not reloaded here; it still holds the 0x00FF word mask set up
           * by the asm statement above. Operands %2 and %3 are listed but not
           * referenced by this statement. */
01466         __asm__ volatile(
01467             "xor                 %%"REG_a", %%"REG_a"   \n\t"
01468             ".p2align                    4              \n\t"
01469             "1:                                         \n\t"
01470             PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
01471             "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
01472             "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
01473             "movq     16(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
01474             "movq     24(%0, %%"REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
01475             "pand                    %%mm7, %%mm0       \n\t" // Y0Y0 Y0Y0(0)
01476             "pand                    %%mm7, %%mm1       \n\t" // Y0Y0 Y0Y0(4)
01477             "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(8)
01478             "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(12)
01479             "packuswb                %%mm1, %%mm0       \n\t" // YYYY YYYY(0)
01480             "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(8)
01481 
01482             MOVNTQ"                  %%mm0,  (%1, %%"REG_a", 2) \n\t"
01483             MOVNTQ"                  %%mm2, 8(%1, %%"REG_a", 2) \n\t"
01484 
01485             "add                        $8, %%"REG_a"   \n\t"
01486             "cmp                        %4, %%"REG_a"   \n\t"
01487             " jb                        1b              \n\t"
01488 
01489             ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
01490             : "memory", "%"REG_a
01491         );
01492         udst += chromStride;
01493         vdst += chromStride;
01494         ydst += lumStride;
01495         src  += srcStride;
01496     }
      /* Issue the EMMS and SFENCE macros (defined at the top of this file) once
       * all lines have been processed. */
01497     __asm__ volatile(EMMS"       \n\t"
01498                      SFENCE"     \n\t"
01499                      :::"memory");
01500 }
01501 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
01502 
01503 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
/**
 * Upscale one 8-bit plane by a factor of two in both directions.
 *
 * The output is 2*srcWidth bytes wide and 2*srcHeight lines high. The first
 * and last output lines are interpolated horizontally from the first and
 * last source line alone; every other pair of output lines is blended from
 * two vertically adjacent source lines with 3:1 weights.
 *
 * The asm loop covers the leading srcWidth & ~15 columns of each line pair
 * and the C loop that follows finishes the row. The asm loop is a do-while,
 * so it must not be entered when that column count is zero (srcWidth < 16):
 * it would still run once, and the C loop would then start at x = -1 and
 * touch src[-1] / dst[-1]. In that case column 0 is computed in C instead.
 *
 * @param src       source plane
 * @param dst       destination plane
 * @param srcWidth  source width in bytes
 * @param srcHeight source height in lines
 * @param srcStride byte distance between source lines
 * @param dstStride byte distance between destination lines
 */
01504 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
01505 {
01506     int x,y;
01507 
01508     dst[0]= src[0];
01509 
01510     // first line
01511     for (x=0; x<srcWidth-1; x++) {
01512         dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
01513         dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
01514     }
01515     dst[2*srcWidth-1]= src[srcWidth-1];
01516 
01517     dst+= dstStride;
01518 
01519     for (y=1; y<srcHeight; y++) {
01520         const x86_reg mmxSize= srcWidth&~15;

              if (mmxSize) {
              /* REG_a runs from -mmxSize up to 0 in steps of 8; all four base
               * pointers are pre-offset by mmxSize (2*mmxSize for dst) so that
               * a negative index addresses the start of the row. Each PAVGB is
               * applied twice with the same source operand, which approximates
               * the 3:1 blend used by the C code below. */
01521         __asm__ volatile(
01522             "mov           %4, %%"REG_a"            \n\t"
01523             "movq        "MANGLE(mmx_ff)", %%mm0    \n\t"
01524             "movq         (%0, %%"REG_a"), %%mm4    \n\t"
01525             "movq                   %%mm4, %%mm2    \n\t"
01526             "psllq                     $8, %%mm4    \n\t"
01527             "pand                   %%mm0, %%mm2    \n\t"
01528             "por                    %%mm2, %%mm4    \n\t"
01529             "movq         (%1, %%"REG_a"), %%mm5    \n\t"
01530             "movq                   %%mm5, %%mm3    \n\t"
01531             "psllq                     $8, %%mm5    \n\t"
01532             "pand                   %%mm0, %%mm3    \n\t"
01533             "por                    %%mm3, %%mm5    \n\t"
01534             "1:                                     \n\t"
01535             "movq         (%0, %%"REG_a"), %%mm0    \n\t"
01536             "movq         (%1, %%"REG_a"), %%mm1    \n\t"
01537             "movq        1(%0, %%"REG_a"), %%mm2    \n\t"
01538             "movq        1(%1, %%"REG_a"), %%mm3    \n\t"
01539             PAVGB"                  %%mm0, %%mm5    \n\t"
01540             PAVGB"                  %%mm0, %%mm3    \n\t"
01541             PAVGB"                  %%mm0, %%mm5    \n\t"
01542             PAVGB"                  %%mm0, %%mm3    \n\t"
01543             PAVGB"                  %%mm1, %%mm4    \n\t"
01544             PAVGB"                  %%mm1, %%mm2    \n\t"
01545             PAVGB"                  %%mm1, %%mm4    \n\t"
01546             PAVGB"                  %%mm1, %%mm2    \n\t"
01547             "movq                   %%mm5, %%mm7    \n\t"
01548             "movq                   %%mm4, %%mm6    \n\t"
01549             "punpcklbw              %%mm3, %%mm5    \n\t"
01550             "punpckhbw              %%mm3, %%mm7    \n\t"
01551             "punpcklbw              %%mm2, %%mm4    \n\t"
01552             "punpckhbw              %%mm2, %%mm6    \n\t"
01553             MOVNTQ"                 %%mm5,  (%2, %%"REG_a", 2)  \n\t"
01554             MOVNTQ"                 %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
01555             MOVNTQ"                 %%mm4,  (%3, %%"REG_a", 2)  \n\t"
01556             MOVNTQ"                 %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
01557             "add                       $8, %%"REG_a"            \n\t"
01558             "movq       -1(%0, %%"REG_a"), %%mm4    \n\t"
01559             "movq       -1(%1, %%"REG_a"), %%mm5    \n\t"
01560             " js                       1b                       \n\t"
01561             :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
01562                "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
01563                "g" (-mmxSize)
              /* "memory": the statement stores to dst through pointer operands,
               * and the C code right after it writes the same rows. */
01564             : "memory", "%"REG_a
01565         );
              } else {
                  /* Row narrower than one asm block: produce column 0 here so
                   * the tail loop can start at x = 0. */
                  dst[0        ]= (3*src[0] +   src[srcStride])>>2;
                  dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
              }
01566 
              /* When the asm ran, restart one column early (mmxSize-1) so the
               * samples straddling the asm/C boundary are recomputed in C. */
01567         for (x= mmxSize ? mmxSize-1 : 0; x<srcWidth-1; x++) {
01568             dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
01569             dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
01570             dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
01571             dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
01572         }
01573         dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
01574         dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
01575 
01576         dst+=dstStride*2;
01577         src+=srcStride;
01578     }
01579 
01580     // last line
01581     dst[0]= src[0];
01582 
01583     for (x=0; x<srcWidth-1; x++) {
01584         dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
01585         dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
01586     }
01587     dst[2*srcWidth-1]= src[srcWidth-1];
01588 
01589     __asm__ volatile(EMMS"       \n\t"
01590                      SFENCE"     \n\t"
01591                      :::"memory");
01592 }
01593 #endif /* COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW */
01594 
01595 #if !COMPILE_TEMPLATE_AMD3DNOW
01596 
/**
 * Split packed UYVY data into separate Y, U and V planes.
 *
 * Source lines are consumed in pairs. Luma is written for both lines of a
 * pair; U and V are extracted from the first line only (the chroma bytes of
 * the second line are shifted out and never stored).
 *
 * Both inner loops execute at least once and advance 8 chroma samples
 * (32 source bytes, 16 luma bytes) per iteration with no scalar tail, and
 * the outer loop always processes two lines per pass.
 * NOTE(review): this presumably requires width to be a multiple of 16 and
 * height to be a multiple of 2 -- confirm with the callers.
 *
 * @param src         packed source data
 * @param ydst        luma destination, advanced by lumStride per line
 * @param udst        U destination, advanced by chromStride per line pair
 * @param vdst        V destination, advanced by chromStride per line pair
 * @param width       used only to derive chromWidth = width >> 1
 * @param height      number of source lines to process
 * @param lumStride   byte distance between luma destination lines
 * @param chromStride byte distance between chroma destination lines
 * @param srcStride   byte distance between source lines
 */
01602 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
01603                                       int width, int height,
01604                                       int lumStride, int chromStride, int srcStride)
01605 {
01606     int y;
01607     const x86_reg chromWidth= width>>1;
01608     for (y=0; y<height; y+=2) {
          /* First line of the pair: de-interleave Y, U and V. Here the mask
           * (pand) selects the chroma bytes and the shift (psrlw) selects luma.
           * REG_a counts chroma samples. */
01609         __asm__ volatile(
01610             "xor                 %%"REG_a", %%"REG_a"   \n\t"
01611             "pcmpeqw             %%mm7, %%mm7   \n\t"
01612             "psrlw                  $8, %%mm7   \n\t" // FF,00,FF,00...
01613             ".p2align                4          \n\t"
01614             "1:                                 \n\t"
01615             PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
01616             "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
01617             "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
01618             "movq                %%mm0, %%mm2   \n\t" // UYVY UYVY(0)
01619             "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(4)
01620             "pand                %%mm7, %%mm0   \n\t" // U0V0 U0V0(0)
01621             "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(4)
01622             "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(0)
01623             "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(4)
01624             "packuswb            %%mm1, %%mm0   \n\t" // UVUV UVUV(0)
01625             "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(0)
01626 
01627             MOVNTQ"              %%mm2,  (%1, %%"REG_a", 2) \n\t"
01628 
01629             "movq     16(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(8)
01630             "movq     24(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(12)
01631             "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(8)
01632             "movq                %%mm2, %%mm4   \n\t" // UYVY UYVY(12)
01633             "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(8)
01634             "pand                %%mm7, %%mm2   \n\t" // U0V0 U0V0(12)
01635             "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(8)
01636             "psrlw                  $8, %%mm4   \n\t" // Y0Y0 Y0Y0(12)
01637             "packuswb            %%mm2, %%mm1   \n\t" // UVUV UVUV(8)
01638             "packuswb            %%mm4, %%mm3   \n\t" // YYYY YYYY(8)
01639 
01640             MOVNTQ"              %%mm3, 8(%1, %%"REG_a", 2) \n\t"
01641 
              /* Second stage: split the packed UV bytes into a V and a U vector. */
01642             "movq                %%mm0, %%mm2   \n\t" // UVUV UVUV(0)
01643             "movq                %%mm1, %%mm3   \n\t" // UVUV UVUV(8)
01644             "psrlw                  $8, %%mm0   \n\t" // V0V0 V0V0(0)
01645             "psrlw                  $8, %%mm1   \n\t" // V0V0 V0V0(8)
01646             "pand                %%mm7, %%mm2   \n\t" // U0U0 U0U0(0)
01647             "pand                %%mm7, %%mm3   \n\t" // U0U0 U0U0(8)
01648             "packuswb            %%mm1, %%mm0   \n\t" // VVVV VVVV(0)
01649             "packuswb            %%mm3, %%mm2   \n\t" // UUUU UUUU(0)
01650 
01651             MOVNTQ"              %%mm0, (%3, %%"REG_a") \n\t"
01652             MOVNTQ"              %%mm2, (%2, %%"REG_a") \n\t"
01653 
01654             "add                    $8, %%"REG_a"   \n\t"
01655             "cmp                    %4, %%"REG_a"   \n\t"
01656             " jb                    1b          \n\t" // unsigned: loop while REG_a < chromWidth
01657             ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
01658             : "memory", "%"REG_a
01659         );
01660 
01661         ydst += lumStride;
01662         src  += srcStride;
01663 
          /* Second line of the pair: luma only, obtained by shifting the chroma
           * byte out of every 16-bit word. Operands %2 and %3 are listed but
           * not referenced by this statement. */
01664         __asm__ volatile(
01665             "xor                 %%"REG_a", %%"REG_a"   \n\t"
01666             ".p2align                    4              \n\t"
01667             "1:                                 \n\t"
01668             PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
01669             "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
01670             "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
01671             "movq     16(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(8)
01672             "movq     24(%0, %%"REG_a", 4), %%mm3   \n\t" // UYVY UYVY(12)
01673             "psrlw                  $8, %%mm0   \n\t" // Y0Y0 Y0Y0(0)
01674             "psrlw                  $8, %%mm1   \n\t" // Y0Y0 Y0Y0(4)
01675             "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(8)
01676             "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(12)
01677             "packuswb            %%mm1, %%mm0   \n\t" // YYYY YYYY(0)
01678             "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(8)
01679 
01680             MOVNTQ"              %%mm0,  (%1, %%"REG_a", 2) \n\t"
01681             MOVNTQ"              %%mm2, 8(%1, %%"REG_a", 2) \n\t"
01682 
01683             "add                    $8, %%"REG_a"   \n\t"
01684             "cmp                    %4, %%"REG_a"   \n\t"
01685             " jb                    1b          \n\t"
01686 
01687             ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
01688             : "memory", "%"REG_a
01689         );
01690         udst += chromStride;
01691         vdst += chromStride;
01692         ydst += lumStride;
01693         src  += srcStride;
01694     }
      /* Issue the EMMS and SFENCE macros (defined at the top of this file) once
       * all lines have been processed. */
01695     __asm__ volatile(EMMS"       \n\t"
01696                      SFENCE"     \n\t"
01697                      :::"memory");
01698 }
01699 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
01700 
/**
 * Convert packed 24-bit RGB data (3 bytes per pixel) to planar Y, U and V.
 *
 * The asm path handles source lines in pairs while y < height-2:
 *  - luma is computed for both lines with ff_bgr2YCoeff, 8 pixels per loop
 *    iteration, and offset by ff_bgr2YOffset;
 *  - one chroma line is computed per pair from both source lines with
 *    ff_bgr2UCoeff / ff_bgr2VCoeff, 4 U and 4 V samples per loop iteration,
 *    and offset by ff_bgr2UVOffset. On MMX2/3DNow! the 2x2 neighbourhood is
 *    averaged with PAVGB; on plain MMX the four pixels are summed and
 *    shifted right by 2.
 * Whatever lines remain afterwards (height - y of them) are handed to
 * rgb24toyv12_c().
 *
 * Both asm loops run at least once and have no scalar tail.
 * NOTE(review): the luma loop therefore appears to assume a width that is a
 * multiple of 8 -- confirm with the callers.
 *
 * Both asm statements store to the destination planes through pointer
 * operands, so they carry a "memory" clobber like the other converters in
 * this file.
 *
 * @param src         packed source data
 * @param ydst        luma destination
 * @param udst        U destination
 * @param vdst        V destination
 * @param width       pixels per line; chromWidth is width >> 1
 * @param height      number of source lines
 * @param lumStride   byte distance between luma destination lines
 * @param chromStride byte distance between chroma destination lines
 * @param srcStride   byte distance between source lines
 */
01708 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
01709                                        int width, int height,
01710                                        int lumStride, int chromStride, int srcStride)
01711 {
01712     int y;
01713     const x86_reg chromWidth= width>>1;
01714     for (y=0; y<height-2; y+=2) {
01715         int i;
          /* Luma for the two lines of this pair. REG_a runs from -width up to
           * 0 (pixel index), REG_d = 3*REG_a is the matching source byte
           * offset; both base pointers are pre-offset to the end of the line. */
01716         for (i=0; i<2; i++) {
01717             __asm__ volatile(
01718                 "mov                        %2, %%"REG_a"   \n\t"
01719                 "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
01720                 "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
01721                 "pxor                    %%mm7, %%mm7       \n\t"
01722                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
01723                 ".p2align                    4              \n\t"
01724                 "1:                                         \n\t"
01725                 PREFETCH"    64(%0, %%"REG_d")              \n\t"
01726                 "movd          (%0, %%"REG_d"), %%mm0       \n\t"
01727                 "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
01728                 "punpcklbw               %%mm7, %%mm0       \n\t"
01729                 "punpcklbw               %%mm7, %%mm1       \n\t"
01730                 "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
01731                 "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
01732                 "punpcklbw               %%mm7, %%mm2       \n\t"
01733                 "punpcklbw               %%mm7, %%mm3       \n\t"
01734                 "pmaddwd                 %%mm6, %%mm0       \n\t"
01735                 "pmaddwd                 %%mm6, %%mm1       \n\t"
01736                 "pmaddwd                 %%mm6, %%mm2       \n\t"
01737                 "pmaddwd                 %%mm6, %%mm3       \n\t"
01738 #ifndef FAST_BGR2YV12
01739                 "psrad                      $8, %%mm0       \n\t"
01740                 "psrad                      $8, %%mm1       \n\t"
01741                 "psrad                      $8, %%mm2       \n\t"
01742                 "psrad                      $8, %%mm3       \n\t"
01743 #endif
01744                 "packssdw                %%mm1, %%mm0       \n\t"
01745                 "packssdw                %%mm3, %%mm2       \n\t"
01746                 "pmaddwd                 %%mm5, %%mm0       \n\t"
01747                 "pmaddwd                 %%mm5, %%mm2       \n\t"
01748                 "packssdw                %%mm2, %%mm0       \n\t"
01749                 "psraw                      $7, %%mm0       \n\t"
01750 
01751                 "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
01752                 "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
01753                 "punpcklbw               %%mm7, %%mm4       \n\t"
01754                 "punpcklbw               %%mm7, %%mm1       \n\t"
01755                 "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
01756                 "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
01757                 "punpcklbw               %%mm7, %%mm2       \n\t"
01758                 "punpcklbw               %%mm7, %%mm3       \n\t"
01759                 "pmaddwd                 %%mm6, %%mm4       \n\t"
01760                 "pmaddwd                 %%mm6, %%mm1       \n\t"
01761                 "pmaddwd                 %%mm6, %%mm2       \n\t"
01762                 "pmaddwd                 %%mm6, %%mm3       \n\t"
01763 #ifndef FAST_BGR2YV12
01764                 "psrad                      $8, %%mm4       \n\t"
01765                 "psrad                      $8, %%mm1       \n\t"
01766                 "psrad                      $8, %%mm2       \n\t"
01767                 "psrad                      $8, %%mm3       \n\t"
01768 #endif
01769                 "packssdw                %%mm1, %%mm4       \n\t"
01770                 "packssdw                %%mm3, %%mm2       \n\t"
01771                 "pmaddwd                 %%mm5, %%mm4       \n\t"
01772                 "pmaddwd                 %%mm5, %%mm2       \n\t"
01773                 "add                       $24, %%"REG_d"   \n\t"
01774                 "packssdw                %%mm2, %%mm4       \n\t"
01775                 "psraw                      $7, %%mm4       \n\t"
01776 
01777                 "packuswb                %%mm4, %%mm0       \n\t"
01778                 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
01779 
01780                 MOVNTQ"                  %%mm0, (%1, %%"REG_a") \n\t"
01781                 "add                        $8,      %%"REG_a"  \n\t"
01782                 " js                        1b                  \n\t"
01783                 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
                  /* "memory": the luma plane is written through operand %1. */
01784                 : "memory", "%"REG_a, "%"REG_d
01785             );
01786             ydst += lumStride;
01787             src  += srcStride;
01788         }
          /* Rewind to the first line of the pair for the chroma pass. */
01789         src -= srcStride*2;
          /* Chroma for the pair. REG_a runs from -chromWidth up to 0 (chroma
           * sample index), REG_d = 6*REG_a is the matching source byte offset
           * (two 3-byte pixels per chroma sample). %0 and %1 address the two
           * source lines. */
01790         __asm__ volatile(
01791             "mov                        %4, %%"REG_a"   \n\t"
01792             "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
01793             "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
01794             "pxor                    %%mm7, %%mm7       \n\t"
01795             "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
01796             "add                 %%"REG_d", %%"REG_d"   \n\t"
01797             ".p2align                    4              \n\t"
01798             "1:                                         \n\t"
01799             PREFETCH"    64(%0, %%"REG_d")              \n\t"
01800             PREFETCH"    64(%1, %%"REG_d")              \n\t"
01801 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
01802             "movq          (%0, %%"REG_d"), %%mm0       \n\t"
01803             "movq          (%1, %%"REG_d"), %%mm1       \n\t"
01804             "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
01805             "movq         6(%1, %%"REG_d"), %%mm3       \n\t"
01806             PAVGB"                   %%mm1, %%mm0       \n\t"
01807             PAVGB"                   %%mm3, %%mm2       \n\t"
01808             "movq                    %%mm0, %%mm1       \n\t"
01809             "movq                    %%mm2, %%mm3       \n\t"
01810             "psrlq                     $24, %%mm0       \n\t"
01811             "psrlq                     $24, %%mm2       \n\t"
01812             PAVGB"                   %%mm1, %%mm0       \n\t"
01813             PAVGB"                   %%mm3, %%mm2       \n\t"
01814             "punpcklbw               %%mm7, %%mm0       \n\t"
01815             "punpcklbw               %%mm7, %%mm2       \n\t"
01816 #else
01817             "movd          (%0, %%"REG_d"), %%mm0       \n\t"
01818             "movd          (%1, %%"REG_d"), %%mm1       \n\t"
01819             "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
01820             "movd         3(%1, %%"REG_d"), %%mm3       \n\t"
01821             "punpcklbw               %%mm7, %%mm0       \n\t"
01822             "punpcklbw               %%mm7, %%mm1       \n\t"
01823             "punpcklbw               %%mm7, %%mm2       \n\t"
01824             "punpcklbw               %%mm7, %%mm3       \n\t"
01825             "paddw                   %%mm1, %%mm0       \n\t"
01826             "paddw                   %%mm3, %%mm2       \n\t"
01827             "paddw                   %%mm2, %%mm0       \n\t"
01828             "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
01829             "movd         6(%1, %%"REG_d"), %%mm1       \n\t"
01830             "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
01831             "movd         9(%1, %%"REG_d"), %%mm3       \n\t"
01832             "punpcklbw               %%mm7, %%mm4       \n\t"
01833             "punpcklbw               %%mm7, %%mm1       \n\t"
01834             "punpcklbw               %%mm7, %%mm2       \n\t"
01835             "punpcklbw               %%mm7, %%mm3       \n\t"
01836             "paddw                   %%mm1, %%mm4       \n\t"
01837             "paddw                   %%mm3, %%mm2       \n\t"
01838             "paddw                   %%mm4, %%mm2       \n\t"
01839             "psrlw                      $2, %%mm0       \n\t"
01840             "psrlw                      $2, %%mm2       \n\t"
01841 #endif
01842             "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
01843             "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
01844 
01845             "pmaddwd                 %%mm0, %%mm1       \n\t"
01846             "pmaddwd                 %%mm2, %%mm3       \n\t"
01847             "pmaddwd                 %%mm6, %%mm0       \n\t"
01848             "pmaddwd                 %%mm6, %%mm2       \n\t"
01849 #ifndef FAST_BGR2YV12
01850             "psrad                      $8, %%mm0       \n\t"
01851             "psrad                      $8, %%mm1       \n\t"
01852             "psrad                      $8, %%mm2       \n\t"
01853             "psrad                      $8, %%mm3       \n\t"
01854 #endif
01855             "packssdw                %%mm2, %%mm0       \n\t"
01856             "packssdw                %%mm3, %%mm1       \n\t"
01857             "pmaddwd                 %%mm5, %%mm0       \n\t"
01858             "pmaddwd                 %%mm5, %%mm1       \n\t"
01859             "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
01860             "psraw                      $7, %%mm0       \n\t"
01861 
01862 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
01863             "movq        12(%0, %%"REG_d"), %%mm4       \n\t"
01864             "movq        12(%1, %%"REG_d"), %%mm1       \n\t"
01865             "movq        18(%0, %%"REG_d"), %%mm2       \n\t"
01866             "movq        18(%1, %%"REG_d"), %%mm3       \n\t"
01867             PAVGB"                   %%mm1, %%mm4       \n\t"
01868             PAVGB"                   %%mm3, %%mm2       \n\t"
01869             "movq                    %%mm4, %%mm1       \n\t"
01870             "movq                    %%mm2, %%mm3       \n\t"
01871             "psrlq                     $24, %%mm4       \n\t"
01872             "psrlq                     $24, %%mm2       \n\t"
01873             PAVGB"                   %%mm1, %%mm4       \n\t"
01874             PAVGB"                   %%mm3, %%mm2       \n\t"
01875             "punpcklbw               %%mm7, %%mm4       \n\t"
01876             "punpcklbw               %%mm7, %%mm2       \n\t"
01877 #else
01878             "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
01879             "movd        12(%1, %%"REG_d"), %%mm1       \n\t"
01880             "movd        15(%0, %%"REG_d"), %%mm2       \n\t"
01881             "movd        15(%1, %%"REG_d"), %%mm3       \n\t"
01882             "punpcklbw               %%mm7, %%mm4       \n\t"
01883             "punpcklbw               %%mm7, %%mm1       \n\t"
01884             "punpcklbw               %%mm7, %%mm2       \n\t"
01885             "punpcklbw               %%mm7, %%mm3       \n\t"
01886             "paddw                   %%mm1, %%mm4       \n\t"
01887             "paddw                   %%mm3, %%mm2       \n\t"
01888             "paddw                   %%mm2, %%mm4       \n\t"
              /* mm5 is borrowed as scratch below and reloaded with ff_w1111
               * before it is needed again. */
01889             "movd        18(%0, %%"REG_d"), %%mm5       \n\t"
01890             "movd        18(%1, %%"REG_d"), %%mm1       \n\t"
01891             "movd        21(%0, %%"REG_d"), %%mm2       \n\t"
01892             "movd        21(%1, %%"REG_d"), %%mm3       \n\t"
01893             "punpcklbw               %%mm7, %%mm5       \n\t"
01894             "punpcklbw               %%mm7, %%mm1       \n\t"
01895             "punpcklbw               %%mm7, %%mm2       \n\t"
01896             "punpcklbw               %%mm7, %%mm3       \n\t"
01897             "paddw                   %%mm1, %%mm5       \n\t"
01898             "paddw                   %%mm3, %%mm2       \n\t"
01899             "paddw                   %%mm5, %%mm2       \n\t"
01900             "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
01901             "psrlw                      $2, %%mm4       \n\t"
01902             "psrlw                      $2, %%mm2       \n\t"
01903 #endif
01904             "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
01905             "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
01906 
01907             "pmaddwd                 %%mm4, %%mm1       \n\t"
01908             "pmaddwd                 %%mm2, %%mm3       \n\t"
01909             "pmaddwd                 %%mm6, %%mm4       \n\t"
01910             "pmaddwd                 %%mm6, %%mm2       \n\t"
01911 #ifndef FAST_BGR2YV12
01912             "psrad                      $8, %%mm4       \n\t"
01913             "psrad                      $8, %%mm1       \n\t"
01914             "psrad                      $8, %%mm2       \n\t"
01915             "psrad                      $8, %%mm3       \n\t"
01916 #endif
01917             "packssdw                %%mm2, %%mm4       \n\t"
01918             "packssdw                %%mm3, %%mm1       \n\t"
01919             "pmaddwd                 %%mm5, %%mm4       \n\t"
01920             "pmaddwd                 %%mm5, %%mm1       \n\t"
01921             "add                       $24, %%"REG_d"   \n\t"
01922             "packssdw                %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
01923             "psraw                      $7, %%mm4       \n\t"
01924 
              /* Regroup into four U and four V bytes, add the offset and
               * store 4 bytes to each chroma plane. */
01925             "movq                    %%mm0, %%mm1           \n\t"
01926             "punpckldq               %%mm4, %%mm0           \n\t"
01927             "punpckhdq               %%mm4, %%mm1           \n\t"
01928             "packsswb                %%mm1, %%mm0           \n\t"
01929             "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0         \n\t"
01930             "movd                    %%mm0, (%2, %%"REG_a") \n\t"
01931             "punpckhdq               %%mm0, %%mm0           \n\t"
01932             "movd                    %%mm0, (%3, %%"REG_a") \n\t"
01933             "add                        $4, %%"REG_a"       \n\t"
01934             " js                        1b                  \n\t"
01935             : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
              /* "memory": both chroma planes are written through %2 and %3. */
01936             : "memory", "%"REG_a, "%"REG_d
01937         );
01938 
01939         udst += chromStride;
01940         vdst += chromStride;
01941         src  += srcStride*2;
01942     }
01943 
01944     __asm__ volatile(EMMS"       \n\t"
01945                      SFENCE"     \n\t"
01946                      :::"memory");
01947 
      /* The loop above stops while at least two lines are left; the C
       * implementation converts the remaining height-y lines. */
01948      rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
01949 }
01950 #endif /* !COMPILE_TEMPLATE_SSE2 */
01951 
01952 #if !COMPILE_TEMPLATE_AMD3DNOW
/**
 * Interleave two byte planes: dest[2*w] = src1[w], dest[2*w+1] = src2[w].
 *
 * For each of the height lines, the asm loop handles blocks of 16 source
 * bytes per plane and the scalar loop finishes the remaining width & 15
 * bytes.
 *
 * The asm loop is a do-while whose bound, width-15, is compared unsigned
 * (jb). It is therefore entered only when width >= 16: for smaller widths
 * it would still process one full 16-byte block, and for width < 15 the
 * negative bound would be seen as a huge unsigned value and the loop would
 * run far past the buffers.
 *
 * NOTE(review): the SSE2 variant uses movdqa/movntdq, which require 16-byte
 * aligned addresses; nothing here checks the alignment of src1, src2, dest
 * or of the strides -- confirm that callers guarantee it.
 *
 * @param src1       source of the even destination bytes
 * @param src2       source of the odd destination bytes
 * @param dest       destination, 2*width bytes written per line
 * @param width      bytes per source line
 * @param height     number of lines
 * @param src1Stride byte distance between src1 lines
 * @param src2Stride byte distance between src2 lines
 * @param dstStride  byte distance between dest lines
 */
01953 static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
01954                                     int width, int height, int src1Stride,
01955                                     int src2Stride, int dstStride)
01956 {
01957     int h;
01958 
01959     for (h=0; h < height; h++) {
01960         int w;
01961 
              /* Skip the vector loop entirely for rows shorter than one block. */
              if (width >= 16) {
01962 #if COMPILE_TEMPLATE_SSE2
01963         __asm__(
01964             "xor              %%"REG_a", %%"REG_a"  \n\t"
01965             "1:                                     \n\t"
01966             PREFETCH" 64(%1, %%"REG_a")             \n\t"
01967             PREFETCH" 64(%2, %%"REG_a")             \n\t"
01968             "movdqa     (%1, %%"REG_a"), %%xmm0     \n\t"
01969             "movdqa     (%1, %%"REG_a"), %%xmm1     \n\t"
01970             "movdqa     (%2, %%"REG_a"), %%xmm2     \n\t"
01971             "punpcklbw           %%xmm2, %%xmm0     \n\t"
01972             "punpckhbw           %%xmm2, %%xmm1     \n\t"
01973             "movntdq             %%xmm0,   (%0, %%"REG_a", 2)   \n\t"
01974             "movntdq             %%xmm1, 16(%0, %%"REG_a", 2)   \n\t"
01975             "add                    $16, %%"REG_a"  \n\t"
01976             "cmp                     %3, %%"REG_a"  \n\t"
01977             " jb                     1b             \n\t"
01978             ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
01979             : "memory", "%"REG_a""
01980         );
01981 #else
01982         __asm__(
01983             "xor %%"REG_a", %%"REG_a"               \n\t"
01984             "1:                                     \n\t"
01985             PREFETCH" 64(%1, %%"REG_a")             \n\t"
01986             PREFETCH" 64(%2, %%"REG_a")             \n\t"
01987             "movq       (%1, %%"REG_a"), %%mm0      \n\t"
01988             "movq      8(%1, %%"REG_a"), %%mm2      \n\t"
01989             "movq                 %%mm0, %%mm1      \n\t"
01990             "movq                 %%mm2, %%mm3      \n\t"
01991             "movq       (%2, %%"REG_a"), %%mm4      \n\t"
01992             "movq      8(%2, %%"REG_a"), %%mm5      \n\t"
01993             "punpcklbw            %%mm4, %%mm0      \n\t"
01994             "punpckhbw            %%mm4, %%mm1      \n\t"
01995             "punpcklbw            %%mm5, %%mm2      \n\t"
01996             "punpckhbw            %%mm5, %%mm3      \n\t"
01997             MOVNTQ"               %%mm0,   (%0, %%"REG_a", 2)   \n\t"
01998             MOVNTQ"               %%mm1,  8(%0, %%"REG_a", 2)   \n\t"
01999             MOVNTQ"               %%mm2, 16(%0, %%"REG_a", 2)   \n\t"
02000             MOVNTQ"               %%mm3, 24(%0, %%"REG_a", 2)   \n\t"
02001             "add                    $16, %%"REG_a"  \n\t"
02002             "cmp                     %3, %%"REG_a"  \n\t"
02003             " jb                     1b             \n\t"
02004             ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
02005             : "memory", "%"REG_a
02006         );
02007 #endif
              }
              /* Scalar tail: the width & 15 bytes the vector loop did not
               * cover (the whole row when width < 16). */
02008         for (w= (width&(~15)); w < width; w++) {
02009             dest[2*w+0] = src1[w];
02010             dest[2*w+1] = src2[w];
02011         }
02012         dest += dstStride;
02013         src1 += src1Stride;
02014         src2 += src2Stride;
02015     }
02016     __asm__(
02017             EMMS"       \n\t"
02018             SFENCE"     \n\t"
02019             ::: "memory"
02020             );
02021 }
02022 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
02023 
02024 #if !COMPILE_TEMPLATE_SSE2
02025 #if !COMPILE_TEMPLATE_AMD3DNOW
02026 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
02027                                        uint8_t *dst1, uint8_t *dst2,
02028                                        int width, int height,
02029                                        int srcStride1, int srcStride2,
02030                                        int dstStride1, int dstStride2)
02031 {
02032     x86_reg y;
02033     int x,w,h;
02034     w=width/2; h=height/2;
02035     __asm__ volatile(
02036         PREFETCH" %0    \n\t"
02037         PREFETCH" %1    \n\t"
02038         ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
02039     for (y=0;y<h;y++) {
02040         const uint8_t* s1=src1+srcStride1*(y>>1);
02041         uint8_t* d=dst1+dstStride1*y;
02042         x=0;
02043         for (;x<w-31;x+=32) {
02044             __asm__ volatile(
02045                 PREFETCH"   32%1        \n\t"
02046                 "movq         %1, %%mm0 \n\t"
02047                 "movq        8%1, %%mm2 \n\t"
02048                 "movq       16%1, %%mm4 \n\t"
02049                 "movq       24%1, %%mm6 \n\t"
02050                 "movq      %%mm0, %%mm1 \n\t"
02051                 "movq      %%mm2, %%mm3 \n\t"
02052                 "movq      %%mm4, %%mm5 \n\t"
02053                 "movq      %%mm6, %%mm7 \n\t"
02054                 "punpcklbw %%mm0, %%mm0 \n\t"
02055                 "punpckhbw %%mm1, %%mm1 \n\t"
02056                 "punpcklbw %%mm2, %%mm2 \n\t"
02057                 "punpckhbw %%mm3, %%mm3 \n\t"
02058                 "punpcklbw %%mm4, %%mm4 \n\t"
02059                 "punpckhbw %%mm5, %%mm5 \n\t"
02060                 "punpcklbw %%mm6, %%mm6 \n\t"
02061                 "punpckhbw %%mm7, %%mm7 \n\t"
02062                 MOVNTQ"    %%mm0,   %0  \n\t"
02063                 MOVNTQ"    %%mm1,  8%0  \n\t"
02064                 MOVNTQ"    %%mm2, 16%0  \n\t"
02065                 MOVNTQ"    %%mm3, 24%0  \n\t"
02066                 MOVNTQ"    %%mm4, 32%0  \n\t"
02067                 MOVNTQ"    %%mm5, 40%0  \n\t"
02068                 MOVNTQ"    %%mm6, 48%0  \n\t"
02069                 MOVNTQ"    %%mm7, 56%0"
02070                 :"=m"(d[2*x])
02071                 :"m"(s1[x])
02072                 :"memory");
02073         }
02074         for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
02075     }
02076     for (y=0;y<h;y++) {
02077         const uint8_t* s2=src2+srcStride2*(y>>1);
02078         uint8_t* d=dst2+dstStride2*y;
02079         x=0;
02080         for (;x<w-31;x+=32) {
02081             __asm__ volatile(
02082                 PREFETCH"   32%1        \n\t"
02083                 "movq         %1, %%mm0 \n\t"
02084                 "movq        8%1, %%mm2 \n\t"
02085                 "movq       16%1, %%mm4 \n\t"
02086                 "movq       24%1, %%mm6 \n\t"
02087                 "movq      %%mm0, %%mm1 \n\t"
02088                 "movq      %%mm2, %%mm3 \n\t"
02089                 "movq      %%mm4, %%mm5 \n\t"
02090                 "movq      %%mm6, %%mm7 \n\t"
02091                 "punpcklbw %%mm0, %%mm0 \n\t"
02092                 "punpckhbw %%mm1, %%mm1 \n\t"
02093                 "punpcklbw %%mm2, %%mm2 \n\t"
02094                 "punpckhbw %%mm3, %%mm3 \n\t"
02095                 "punpcklbw %%mm4, %%mm4 \n\t"
02096                 "punpckhbw %%mm5, %%mm5 \n\t"
02097                 "punpcklbw %%mm6, %%mm6 \n\t"
02098                 "punpckhbw %%mm7, %%mm7 \n\t"
02099                 MOVNTQ"    %%mm0,   %0  \n\t"
02100                 MOVNTQ"    %%mm1,  8%0  \n\t"
02101                 MOVNTQ"    %%mm2, 16%0  \n\t"
02102                 MOVNTQ"    %%mm3, 24%0  \n\t"
02103                 MOVNTQ"    %%mm4, 32%0  \n\t"
02104                 MOVNTQ"    %%mm5, 40%0  \n\t"
02105                 MOVNTQ"    %%mm6, 48%0  \n\t"
02106                 MOVNTQ"    %%mm7, 56%0"
02107                 :"=m"(d[2*x])
02108                 :"m"(s2[x])
02109                 :"memory");
02110         }
02111         for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
02112     }
02113     __asm__(
02114             EMMS"       \n\t"
02115             SFENCE"     \n\t"
02116             ::: "memory"
02117         );
02118 }
02119 
/**
 * Interleave three separate 8-bit planes into one packed Y U Y V byte stream.
 *
 * For output row y the function reads row y of src1 (yp) and row y>>2 of
 * src2 (up) and src3 (vp).  Every index x consumes four bytes yp[4*x..4*x+3]
 * plus one byte each of up[x] and vp[x], and emits the eight bytes
 * Y U Y V Y U Y V, i.e. the same up/vp pair is used for all four yp bytes.
 * Eight such indices are handled per MMX iteration; the scalar loop at the
 * end of each row handles the remainder.
 *
 * NOTE(review): with w = width/2 a row reads 4*w = 2*width bytes from yp and
 * writes 8*w = 4*width bytes to dst - confirm against the callers which unit
 * `width` is expected in.
 */
02120 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
02121                                         uint8_t *dst,
02122                                         int width, int height,
02123                                         int srcStride1, int srcStride2,
02124                                         int srcStride3, int dstStride)
02125 {
02126     x86_reg x;
02127     int y,w,h;
02128     w=width/2; h=height;
02129     for (y=0;y<h;y++) {
02130         const uint8_t* yp=src1+srcStride1*y;
          /* four consecutive output rows share one row of src2/src3 */
02131         const uint8_t* up=src2+srcStride2*(y>>2);
02132         const uint8_t* vp=src3+srcStride3*(y>>2);
02133         uint8_t* d=dst+dstStride*y;
02134         x=0;
02135         for (;x<w-7;x+=8) {
              /* %0 = x indexes up/vp directly, yp with scale 4 and d with scale 8 */
02136             __asm__ volatile(
02137                 PREFETCH"   32(%1, %0)          \n\t"
02138                 PREFETCH"   32(%2, %0)          \n\t"
02139                 PREFETCH"   32(%3, %0)          \n\t"
02140                 "movq      (%1, %0, 4), %%mm0   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
02141                 "movq         (%2, %0), %%mm1   \n\t" /* U0U1U2U3U4U5U6U7 */
02142                 "movq         (%3, %0), %%mm2   \n\t" /* V0V1V2V3V4V5V6V7 */
02143                 "movq            %%mm0, %%mm3   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
02144                 "movq            %%mm1, %%mm4   \n\t" /* U0U1U2U3U4U5U6U7 */
02145                 "movq            %%mm2, %%mm5   \n\t" /* V0V1V2V3V4V5V6V7 */
02146                 "punpcklbw       %%mm1, %%mm1   \n\t" /* U0U0 U1U1 U2U2 U3U3 */
02147                 "punpcklbw       %%mm2, %%mm2   \n\t" /* V0V0 V1V1 V2V2 V3V3 */
02148                 "punpckhbw       %%mm4, %%mm4   \n\t" /* U4U4 U5U5 U6U6 U7U7 */
02149                 "punpckhbw       %%mm5, %%mm5   \n\t" /* V4V4 V5V5 V6V6 V7V7 */
02150 
                      /* output bytes 0..15: first 8 yp bytes with U0/V0 and U1/V1 */
02151                 "movq            %%mm1, %%mm6   \n\t"
02152                 "punpcklbw       %%mm2, %%mm1   \n\t" /* U0V0 U0V0 U1V1 U1V1*/
02153                 "punpcklbw       %%mm1, %%mm0   \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
02154                 "punpckhbw       %%mm1, %%mm3   \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
02155                 MOVNTQ"          %%mm0,  (%4, %0, 8)    \n\t"
02156                 MOVNTQ"          %%mm3, 8(%4, %0, 8)    \n\t"
02157 
                      /* output bytes 16..31: next 8 yp bytes with U2/V2 and U3/V3 */
02158                 "punpckhbw       %%mm2, %%mm6   \n\t" /* U2V2 U2V2 U3V3 U3V3*/
02159                 "movq     8(%1, %0, 4), %%mm0   \n\t"
02160                 "movq            %%mm0, %%mm3   \n\t"
02161                 "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U2 Y V2 Y U2 Y V2*/
02162                 "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U3 Y V3 Y U3 Y V3*/
02163                 MOVNTQ"          %%mm0, 16(%4, %0, 8)   \n\t"
02164                 MOVNTQ"          %%mm3, 24(%4, %0, 8)   \n\t"
02165 
                      /* output bytes 32..47: next 8 yp bytes with U4/V4 and U5/V5 */
02166                 "movq            %%mm4, %%mm6   \n\t"
02167                 "movq    16(%1, %0, 4), %%mm0   \n\t"
02168                 "movq            %%mm0, %%mm3   \n\t"
02169                 "punpcklbw       %%mm5, %%mm4   \n\t"
02170                 "punpcklbw       %%mm4, %%mm0   \n\t" /* Y U4 Y V4 Y U4 Y V4*/
02171                 "punpckhbw       %%mm4, %%mm3   \n\t" /* Y U5 Y V5 Y U5 Y V5*/
02172                 MOVNTQ"          %%mm0, 32(%4, %0, 8)   \n\t"
02173                 MOVNTQ"          %%mm3, 40(%4, %0, 8)   \n\t"
02174 
                      /* output bytes 48..63: last 8 yp bytes with U6/V6 and U7/V7 */
02175                 "punpckhbw       %%mm5, %%mm6   \n\t"
02176                 "movq    24(%1, %0, 4), %%mm0   \n\t"
02177                 "movq            %%mm0, %%mm3   \n\t"
02178                 "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U6 Y V6 Y U6 Y V6*/
02179                 "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U7 Y V7 Y U7 Y V7*/
02180                 MOVNTQ"          %%mm0, 48(%4, %0, 8)   \n\t"
02181                 MOVNTQ"          %%mm3, 56(%4, %0, 8)   \n\t"
02182 
02183                 : "+r" (x)
02184                 : "r"(yp), "r" (up), "r"(vp), "r"(d)
02185                 :"memory");
02186         }
          /* scalar tail: same byte layout, one up/vp pair per four yp bytes */
02187         for (; x<w; x++) {
02188             const int x2 = x<<2;
02189             d[8*x+0] = yp[x2];
02190             d[8*x+1] = up[x];
02191             d[8*x+2] = yp[x2+1];
02192             d[8*x+3] = vp[x];
02193             d[8*x+4] = yp[x2+2];
02194             d[8*x+5] = up[x];
02195             d[8*x+6] = yp[x2+3];
02196             d[8*x+7] = vp[x];
02197         }
02198     }
02199     __asm__(
02200             EMMS"       \n\t"
02201             SFENCE"     \n\t"
02202             ::: "memory"
02203         );
02204 }
02205 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
02206 
02207 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
02208 {
02209     dst +=   count;
02210     src += 2*count;
02211     count= - count;
02212 
02213     if(count <= -16) {
02214         count += 15;
02215         __asm__ volatile(
02216             "pcmpeqw       %%mm7, %%mm7        \n\t"
02217             "psrlw            $8, %%mm7        \n\t"
02218             "1:                                \n\t"
02219             "movq -30(%1, %0, 2), %%mm0        \n\t"
02220             "movq -22(%1, %0, 2), %%mm1        \n\t"
02221             "movq -14(%1, %0, 2), %%mm2        \n\t"
02222             "movq  -6(%1, %0, 2), %%mm3        \n\t"
02223             "pand          %%mm7, %%mm0        \n\t"
02224             "pand          %%mm7, %%mm1        \n\t"
02225             "pand          %%mm7, %%mm2        \n\t"
02226             "pand          %%mm7, %%mm3        \n\t"
02227             "packuswb      %%mm1, %%mm0        \n\t"
02228             "packuswb      %%mm3, %%mm2        \n\t"
02229             MOVNTQ"        %%mm0,-15(%2, %0)   \n\t"
02230             MOVNTQ"        %%mm2,- 7(%2, %0)   \n\t"
02231             "add             $16, %0           \n\t"
02232             " js 1b                            \n\t"
02233             : "+r"(count)
02234             : "r"(src), "r"(dst)
02235         );
02236         count -= 15;
02237     }
02238     while(count<0) {
02239         dst[count]= src[2*count];
02240         count++;
02241     }
02242 }
02243 
02244 #if !COMPILE_TEMPLATE_AMD3DNOW
02245 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
02246 {
02247     dst0+=   count;
02248     dst1+=   count;
02249     src += 4*count;
02250     count= - count;
02251     if(count <= -8) {
02252         count += 7;
02253         __asm__ volatile(
02254             "pcmpeqw       %%mm7, %%mm7        \n\t"
02255             "psrlw            $8, %%mm7        \n\t"
02256             "1:                                \n\t"
02257             "movq -28(%1, %0, 4), %%mm0        \n\t"
02258             "movq -20(%1, %0, 4), %%mm1        \n\t"
02259             "movq -12(%1, %0, 4), %%mm2        \n\t"
02260             "movq  -4(%1, %0, 4), %%mm3        \n\t"
02261             "pand          %%mm7, %%mm0        \n\t"
02262             "pand          %%mm7, %%mm1        \n\t"
02263             "pand          %%mm7, %%mm2        \n\t"
02264             "pand          %%mm7, %%mm3        \n\t"
02265             "packuswb      %%mm1, %%mm0        \n\t"
02266             "packuswb      %%mm3, %%mm2        \n\t"
02267             "movq          %%mm0, %%mm1        \n\t"
02268             "movq          %%mm2, %%mm3        \n\t"
02269             "psrlw            $8, %%mm0        \n\t"
02270             "psrlw            $8, %%mm2        \n\t"
02271             "pand          %%mm7, %%mm1        \n\t"
02272             "pand          %%mm7, %%mm3        \n\t"
02273             "packuswb      %%mm2, %%mm0        \n\t"
02274             "packuswb      %%mm3, %%mm1        \n\t"
02275             MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
02276             MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
02277             "add              $8, %0           \n\t"
02278             " js 1b                            \n\t"
02279             : "+r"(count)
02280             : "r"(src), "r"(dst0), "r"(dst1)
02281         );
02282         count -= 7;
02283     }
02284     while(count<0) {
02285         dst0[count]= src[4*count+0];
02286         dst1[count]= src[4*count+2];
02287         count++;
02288     }
02289 }
02290 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
02291 
02292 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
02293 {
02294     dst0 +=   count;
02295     dst1 +=   count;
02296     src0 += 4*count;
02297     src1 += 4*count;
02298     count= - count;
02299 #ifdef PAVGB
02300     if(count <= -8) {
02301         count += 7;
02302         __asm__ volatile(
02303             "pcmpeqw        %%mm7, %%mm7        \n\t"
02304             "psrlw             $8, %%mm7        \n\t"
02305             "1:                                \n\t"
02306             "movq  -28(%1, %0, 4), %%mm0        \n\t"
02307             "movq  -20(%1, %0, 4), %%mm1        \n\t"
02308             "movq  -12(%1, %0, 4), %%mm2        \n\t"
02309             "movq   -4(%1, %0, 4), %%mm3        \n\t"
02310             PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
02311             PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
02312             PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
02313             PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
02314             "pand           %%mm7, %%mm0        \n\t"
02315             "pand           %%mm7, %%mm1        \n\t"
02316             "pand           %%mm7, %%mm2        \n\t"
02317             "pand           %%mm7, %%mm3        \n\t"
02318             "packuswb       %%mm1, %%mm0        \n\t"
02319             "packuswb       %%mm3, %%mm2        \n\t"
02320             "movq           %%mm0, %%mm1        \n\t"
02321             "movq           %%mm2, %%mm3        \n\t"
02322             "psrlw             $8, %%mm0        \n\t"
02323             "psrlw             $8, %%mm2        \n\t"
02324             "pand           %%mm7, %%mm1        \n\t"
02325             "pand           %%mm7, %%mm3        \n\t"
02326             "packuswb       %%mm2, %%mm0        \n\t"
02327             "packuswb       %%mm3, %%mm1        \n\t"
02328             MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
02329             MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
02330             "add               $8, %0           \n\t"
02331             " js 1b                            \n\t"
02332             : "+r"(count)
02333             : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
02334         );
02335         count -= 7;
02336     }
02337 #endif
02338     while(count<0) {
02339         dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
02340         dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
02341         count++;
02342     }
02343 }
02344 
02345 #if !COMPILE_TEMPLATE_AMD3DNOW
02346 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
02347 {
02348     dst0+=   count;
02349     dst1+=   count;
02350     src += 4*count;
02351     count= - count;
02352     if(count <= -8) {
02353         count += 7;
02354         __asm__ volatile(
02355             "pcmpeqw       %%mm7, %%mm7        \n\t"
02356             "psrlw            $8, %%mm7        \n\t"
02357             "1:                                \n\t"
02358             "movq -28(%1, %0, 4), %%mm0        \n\t"
02359             "movq -20(%1, %0, 4), %%mm1        \n\t"
02360             "movq -12(%1, %0, 4), %%mm2        \n\t"
02361             "movq  -4(%1, %0, 4), %%mm3        \n\t"
02362             "psrlw            $8, %%mm0        \n\t"
02363             "psrlw            $8, %%mm1        \n\t"
02364             "psrlw            $8, %%mm2        \n\t"
02365             "psrlw            $8, %%mm3        \n\t"
02366             "packuswb      %%mm1, %%mm0        \n\t"
02367             "packuswb      %%mm3, %%mm2        \n\t"
02368             "movq          %%mm0, %%mm1        \n\t"
02369             "movq          %%mm2, %%mm3        \n\t"
02370             "psrlw            $8, %%mm0        \n\t"
02371             "psrlw            $8, %%mm2        \n\t"
02372             "pand          %%mm7, %%mm1        \n\t"
02373             "pand          %%mm7, %%mm3        \n\t"
02374             "packuswb      %%mm2, %%mm0        \n\t"
02375             "packuswb      %%mm3, %%mm1        \n\t"
02376             MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
02377             MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
02378             "add              $8, %0           \n\t"
02379             " js 1b                            \n\t"
02380             : "+r"(count)
02381             : "r"(src), "r"(dst0), "r"(dst1)
02382         );
02383         count -= 7;
02384     }
02385     src++;
02386     while(count<0) {
02387         dst0[count]= src[4*count+0];
02388         dst1[count]= src[4*count+2];
02389         count++;
02390     }
02391 }
02392 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
02393 
02394 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
02395 {
02396     dst0 +=   count;
02397     dst1 +=   count;
02398     src0 += 4*count;
02399     src1 += 4*count;
02400     count= - count;
02401 #ifdef PAVGB
02402     if(count <= -8) {
02403         count += 7;
02404         __asm__ volatile(
02405             "pcmpeqw        %%mm7, %%mm7        \n\t"
02406             "psrlw             $8, %%mm7        \n\t"
02407             "1:                                \n\t"
02408             "movq  -28(%1, %0, 4), %%mm0        \n\t"
02409             "movq  -20(%1, %0, 4), %%mm1        \n\t"
02410             "movq  -12(%1, %0, 4), %%mm2        \n\t"
02411             "movq   -4(%1, %0, 4), %%mm3        \n\t"
02412             PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
02413             PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
02414             PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
02415             PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
02416             "psrlw             $8, %%mm0        \n\t"
02417             "psrlw             $8, %%mm1        \n\t"
02418             "psrlw             $8, %%mm2        \n\t"
02419             "psrlw             $8, %%mm3        \n\t"
02420             "packuswb       %%mm1, %%mm0        \n\t"
02421             "packuswb       %%mm3, %%mm2        \n\t"
02422             "movq           %%mm0, %%mm1        \n\t"
02423             "movq           %%mm2, %%mm3        \n\t"
02424             "psrlw             $8, %%mm0        \n\t"
02425             "psrlw             $8, %%mm2        \n\t"
02426             "pand           %%mm7, %%mm1        \n\t"
02427             "pand           %%mm7, %%mm3        \n\t"
02428             "packuswb       %%mm2, %%mm0        \n\t"
02429             "packuswb       %%mm3, %%mm1        \n\t"
02430             MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
02431             MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
02432             "add               $8, %0           \n\t"
02433             " js 1b                            \n\t"
02434             : "+r"(count)
02435             : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
02436         );
02437         count -= 7;
02438     }
02439 #endif
02440     src0++;
02441     src1++;
02442     while(count<0) {
02443         dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
02444         dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
02445         count++;
02446     }
02447 }
02448 
02449 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
02450                                  int width, int height,
02451                                  int lumStride, int chromStride, int srcStride)
02452 {
02453     int y;
02454     const int chromWidth= -((-width)>>1);
02455 
02456     for (y=0; y<height; y++) {
02457         RENAME(extract_even)(src, ydst, width);
02458         if(y&1) {
02459             RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
02460             udst+= chromStride;
02461             vdst+= chromStride;
02462         }
02463 
02464         src += srcStride;
02465         ydst+= lumStride;
02466     }
02467     __asm__(
02468             EMMS"       \n\t"
02469             SFENCE"     \n\t"
02470             ::: "memory"
02471         );
02472 }
02473 
02474 #if !COMPILE_TEMPLATE_AMD3DNOW
02475 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
02476                                  int width, int height,
02477                                  int lumStride, int chromStride, int srcStride)
02478 {
02479     int y;
02480     const int chromWidth= -((-width)>>1);
02481 
02482     for (y=0; y<height; y++) {
02483         RENAME(extract_even)(src, ydst, width);
02484         RENAME(extract_odd2)(src, udst, vdst, chromWidth);
02485 
02486         src += srcStride;
02487         ydst+= lumStride;
02488         udst+= chromStride;
02489         vdst+= chromStride;
02490     }
02491     __asm__(
02492             EMMS"       \n\t"
02493             SFENCE"     \n\t"
02494             ::: "memory"
02495         );
02496 }
02497 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
02498 
02499 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
02500                                  int width, int height,
02501                                  int lumStride, int chromStride, int srcStride)
02502 {
02503     int y;
02504     const int chromWidth= -((-width)>>1);
02505 
02506     for (y=0; y<height; y++) {
02507         RENAME(extract_even)(src+1, ydst, width);
02508         if(y&1) {
02509             RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
02510             udst+= chromStride;
02511             vdst+= chromStride;
02512         }
02513 
02514         src += srcStride;
02515         ydst+= lumStride;
02516     }
02517     __asm__(
02518             EMMS"       \n\t"
02519             SFENCE"     \n\t"
02520             ::: "memory"
02521         );
02522 }
02523 
02524 #if !COMPILE_TEMPLATE_AMD3DNOW
02525 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
02526                                  int width, int height,
02527                                  int lumStride, int chromStride, int srcStride)
02528 {
02529     int y;
02530     const int chromWidth= -((-width)>>1);
02531 
02532     for (y=0; y<height; y++) {
02533         RENAME(extract_even)(src+1, ydst, width);
02534         RENAME(extract_even2)(src, udst, vdst, chromWidth);
02535 
02536         src += srcStride;
02537         ydst+= lumStride;
02538         udst+= chromStride;
02539         vdst+= chromStride;
02540     }
02541     __asm__(
02542             EMMS"       \n\t"
02543             SFENCE"     \n\t"
02544             ::: "memory"
02545         );
02546 }
02547 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
02548 #endif /* !COMPILE_TEMPLATE_SSE2 */
02549 
/**
 * Install this template's RENAME()d implementations into the converter
 * hooks (rgb15to16, yv12toyuy2, ...; declared outside this chunk).
 *
 * Which hooks are assigned depends on the template being compiled:
 *  - !SSE2 && !AMD3DNOW : the long list of RGB/YUV converters below
 *  - !SSE2 && (MMX2 || AMD3DNOW) : planar2x
 *  - !SSE2 : rgb24toyv12, yuyvtoyuv420, uyvytoyuv420
 *  - !AMD3DNOW (any SSE2 setting) : interleaveBytes
 */
02550 static inline void RENAME(rgb2rgb_init)(void)
02551 {
02552 #if !COMPILE_TEMPLATE_SSE2
02553 #if !COMPILE_TEMPLATE_AMD3DNOW
02554     rgb15to16          = RENAME(rgb15to16);
02555     rgb15tobgr24       = RENAME(rgb15tobgr24);
02556     rgb15to32          = RENAME(rgb15to32);
02557     rgb16tobgr24       = RENAME(rgb16tobgr24);
02558     rgb16to32          = RENAME(rgb16to32);
02559     rgb16to15          = RENAME(rgb16to15);
02560     rgb24tobgr16       = RENAME(rgb24tobgr16);
02561     rgb24tobgr15       = RENAME(rgb24tobgr15);
02562     rgb24tobgr32       = RENAME(rgb24tobgr32);
02563     rgb32to16          = RENAME(rgb32to16);
02564     rgb32to15          = RENAME(rgb32to15);
02565     rgb32tobgr24       = RENAME(rgb32tobgr24);
02566     rgb24to15          = RENAME(rgb24to15);
02567     rgb24to16          = RENAME(rgb24to16);
02568     rgb24tobgr24       = RENAME(rgb24tobgr24);
02569     shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
02570     rgb32tobgr16       = RENAME(rgb32tobgr16);
02571     rgb32tobgr15       = RENAME(rgb32tobgr15);
02572     yv12toyuy2         = RENAME(yv12toyuy2);
02573     yv12touyvy         = RENAME(yv12touyvy);
02574     yuv422ptoyuy2      = RENAME(yuv422ptoyuy2);
02575     yuv422ptouyvy      = RENAME(yuv422ptouyvy);
02576     yuy2toyv12         = RENAME(yuy2toyv12);
02577     vu9_to_vu12        = RENAME(vu9_to_vu12);
02578     yvu9_to_yuy2       = RENAME(yvu9_to_yuy2);
02579     uyvytoyuv422       = RENAME(uyvytoyuv422);
02580     yuyvtoyuv422       = RENAME(yuyvtoyuv422);
02581 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
02582 
02583 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
02584     planar2x           = RENAME(planar2x);
02585 #endif /* COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW */
02586     rgb24toyv12        = RENAME(rgb24toyv12);
02587 
02588     yuyvtoyuv420       = RENAME(yuyvtoyuv420);
02589     uyvytoyuv420       = RENAME(uyvytoyuv420);
02590 #endif /* !COMPILE_TEMPLATE_SSE2 */
02591 
02592 #if !COMPILE_TEMPLATE_AMD3DNOW
02593     interleaveBytes    = RENAME(interleaveBytes);
02594 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
02595 }