libswscale/ppc/yuv2rgb_altivec.c
Go to the documentation of this file.
00001 /*
00002  * AltiVec acceleration for colorspace conversion
00003  *
00004  * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
00005  *
00006  * This file is part of FFmpeg.
00007  *
00008  * FFmpeg is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU Lesser General Public
00010  * License as published by the Free Software Foundation; either
00011  * version 2.1 of the License, or (at your option) any later version.
00012  *
00013  * FFmpeg is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  * Lesser General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU Lesser General Public
00019  * License along with FFmpeg; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00021  */
00022 
00023 /*
Convert I420/YV12 to RGB in various formats:
  it rejects images that are not in a 4:2:0 format,
  it rejects images whose width is not a multiple of 16,
  it rejects images whose height is not a multiple of 2.
Rejected images are handed back to the generic C conversion code.
00029 
00030 Lots of optimizations to be done here.
00031 
00032 1. Need to fix saturation code. I just couldn't get it to fly with packs
00033    and adds, so we currently use max/min to clip.
00034 
00035 2. The inefficient use of chroma loading needs a bit of brushing up.
00036 
00037 3. Analysis of pipeline stalls needs to be done. Use shark to identify
00038    pipeline stalls.
00039 
00040 
MODIFIED to calculate coeffs from currently selected color space.
MODIFIED core to be a macro where you specify the output format.
ADDED UYVY conversion which is never called due to something in swscale.
CORRECTED algorithm selection to be strict on input formats.
ADDED runtime detection of AltiVec.
00046 
00047 ADDED altivec_yuv2packedX vertical scl + RGB converter
00048 
00049 March 27,2004
00050 PERFORMANCE ANALYSIS
00051 
00052 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
00053 used as test.
00054 The AltiVec version uses 10% of the processor or ~100Mips for D1 video
00055 same sequence.
00056 
00057 720 * 480 * 30  ~10MPS
00058 
00059 so we have roughly 10 clocks per pixel. This is too high, something has
00060 to be wrong.
00061 
00062 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
00063 need for vec_min.
00064 
00065 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
00066 the input video frame, it was just decompressed so it probably resides in L1
00067 caches. However, we are creating the output video stream. This needs to use the
00068 DSTST instruction to optimize for the cache. We couple this with the fact that
00069 we are not going to be visiting the input buffer again so we mark it Least
00070 Recently Used. This shaves 25% of the processor cycles off.
00071 
00072 Now memcpy is the largest mips consumer in the system, probably due
00073 to the inefficient X11 stuff.
00074 
00075 GL libraries seem to be very slow on this machine 1.33Ghz PB running
00076 Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
00077 a versioning issue, however I have libGL.1.2.dylib for both
00078 machines. (We need to figure this out now.)
00079 
00080 GL2 libraries work now with patch for RGB32.
00081 
00082 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
00083 
00084 Integrated luma prescaling adjustment for saturation/contrast/brightness
00085 adjustment.
00086 */
00087 
00088 #include <stdio.h>
00089 #include <stdlib.h>
00090 #include <string.h>
00091 #include <inttypes.h>
00092 #include <assert.h>
00093 #include "config.h"
00094 #include "libswscale/rgb2rgb.h"
00095 #include "libswscale/swscale.h"
00096 #include "libswscale/swscale_internal.h"
00097 #include "libavutil/cpu.h"
00098 #include "libavutil/pixdesc.h"
00099 #include "yuv2rgb_altivec.h"
00100 
00101 #undef PROFILE_THE_BEAST
00102 #undef INC_SCALING
00103 
/* Short aliases for single-byte sample types used by the converters below. */
typedef unsigned char ubyte;   /* unsigned 8-bit value */
typedef signed char   sbyte;   /* signed 8-bit value */
00106 
00107 
00108 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
00109    homogeneous vector registers x0,x1,x2 are interleaved with the
00110    following technique:
00111 
00112       o0 = vec_mergeh (x0,x1);
00113       o1 = vec_perm (o0, x2, perm_rgb_0);
00114       o2 = vec_perm (o0, x2, perm_rgb_1);
00115       o3 = vec_mergel (x0,x1);
00116       o4 = vec_perm (o3,o2,perm_rgb_2);
00117       o5 = vec_perm (o3,o2,perm_rgb_3);
00118 
00119   perm_rgb_0:   o0(RG).h v1(B) --> o1*
00120               0   1  2   3   4
00121              rgbr|gbrg|brgb|rgbr
00122              0010 0100 1001 0010
00123              0102 3145 2673 894A
00124 
00125   perm_rgb_1:   o0(RG).h v1(B) --> o2
00126               0   1  2   3   4
00127              gbrg|brgb|bbbb|bbbb
00128              0100 1001 1111 1111
00129              B5CD 6EF7 89AB CDEF
00130 
00131   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
00132               0   1  2   3   4
00133              gbrg|brgb|rgbr|gbrg
00134              1111 1111 0010 0100
00135              89AB CDEF 0182 3945
00136 
  perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
00138               0   1  2   3   4
00139              brgb|rgbr|gbrg|brgb
00140              1001 0010 0100 1001
00141              a67b 89cA BdCD eEFf
00142 
00143 */
00144 static
00145 const vector unsigned char
00146   perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
00147                 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
00148   perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
00149                 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
00150   perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
00151                 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
00152   perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
00153                 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
00154 
/* Interleave three 16-byte planes into three vectors (y0,y1,y2) holding 16
   packed 3-byte pixels.  Every pixel is emitted as the bytes (x0,x1,x2);
   note that the parameter list names the planes in the reverse order
   (x2,x1,x0). */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                         \
do {                                                          \
    __typeof__(x0) m3_pair_hi, m3_tail, m3_pair_lo;           \
    /* x0/x1 pairs of pixels 0-7 */                           \
    m3_pair_hi = vec_mergeh (x0, x1);                         \
    y0         = vec_perm (m3_pair_hi, x2, perm_rgb_0);       \
    /* leftovers of the first half plus x2 bytes 8-15 */      \
    m3_tail    = vec_perm (m3_pair_hi, x2, perm_rgb_1);       \
    /* x0/x1 pairs of pixels 8-15 */                          \
    m3_pair_lo = vec_mergel (x0, x1);                         \
    y1         = vec_perm (m3_pair_lo, m3_tail, perm_rgb_2);  \
    y2         = vec_perm (m3_pair_lo, m3_tail, perm_rgb_3);  \
} while(0)
00165 
/* Store 16 packed 24-bit pixels with the byte order (x2,x1,x0) and advance
   ptr by the three vectors written.  ptr must be a modifiable pointer to
   vector; it is post-incremented once per store. */
#define vec_mstbgr24(x0,x1,x2,ptr)                    \
do {                                                  \
    __typeof__(x0) px_a, px_b, px_c;                  \
    vec_merge3 (x0, x1, x2, px_a, px_b, px_c);        \
    vec_st (px_a, 0, ptr++);                          \
    vec_st (px_b, 0, ptr++);                          \
    vec_st (px_c, 0, ptr++);                          \
}  while (0)
00174 
/* Store 16 packed 24-bit pixels with the byte order (x0,x1,x2) and advance
   ptr by the three vectors written.  ptr must be a modifiable pointer to
   vector; it is post-incremented once per store. */
#define vec_mstrgb24(x0,x1,x2,ptr)                    \
do {                                                  \
    __typeof__(x0) px_a, px_b, px_c;                  \
    vec_merge3 (x2, x1, x0, px_a, px_b, px_c);        \
    vec_st (px_a, 0, ptr++);                          \
    vec_st (px_b, 0, ptr++);                          \
    vec_st (px_c, 0, ptr++);                          \
}  while (0)
00183 
/* Store 16 four-byte pixels.  x0..x3 are four 16-byte planes and every
   pixel is written as the bytes x0,x1,x2,x3 in increasing address order.
   T is the vector type used for the temporaries and the stores; ptr is
   advanced by the four vectors written. */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                          \
do {                                                                             \
    T q01, q23, qa, qb;                                                          \
    /* pixels 0-7: high halves of the four planes */                             \
    q01 = vec_mergeh (x0,x1);                                                    \
    q23 = vec_mergeh (x2,x3);                                                    \
    qa  = (T)vec_mergeh ((vector unsigned short)q01,(vector unsigned short)q23); \
    qb  = (T)vec_mergel ((vector unsigned short)q01,(vector unsigned short)q23); \
    vec_st (qa, 0*16, (T *)ptr);                                                 \
    vec_st (qb, 1*16, (T *)ptr);                                                 \
    /* pixels 8-15: low halves of the four planes */                             \
    q01 = vec_mergel (x0,x1);                                                    \
    q23 = vec_mergel (x2,x3);                                                    \
    qa  = (T)vec_mergeh ((vector unsigned short)q01,(vector unsigned short)q23); \
    qb  = (T)vec_mergel ((vector unsigned short)q01,(vector unsigned short)q23); \
    vec_st (qa, 2*16, (T *)ptr);                                                 \
    vec_st (qb, 3*16, (T *)ptr);                                                 \
    ptr += 4;                                                                    \
}  while (0)
00205 
00206 /*
00207 
00208   | 1     0       1.4021   | | Y |
00209   | 1    -0.3441 -0.7142   |x| Cb|
00210   | 1     1.7718  0        | | Cr|
00211 
00212 
00213   Y:      [-128 127]
00214   Cb/Cr : [-128 127]
00215 
  Typical YUV conversions work on Y in [0, 255]; this version has been optimized for JPEG decoding.
00217 
00218 */
00219 
00220 
00221 
00222 
/* Zero-extend bytes 0-7 of x into eight 16-bit lanes.  Selector 0x10 picks
   a byte from the second vec_perm operand, which is an all-zero vector, so
   each output halfword is the byte pair (0, x[i]). */
#define vec_unh(x) \
    (vector signed short) \
        vec_perm(x,(__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                         0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
/* Zero-extend bytes 8-15 of x into eight 16-bit lanes.  Selector 0x10 picks
   a byte from the second vec_perm operand, which is an all-zero vector, so
   each output halfword is the byte pair (0, x[i]). */
#define vec_unl(x) \
    (vector signed short) \
        vec_perm(x,(__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                         0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))
00233 
/* Clamp every signed 16-bit lane of x to the range [16, 235]. */
#define vec_clip_s16(x) \
    vec_min (vec_max (x, ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16})), \
                         ((vector signed short){235,235,235,235,235,235,235,235}))
00237 
/* Pack two vectors of signed 16-bit lanes into one vector of 16 unsigned
   bytes, clamping every lane to [0, 255].  vec_packsu saturates signed
   halfwords to unsigned bytes (negative -> 0, above 255 -> 255), which is
   exactly what the former vec_max-against-zero followed by an unsigned
   saturating pack computed, without the two extra vec_max operations. */
#define vec_packclp(x,y) \
    (vector unsigned char)vec_packsu (x, y)
00242 
00243 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)
00244 
00245 
00246 static inline void cvtyuvtoRGB (SwsContext *c,
00247                                 vector signed short Y, vector signed short U, vector signed short V,
00248                                 vector signed short *R, vector signed short *G, vector signed short *B)
00249 {
00250     vector signed   short vx,ux,uvx;
00251 
00252     Y = vec_mradds (Y, c->CY, c->OY);
00253     U  = vec_sub (U,(vector signed short)
00254                     vec_splat((vector signed short){128},0));
00255     V  = vec_sub (V,(vector signed short)
00256                     vec_splat((vector signed short){128},0));
00257 
00258     //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
00259     ux = vec_sl (U, c->CSHIFT);
00260     *B = vec_mradds (ux, c->CBU, Y);
00261 
00262     // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
00263     vx = vec_sl (V, c->CSHIFT);
00264     *R = vec_mradds (vx, c->CRV, Y);
00265 
00266     // uvx = ((CGU*u) + (CGV*v))>>15;
00267     uvx = vec_mradds (U, c->CGU, Y);
00268     *G  = vec_mradds (V, c->CGV, uvx);
00269 }
00270 
00271 
00272 /*
00273   ------------------------------------------------------------------------------
00274   CS converters
00275   ------------------------------------------------------------------------------
00276 */
00277 
00278 
/*
 * DEFCSP420_CVT(name, out_pixels) expands to altivec_<name>(), a slice
 * converter from three separate Y, U and V planes (one U and one V sample
 * per 2x2 luma samples) to a packed RGB layout chosen by the out_pixels
 * store macro.
 *
 * Each inner iteration converts a 16x2 tile: 16 luma samples of an even
 * row and of the odd row below it share 8 U and 8 V samples.
 *
 *   in, instrides        source planes and their strides in bytes
 *   srcSliceH            number of source rows to convert (handled in pairs)
 *   oplanes[0],
 *   outstrides[0]        destination buffer and its stride in bytes
 *   srcSliceY            destination row at which the slice starts
 *
 * The width comes from c->srcW; only whole groups of 16 pixels are
 * converted.  Returns srcSliceH.
 *
 * Both output row pointers are recomputed from outstrides[0] for every row
 * pair, so destinations whose stride is larger than the packed row size
 * are written correctly (advancing the already-incremented pointers by the
 * stride is only right when the two are equal).
 */
#define DEFCSP420_CVT(name,out_pixels)                                  \
static int altivec_##name (SwsContext *c,                               \
                           const unsigned char **in, int *instrides,    \
                           int srcSliceY,        int srcSliceH,         \
                           unsigned char **oplanes, int *outstrides)    \
{                                                                       \
    int w = c->srcW;                                                    \
    int h = srcSliceH;                                                  \
    int i,j;                                                            \
    int instrides_scl[3];                                               \
    vector unsigned char y0,y1;                                         \
                                                                        \
    vector signed char  u,v;                                            \
                                                                        \
    vector signed short Y0,Y1,Y2,Y3;                                    \
    vector signed short U,V;                                            \
    vector signed short vx,ux,uvx;                                      \
    vector signed short vx0,ux0,uvx0;                                   \
    vector signed short vx1,ux1,uvx1;                                   \
    vector signed short R0,G0,B0;                                       \
    vector signed short R1,G1,B1;                                       \
    vector unsigned char R,G,B;                                         \
                                                                        \
    const vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;            \
    vector unsigned char align_perm;                                    \
                                                                        \
    /* local copies of the conversion coefficients */                   \
    vector signed short                                                 \
        lCY  = c->CY,                                                   \
        lOY  = c->OY,                                                   \
        lCRV = c->CRV,                                                  \
        lCBU = c->CBU,                                                  \
        lCGU = c->CGU,                                                  \
        lCGV = c->CGV;                                                  \
                                                                        \
    vector unsigned short lCSHIFT = c->CSHIFT;                          \
                                                                        \
    const ubyte *y1i   = in[0];                                         \
    const ubyte *y2i   = in[0]+instrides[0];                            \
    const ubyte *ui    = in[1];                                         \
    const ubyte *vi    = in[2];                                         \
                                                                        \
    /* first destination row of this slice */                           \
    ubyte *dst = oplanes[0]+srcSliceY*outstrides[0];                    \
    vector unsigned char *oute, *outo;                                  \
                                                                        \
                                                                        \
    instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
                                                                        \
                                                                        \
    for (i=0;i<h/2;i++) {                                               \
        /* even and odd output row of this pair, addressed by stride */ \
        oute = (vector unsigned char *)(dst+(2*i  )*outstrides[0]);     \
        outo = (vector unsigned char *)(dst+(2*i+1)*outstrides[0]);     \
                                                                        \
        /* data-stream-touch-for-store hints for the two output rows */ \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
                                                                        \
        for (j=0;j<w/16;j++) {                                          \
                                                                        \
            y1ivP = (const vector unsigned char *)y1i;                  \
            y2ivP = (const vector unsigned char *)y2i;                  \
            uivP  = (const vector unsigned char *)ui;                   \
            vivP  = (const vector unsigned char *)vi;                   \
                                                                        \
            /* unaligned loads: two aligned vectors realigned with */   \
            /* the vec_lvsl permute of the source address          */   \
            align_perm = vec_lvsl (0, y1i);                             \
            y0 = (vector unsigned char)                                 \
                 vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, y2i);                             \
            y1 = (vector unsigned char)                                 \
                 vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, ui);                              \
            u = (vector signed char)                                    \
                vec_perm (uivP[0], uivP[1], align_perm);                \
                                                                        \
            align_perm = vec_lvsl (0, vi);                              \
            v = (vector signed char)                                    \
                vec_perm (vivP[0], vivP[1], align_perm);                \
                                                                        \
            /* centre chroma around zero */                             \
            u  = (vector signed char)                                   \
                 vec_sub (u,(vector signed char)                        \
                          vec_splat((vector signed char){128},0));      \
            v  = (vector signed char)                                   \
                 vec_sub (v,(vector signed char)                        \
                          vec_splat((vector signed char){128},0));      \
                                                                        \
            /* sign-extend the first 8 chroma bytes to 16 bits */       \
            U  = vec_unpackh (u);                                       \
            V  = vec_unpackh (v);                                       \
                                                                        \
                                                                        \
            /* zero-extend luma: Y0/Y1 even row, Y2/Y3 odd row */       \
            Y0 = vec_unh (y0);                                          \
            Y1 = vec_unl (y0);                                          \
            Y2 = vec_unh (y1);                                          \
            Y3 = vec_unl (y1);                                          \
                                                                        \
            Y0 = vec_mradds (Y0, lCY, lOY);                             \
            Y1 = vec_mradds (Y1, lCY, lOY);                             \
            Y2 = vec_mradds (Y2, lCY, lOY);                             \
            Y3 = vec_mradds (Y3, lCY, lOY);                             \
                                                                        \
            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
            ux = vec_sl (U, lCSHIFT);                                   \
            ux = vec_mradds (ux, lCBU, (vector signed short){0});       \
            /* duplicate each term for two neighbouring pixels */       \
            ux0  = vec_mergeh (ux,ux);                                  \
            ux1  = vec_mergel (ux,ux);                                  \
                                                                        \
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
            vx = vec_sl (V, lCSHIFT);                                   \
            vx = vec_mradds (vx, lCRV, (vector signed short){0});       \
            vx0  = vec_mergeh (vx,vx);                                  \
            vx1  = vec_mergel (vx,vx);                                  \
                                                                        \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
            uvx = vec_mradds (U, lCGU, (vector signed short){0});       \
            uvx = vec_mradds (V, lCGV, uvx);                            \
            uvx0 = vec_mergeh (uvx,uvx);                                \
            uvx1 = vec_mergel (uvx,uvx);                                \
                                                                        \
            /* even row */                                              \
            R0 = vec_add (Y0,vx0);                                      \
            G0 = vec_add (Y0,uvx0);                                     \
            B0 = vec_add (Y0,ux0);                                      \
            R1 = vec_add (Y1,vx1);                                      \
            G1 = vec_add (Y1,uvx1);                                     \
            B1 = vec_add (Y1,ux1);                                      \
                                                                        \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
            out_pixels(R,G,B,oute);                                     \
                                                                        \
            /* odd row, same chroma terms */                            \
            R0 = vec_add (Y2,vx0);                                      \
            G0 = vec_add (Y2,uvx0);                                     \
            B0 = vec_add (Y2,ux0);                                      \
            R1 = vec_add (Y3,vx1);                                      \
            G1 = vec_add (Y3,uvx1);                                     \
            B1 = vec_add (Y3,ux1);                                      \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
                                                                        \
            out_pixels(R,G,B,outo);                                     \
                                                                        \
            y1i  += 16;                                                 \
            y2i  += 16;                                                 \
            ui   += 8;                                                  \
            vi   += 8;                                                  \
                                                                        \
        }                                                               \
                                                                        \
        /* skip source padding; luma also skips the row just used */    \
        ui    += instrides_scl[1];                                      \
        vi    += instrides_scl[2];                                      \
        y1i   += instrides_scl[0];                                      \
        y2i   += instrides_scl[0];                                      \
    }                                                                   \
    return srcSliceH;                                                   \
}
00442 
00443 
00444 #define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),c,b,a,ptr)
00445 #define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){255}),ptr)
00446 #define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){255}),ptr)
00447 #define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,b,c,ptr)
00448 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
00449 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
00450 
00451 DEFCSP420_CVT (yuv2_abgr, out_abgr)
00452 DEFCSP420_CVT (yuv2_bgra, out_bgra)
00453 DEFCSP420_CVT (yuv2_rgba, out_rgba)
00454 DEFCSP420_CVT (yuv2_argb, out_argb)
00455 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
00456 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
00457 
00458 
00459 // uyvy|uyvy|uyvy|uyvy
00460 // 0123 4567 89ab cdef
00461 static
00462 const vector unsigned char
00463     demux_u = {0x10,0x00,0x10,0x00,
00464                0x10,0x04,0x10,0x04,
00465                0x10,0x08,0x10,0x08,
00466                0x10,0x0c,0x10,0x0c},
00467     demux_v = {0x10,0x02,0x10,0x02,
00468                0x10,0x06,0x10,0x06,
00469                0x10,0x0A,0x10,0x0A,
00470                0x10,0x0E,0x10,0x0E},
00471     demux_y = {0x10,0x01,0x10,0x03,
00472                0x10,0x05,0x10,0x07,
00473                0x10,0x09,0x10,0x0B,
00474                0x10,0x0D,0x10,0x0F};
00475 
00476 /*
00477   this is so I can play live CCIR raw video
00478 */
00479 static int altivec_uyvy_rgb32 (SwsContext *c,
00480                                const unsigned char **in, int *instrides,
00481                                int srcSliceY,        int srcSliceH,
00482                                unsigned char **oplanes, int *outstrides)
00483 {
00484     int w = c->srcW;
00485     int h = srcSliceH;
00486     int i,j;
00487     vector unsigned char uyvy;
00488     vector signed   short Y,U,V;
00489     vector signed   short R0,G0,B0,R1,G1,B1;
00490     vector unsigned char  R,G,B;
00491     vector unsigned char *out;
00492     const ubyte *img;
00493 
00494     img = in[0];
00495     out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
00496 
00497     for (i=0;i<h;i++) {
00498         for (j=0;j<w/16;j++) {
00499             uyvy = vec_ld (0, img);
00500             U = (vector signed short)
00501                 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
00502 
00503             V = (vector signed short)
00504                 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
00505 
00506             Y = (vector signed short)
00507                 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
00508 
00509             cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
00510 
00511             uyvy = vec_ld (16, img);
00512             U = (vector signed short)
00513                 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
00514 
00515             V = (vector signed short)
00516                 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
00517 
00518             Y = (vector signed short)
00519                 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
00520 
00521             cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
00522 
00523             R  = vec_packclp (R0,R1);
00524             G  = vec_packclp (G0,G1);
00525             B  = vec_packclp (B0,B1);
00526 
00527             //      vec_mstbgr24 (R,G,B, out);
00528             out_rgba (R,G,B,out);
00529 
00530             img += 32;
00531         }
00532     }
00533     return srcSliceH;
00534 }
00535 
00536 
00537 
00538 /* Ok currently the acceleration routine only supports
00539    inputs of widths a multiple of 16
00540    and heights a multiple 2
00541 
00542    So we just fall back to the C codes for this.
00543 */
00544 SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
00545 {
00546     if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
00547         return NULL;
00548 
00549     /*
00550       and this seems not to matter too much I tried a bunch of
00551       videos with abnormal widths and MPlayer crashes elsewhere.
00552       mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
00553       boom with X11 bad match.
00554 
00555     */
00556     if ((c->srcW & 0xf) != 0)    return NULL;
00557 
00558     switch (c->srcFormat) {
00559     case PIX_FMT_YUV410P:
00560     case PIX_FMT_YUV420P:
00561     /*case IMGFMT_CLPL:        ??? */
00562     case PIX_FMT_GRAY8:
00563     case PIX_FMT_NV12:
00564     case PIX_FMT_NV21:
00565         if ((c->srcH & 0x1) != 0)
00566             return NULL;
00567 
00568         switch(c->dstFormat) {
00569         case PIX_FMT_RGB24:
00570             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
00571             return altivec_yuv2_rgb24;
00572         case PIX_FMT_BGR24:
00573             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
00574             return altivec_yuv2_bgr24;
00575         case PIX_FMT_ARGB:
00576             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
00577             return altivec_yuv2_argb;
00578         case PIX_FMT_ABGR:
00579             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
00580             return altivec_yuv2_abgr;
00581         case PIX_FMT_RGBA:
00582             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
00583             return altivec_yuv2_rgba;
00584         case PIX_FMT_BGRA:
00585             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
00586             return altivec_yuv2_bgra;
00587         default: return NULL;
00588         }
00589         break;
00590 
00591     case PIX_FMT_UYVY422:
00592         switch(c->dstFormat) {
00593         case PIX_FMT_BGR32:
00594             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
00595             return altivec_uyvy_rgb32;
00596         default: return NULL;
00597         }
00598         break;
00599 
00600     }
00601     return NULL;
00602 }
00603 
00604 void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
00605 {
00606     union {
00607         DECLARE_ALIGNED(16, signed short, tmp)[8];
00608         vector signed short vec;
00609     } buf;
00610 
00611     buf.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;                        //cy
00612     buf.tmp[1] =  -256*brightness;                                      //oy
00613     buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
00614     buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
00615     buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
00616     buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv
00617 
00618 
00619     c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
00620     c->CY   = vec_splat ((vector signed short)buf.vec, 0);
00621     c->OY   = vec_splat ((vector signed short)buf.vec, 1);
00622     c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
00623     c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
00624     c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
00625     c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
00626     return;
00627 }
00628 
00629 
00630 static av_always_inline void
00631 ff_yuv2packedX_altivec(SwsContext *c, const int16_t *lumFilter,
00632                        const int16_t **lumSrc, int lumFilterSize,
00633                        const int16_t *chrFilter, const int16_t **chrUSrc,
00634                        const int16_t **chrVSrc, int chrFilterSize,
00635                        const int16_t **alpSrc, uint8_t *dest,
00636                        int dstW, int dstY, enum PixelFormat target)
00637 {
00638     int i,j;
00639     vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
00640     vector signed short R0,G0,B0,R1,G1,B1;
00641 
00642     vector unsigned char R,G,B;
00643     vector unsigned char *out,*nout;
00644 
00645     vector signed short   RND = vec_splat_s16(1<<3);
00646     vector unsigned short SCL = vec_splat_u16(4);
00647     DECLARE_ALIGNED(16, unsigned int, scratch)[16];
00648 
00649     vector signed short *YCoeffs, *CCoeffs;
00650 
00651     YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
00652     CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
00653 
00654     out = (vector unsigned char *)dest;
00655 
00656     for (i=0; i<dstW; i+=16) {
00657         Y0 = RND;
00658         Y1 = RND;
00659         /* extract 16 coeffs from lumSrc */
00660         for (j=0; j<lumFilterSize; j++) {
00661             X0 = vec_ld (0,  &lumSrc[j][i]);
00662             X1 = vec_ld (16, &lumSrc[j][i]);
00663             Y0 = vec_mradds (X0, YCoeffs[j], Y0);
00664             Y1 = vec_mradds (X1, YCoeffs[j], Y1);
00665         }
00666 
00667         U = RND;
00668         V = RND;
00669         /* extract 8 coeffs from U,V */
00670         for (j=0; j<chrFilterSize; j++) {
00671             X  = vec_ld (0, &chrUSrc[j][i/2]);
00672             U  = vec_mradds (X, CCoeffs[j], U);
00673             X  = vec_ld (0, &chrVSrc[j][i/2]);
00674             V  = vec_mradds (X, CCoeffs[j], V);
00675         }
00676 
00677         /* scale and clip signals */
00678         Y0 = vec_sra (Y0, SCL);
00679         Y1 = vec_sra (Y1, SCL);
00680         U  = vec_sra (U,  SCL);
00681         V  = vec_sra (V,  SCL);
00682 
00683         Y0 = vec_clip_s16 (Y0);
00684         Y1 = vec_clip_s16 (Y1);
00685         U  = vec_clip_s16 (U);
00686         V  = vec_clip_s16 (V);
00687 
00688         /* now we have
00689           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
00690           U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
00691 
00692           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
00693           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
00694           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
00695         */
00696 
00697         U0 = vec_mergeh (U,U);
00698         V0 = vec_mergeh (V,V);
00699 
00700         U1 = vec_mergel (U,U);
00701         V1 = vec_mergel (V,V);
00702 
00703         cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
00704         cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
00705 
00706         R  = vec_packclp (R0,R1);
00707         G  = vec_packclp (G0,G1);
00708         B  = vec_packclp (B0,B1);
00709 
00710         switch(target) {
00711         case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
00712         case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
00713         case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
00714         case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
00715         case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
00716         case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
00717         default:
00718             {
00719                 /* If this is reached, the caller should have called yuv2packedXinC
00720                    instead. */
00721                 static int printed_error_message;
00722                 if (!printed_error_message) {
00723                     av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
00724                            av_get_pix_fmt_name(c->dstFormat));
00725                     printed_error_message=1;
00726                 }
00727                 return;
00728             }
00729         }
00730     }
00731 
00732     if (i < dstW) {
00733         i -= 16;
00734 
00735         Y0 = RND;
00736         Y1 = RND;
00737         /* extract 16 coeffs from lumSrc */
00738         for (j=0; j<lumFilterSize; j++) {
00739             X0 = vec_ld (0,  &lumSrc[j][i]);
00740             X1 = vec_ld (16, &lumSrc[j][i]);
00741             Y0 = vec_mradds (X0, YCoeffs[j], Y0);
00742             Y1 = vec_mradds (X1, YCoeffs[j], Y1);
00743         }
00744 
00745         U = RND;
00746         V = RND;
00747         /* extract 8 coeffs from U,V */
00748         for (j=0; j<chrFilterSize; j++) {
00749             X  = vec_ld (0, &chrUSrc[j][i/2]);
00750             U  = vec_mradds (X, CCoeffs[j], U);
00751             X  = vec_ld (0, &chrVSrc[j][i/2]);
00752             V  = vec_mradds (X, CCoeffs[j], V);
00753         }
00754 
00755         /* scale and clip signals */
00756         Y0 = vec_sra (Y0, SCL);
00757         Y1 = vec_sra (Y1, SCL);
00758         U  = vec_sra (U,  SCL);
00759         V  = vec_sra (V,  SCL);
00760 
00761         Y0 = vec_clip_s16 (Y0);
00762         Y1 = vec_clip_s16 (Y1);
00763         U  = vec_clip_s16 (U);
00764         V  = vec_clip_s16 (V);
00765 
00766         /* now we have
00767            Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
00768            U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7
00769 
00770            Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
00771            U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
00772            V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
00773         */
00774 
00775         U0 = vec_mergeh (U,U);
00776         V0 = vec_mergeh (V,V);
00777 
00778         U1 = vec_mergel (U,U);
00779         V1 = vec_mergel (V,V);
00780 
00781         cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
00782         cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
00783 
00784         R  = vec_packclp (R0,R1);
00785         G  = vec_packclp (G0,G1);
00786         B  = vec_packclp (B0,B1);
00787 
00788         nout = (vector unsigned char *)scratch;
00789         switch(target) {
00790         case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
00791         case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
00792         case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
00793         case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
00794         case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
00795         case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
00796         default:
00797             /* Unreachable, I think. */
00798             av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
00799                    av_get_pix_fmt_name(c->dstFormat));
00800             return;
00801         }
00802 
00803         memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
00804     }
00805 
00806 }
00807 
00808 #define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \
00809 void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, const int16_t *lumFilter, \
00810                             const int16_t **lumSrc, int lumFilterSize, \
00811                             const int16_t *chrFilter, const int16_t **chrUSrc, \
00812                             const int16_t **chrVSrc, int chrFilterSize, \
00813                             const int16_t **alpSrc, uint8_t *dest, \
00814                             int dstW, int dstY) \
00815 { \
00816     ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, \
00817                            chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
00818                            alpSrc, dest, dstW, dstY, pixfmt); \
00819 }
00820 
00821 YUV2PACKEDX_WRAPPER(abgr,  PIX_FMT_ABGR);
00822 YUV2PACKEDX_WRAPPER(bgra,  PIX_FMT_BGRA);
00823 YUV2PACKEDX_WRAPPER(argb,  PIX_FMT_ARGB);
00824 YUV2PACKEDX_WRAPPER(rgba,  PIX_FMT_RGBA);
00825 YUV2PACKEDX_WRAPPER(rgb24, PIX_FMT_RGB24);
00826 YUV2PACKEDX_WRAPPER(bgr24, PIX_FMT_BGR24);