• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • Examples
  • File List
  • Globals

libavcodec/ppc/dsputil_ppc.c

Go to the documentation of this file.
00001 /*
00002  * Copyright (c) 2002 Brian Foley
00003  * Copyright (c) 2002 Dieter Shirley
00004  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
00005  *
00006  * This file is part of FFmpeg.
00007  *
00008  * FFmpeg is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU Lesser General Public
00010  * License as published by the Free Software Foundation; either
00011  * version 2.1 of the License, or (at your option) any later version.
00012  *
00013  * FFmpeg is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  * Lesser General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU Lesser General Public
00019  * License along with FFmpeg; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00021  */
00022 
00023 #include "libavutil/cpu.h"
00024 #include "libavcodec/dsputil.h"
00025 #include "dsputil_altivec.h"
00026 
00027 /* ***** WARNING ***** WARNING ***** WARNING ***** */
00028 /*
00029 clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a
00030 cache line size not equal to 32 bytes.
00031 Fortunately all processor used by Apple up to at least the 7450 (aka second
00032 generation G4) use 32 bytes cache line.
00033 This is due to the use of the 'dcbz' instruction. It simply clear to zero a
00034 single cache line, so you need to know the cache line size to use it !
00035 It's absurd, but it's fast...
00036 
00037 update 24/06/2003 : Apple released yesterday the G5, with a PPC970. cache line
00038 size: 128 bytes. Oups.
00039 The semantic of dcbz was changed, it always clear 32 bytes. so the function
00040 below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl,
00041 which is defined to clear a cache line (as dcbz before). So we still can
00042 distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.
00043 
00044 see <http://developer.apple.com/technotes/tn/tn2087.html>
00045 and <http://developer.apple.com/technotes/tn/tn2086.html>
00046 */
00047 static void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
00048 {
00049     register int misal = ((unsigned long)blocks & 0x00000010);
00050     register int i = 0;
00051 #if 1
00052     if (misal) {
00053         ((unsigned long*)blocks)[0] = 0L;
00054         ((unsigned long*)blocks)[1] = 0L;
00055         ((unsigned long*)blocks)[2] = 0L;
00056         ((unsigned long*)blocks)[3] = 0L;
00057         i += 16;
00058     }
00059     for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) {
00060         __asm__ volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
00061     }
00062     if (misal) {
00063         ((unsigned long*)blocks)[188] = 0L;
00064         ((unsigned long*)blocks)[189] = 0L;
00065         ((unsigned long*)blocks)[190] = 0L;
00066         ((unsigned long*)blocks)[191] = 0L;
00067         i += 16;
00068     }
00069 #else
00070     memset(blocks, 0, sizeof(DCTELEM)*6*64);
00071 #endif
00072 }
00073 
00074 /* same as above, when dcbzl clear a whole 128B cache line
00075    i.e. the PPC970 aka G5 */
00076 #if HAVE_DCBZL
00077 static void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
00078 {
00079     register int misal = ((unsigned long)blocks & 0x0000007f);
00080     register int i = 0;
00081 #if 1
00082     if (misal) {
00083         // we could probably also optimize this case,
00084         // but there's not much point as the machines
00085         // aren't available yet (2003-06-26)
00086         memset(blocks, 0, sizeof(DCTELEM)*6*64);
00087     }
00088     else
00089         for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
00090             __asm__ volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
00091         }
00092 #else
00093     memset(blocks, 0, sizeof(DCTELEM)*6*64);
00094 #endif
00095 }
00096 #else
00097 static void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
00098 {
00099     memset(blocks, 0, sizeof(DCTELEM)*6*64);
00100 }
00101 #endif
00102 
/**
 * Report how many bytes a single 'dcbzl' instruction sets to zero.
 *
 * A 1024-byte buffer is filled with 0xFF, dcbzl is executed on the address
 * in the middle of that buffer, and the zero bytes found afterwards are
 * counted.
 *
 * Update 24/06/2003: dcbzl is used here instead of dcbz to get the intended
 * effect (Apple "fixed" dcbz). Unfortunately this can only be built when the
 * assembler knows about dcbzl, hence the HAVE_DCBZL guard.
 *
 * @return the number of zero bytes found in the probe buffer; 0 when the
 *         buffer could not be allocated or when HAVE_DCBZL is not set
 */
static long check_dcbzl_effect(void)
{
#if HAVE_DCBZL
    char *fakedata = av_malloc(1024);
    char *fakedata_middle;
    long zero = 0;
    long i;
    long count = 0;

    if (!fakedata) {
        return 0L;
    }

    fakedata_middle = fakedata + 512;

    memset(fakedata, 0xFF, 1024);

    /* The "b" constraint requests an address base register, which keeps the
       operand out of r0. The "memory" clobber tells the compiler that the
       instruction changes memory behind its back, so the 0xFF fill must be
       completed before it and the buffer must be re-read after it. */
    __asm__ volatile("dcbzl %0, %1"
                     : : "b" (fakedata_middle), "r" (zero) : "memory");

    for (i = 0; i < 1024; i++) {
        if (fakedata[i] == (char)0)
            count++;
    }

    av_free(fakedata);

    return count;
#else
    return 0;
#endif
}
00144 
/**
 * Issue a 'dcbt' cache hint for the start of each of h rows.
 *
 * @param mem    address of the first row
 * @param stride distance in bytes from one row to the next
 * @param h      number of rows; nothing is touched when h <= 0
 */
static void prefetch_ppc(void *mem, int stride, int h)
{
    const uint8_t *p = mem;

    /* Pre-tested loop: a do { } while (--h) form would keep counting down
       through negative values when called with h <= 0. */
    for (; h > 0; h--) {
        __asm__ volatile ("dcbt 0,%0" : : "r" (p));
        p += stride;
    }
}
00153 
00154 void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
00155 {
00156     const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
00157 
00158     // Common optimizations whether AltiVec is available or not
00159     c->prefetch = prefetch_ppc;
00160     if (!high_bit_depth) {
00161     switch (check_dcbzl_effect()) {
00162         case 32:
00163             c->clear_blocks = clear_blocks_dcbz32_ppc;
00164             break;
00165         case 128:
00166             c->clear_blocks = clear_blocks_dcbz128_ppc;
00167             break;
00168         default:
00169             break;
00170     }
00171     }
00172 
00173 #if HAVE_ALTIVEC
00174     if(CONFIG_H264_DECODER) dsputil_h264_init_ppc(c, avctx);
00175 
00176     if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) {
00177         dsputil_init_altivec(c, avctx);
00178         float_init_altivec(c, avctx);
00179         int_init_altivec(c, avctx);
00180         c->gmc1 = gmc1_altivec;
00181 
00182 #if CONFIG_ENCODERS
00183         if (avctx->dct_algo == FF_DCT_AUTO ||
00184             avctx->dct_algo == FF_DCT_ALTIVEC) {
00185             c->fdct = fdct_altivec;
00186         }
00187 #endif //CONFIG_ENCODERS
00188 
00189         if (avctx->lowres==0) {
00190             if ((avctx->idct_algo == FF_IDCT_AUTO) ||
00191                 (avctx->idct_algo == FF_IDCT_ALTIVEC)) {
00192                 c->idct_put = idct_put_altivec;
00193                 c->idct_add = idct_add_altivec;
00194                 c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
00195             }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
00196                      avctx->idct_algo==FF_IDCT_VP3){
00197                 c->idct_put = ff_vp3_idct_put_altivec;
00198                 c->idct_add = ff_vp3_idct_add_altivec;
00199                 c->idct     = ff_vp3_idct_altivec;
00200                 c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
00201             }
00202         }
00203 
00204     }
00205 #endif /* HAVE_ALTIVEC */
00206 }

Generated on Wed Apr 11 2012 07:31:34 for FFmpeg by  doxygen 1.7.1