GPU: Add AltiVec-accelerated functions for 2D layer compositing. (For PowerPC CPUs only.)

- This improves GPU performance by up to 25% on a PowerPC 970MP.
This commit is contained in:
rogerman
2025-07-25 17:37:55 -07:00
parent 2c4ff5fea6
commit e52e3963d0
6 changed files with 3512 additions and 0 deletions

View File

@@ -59,6 +59,9 @@
#elif defined(ENABLE_NEON_A64)
#define USEVECTORSIZE_128
#define VECTORSIZE 16
#elif defined(ENABLE_ALTIVEC)
#define USEVECTORSIZE_128
#define VECTORSIZE 16
#endif
#if defined(USEVECTORSIZE_512) || defined(USEVECTORSIZE_256) || defined(USEVECTORSIZE_128)

View File

@@ -751,6 +751,8 @@ static FORCEINLINE void CopyLinesForVerticalCount(void *__restrict dstLineHead,
#include "GPU_Operations_SSE2.cpp"
#elif defined(ENABLE_NEON_A64)
#include "GPU_Operations_NEON.cpp"
#elif defined(ENABLE_ALTIVEC)
#include "GPU_Operations_AltiVec.cpp"
#else
template <bool NEEDENDIANSWAP, size_t ELEMENTSIZE>

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,122 @@
/*
Copyright (C) 2025 DeSmuME team
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
This file is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this software. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef GPU_OPERATIONS_ALTIVEC_H
#define GPU_OPERATIONS_ALTIVEC_H
#include "GPU_Operations.h"
#ifndef ENABLE_ALTIVEC
#warning This header requires PowerPC AltiVec support.
#else
// Color-math entry points for the AltiVec (PowerPC) build of the GPU code.
//
// This class only declares its operations; no bodies are provided in this
// header. It has no data members, and every operation is a FORCEINLINE const
// member function. Each operation comes in two flavors:
//   - a non-template overload that works on v128u16 colors, and
//   - a template overload, parameterized on NDSColorFormat, that works on
//     v128u32 colors.
//
// NOTE(review): the per-lane pixel layout of the vectors and the numeric range
// of the blendEVA / blendEVB / blendEVY operands are not visible from this
// header -- confirm against the definitions before relying on them.
class ColorOperation_AltiVec
{
public:
	ColorOperation_AltiVec() {}

	// blend: combines colA and colB using the blendEVA / blendEVB operands.
	FORCEINLINE v128u16 blend(
		const v128u16 &colA,
		const v128u16 &colB,
		const v128u16 &blendEVA,
		const v128u16 &blendEVB) const;

	// USECONSTANTBLENDVALUESHINT is a compile-time hint; presumably it signals
	// that the blend operands are uniform across the vector -- TODO confirm.
	template <NDSColorFormat COLORFORMAT, bool USECONSTANTBLENDVALUESHINT>
	FORCEINLINE v128u32 blend(
		const v128u32 &colA,
		const v128u32 &colB,
		const v128u16 &blendEVA,
		const v128u16 &blendEVB) const;

	// blend3D: the v128u16 overload receives colA as two v128u32 halves
	// (colA_Lo, colA_Hi) and colB as a single v128u16 vector.
	FORCEINLINE v128u16 blend3D(
		const v128u32 &colA_Lo,
		const v128u32 &colA_Hi,
		const v128u16 &colB) const;

	template <NDSColorFormat COLORFORMAT>
	FORCEINLINE v128u32 blend3D(
		const v128u32 &colA,
		const v128u32 &colB) const;

	// increase: takes a color and a blendEVY operand, returns the adjusted color.
	FORCEINLINE v128u16 increase(
		const v128u16 &col,
		const v128u16 &blendEVY) const;

	template <NDSColorFormat COLORFORMAT>
	FORCEINLINE v128u32 increase(
		const v128u32 &col,
		const v128u16 &blendEVY) const;

	// decrease: same parameter shape as increase.
	FORCEINLINE v128u16 decrease(
		const v128u16 &col,
		const v128u16 &blendEVY) const;

	template <NDSColorFormat COLORFORMAT>
	FORCEINLINE v128u32 decrease(
		const v128u32 &col,
		const v128u16 &blendEVY) const;
};
class PixelOperation_AltiVec
{
protected:
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copy16(GPUEngineCompositorInfo &compInfo, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copy32(GPUEngineCompositorInfo &compInfo, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copyMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT, bool ISDEBUGRENDER> FORCEINLINE void _copyMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp16(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUp32(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUpMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessUpMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown16(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDown32(GPUEngineCompositorInfo &compInfo, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDownMask16(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u16 &src1, const v128u16 &src0) const;
template<NDSColorFormat OUTPUTFORMAT> FORCEINLINE void _brightnessDownMask32(GPUEngineCompositorInfo &compInfo, const v128u8 &passMask8, const v128u16 &evy16, const v128u8 &srcLayerID, const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0) const;
template<NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
FORCEINLINE void _unknownEffectMask16(GPUEngineCompositorInfo &compInfo,
const v128u8 &passMask8,
const v128u16 &evy16,
const v128u8 &srcLayerID,
const v128u16 &src1, const v128u16 &src0,
const v128u8 &srcEffectEnableMask,
const v128u8 &dstBlendEnableMaskLUT,
const v128u8 &enableColorEffectMask,
const v128u8 &spriteAlpha,
const v128u8 &spriteMode) const;
template<NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE>
FORCEINLINE void _unknownEffectMask32(GPUEngineCompositorInfo &compInfo,
const v128u8 &passMask8,
const v128u16 &evy16,
const v128u8 &srcLayerID,
const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0,
const v128u8 &srcEffectEnableMask,
const v128u8 &dstBlendEnableMaskLUT,
const v128u8 &enableColorEffectMask,
const v128u8 &spriteAlpha,
const v128u8 &spriteMode) const;
public:
PixelOperation_AltiVec() {};
template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST>
FORCEINLINE void Composite16(GPUEngineCompositorInfo &compInfo,
const bool didAllPixelsPass,
const v128u8 &passMask8,
const v128u16 &evy16,
const v128u8 &srcLayerID,
const v128u16 &src1, const v128u16 &src0,
const v128u8 &srcEffectEnableMask,
const v128u8 &dstBlendEnableMaskLUT,
const u8 *__restrict enableColorEffectPtr,
const u8 *__restrict sprAlphaPtr,
const u8 *__restrict sprModePtr) const;
template <GPUCompositorMode COMPOSITORMODE, NDSColorFormat OUTPUTFORMAT, GPULayerType LAYERTYPE, bool WILLPERFORMWINDOWTEST>
FORCEINLINE void Composite32(GPUEngineCompositorInfo &compInfo,
const bool didAllPixelsPass,
const v128u8 &passMask8,
const v128u16 &evy16,
const v128u8 &srcLayerID,
const v128u32 &src3, const v128u32 &src2, const v128u32 &src1, const v128u32 &src0,
const v128u8 &srcEffectEnableMask,
const v128u8 &dstBlendEnableMaskLUT,
const u8 *__restrict enableColorEffectPtr,
const u8 *__restrict sprAlphaPtr,
const u8 *__restrict sprModePtr) const;
};
#endif // ENABLE_ALTIVEC
#endif // GPU_OPERATIONS_ALTIVEC_H

View File

@@ -4423,6 +4423,8 @@
ABDDF7C41898F024007583C1 /* Icon_DisplayToggle_420x420.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_DisplayToggle_420x420.png; path = images/Icon_DisplayToggle_420x420.png; sourceTree = "<group>"; };
ABDDF7C71898F032007583C1 /* Icon_FrameAdvance_420x420.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_FrameAdvance_420x420.png; path = images/Icon_FrameAdvance_420x420.png; sourceTree = "<group>"; };
ABDDF7C81898F032007583C1 /* Icon_FrameJump_420x420.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_FrameJump_420x420.png; path = images/Icon_FrameJump_420x420.png; sourceTree = "<group>"; };
ABDE648E2E21068500C03E0B /* GPU_Operations_AltiVec.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = GPU_Operations_AltiVec.h; sourceTree = "<group>"; };
ABDE648F2E21068500C03E0B /* GPU_Operations_AltiVec.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = GPU_Operations_AltiVec.cpp; sourceTree = "<group>"; };
ABE5DFE3143FB1DA00835AD8 /* cocoa_videofilter.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cocoa_videofilter.h; sourceTree = "<group>"; };
ABE5DFE4143FB1DA00835AD8 /* cocoa_videofilter.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = cocoa_videofilter.mm; sourceTree = "<group>"; };
ABE670251415DE6C00E8E4C9 /* tinystr.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = tinystr.cpp; sourceTree = "<group>"; };
@@ -5688,6 +5690,7 @@
AB1D4BB426E6F8D700A9AE42 /* GPU_Operations_SSE2.cpp */,
AB1D4BAF26E6F8D700A9AE42 /* GPU_Operations_AVX2.cpp */,
ABD86C842D83E20500505422 /* GPU_Operations_NEON.cpp */,
ABDE648F2E21068500C03E0B /* GPU_Operations_AltiVec.cpp */,
ABD1FEB81345AC8400AF11D1 /* lua-engine.cpp */,
ABD1FEB91345AC8400AF11D1 /* matrix.cpp */,
ABD1FEBA1345AC8400AF11D1 /* mc.cpp */,
@@ -5733,6 +5736,7 @@
AB1D4BB226E6F8D700A9AE42 /* GPU_Operations_SSE2.h */,
AB1D4BB026E6F8D700A9AE42 /* GPU_Operations_AVX2.h */,
ABD86C832D83E20500505422 /* GPU_Operations_NEON.h */,
ABDE648E2E21068500C03E0B /* GPU_Operations_AltiVec.h */,
AB796CA215CDCB6B00C59155 /* instruction_attributes.h */,
AB796CA315CDCB6B00C59155 /* instructions.h */,
ABD1FE841345AC8400AF11D1 /* lua-engine.h */,

View File

@@ -1918,6 +1918,8 @@
AB0F29A314BE7213009ABC6F /* Icon_RotateCW_420x420.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_RotateCW_420x420.png; path = images/Icon_RotateCW_420x420.png; sourceTree = "<group>"; };
AB0F29A414BE7213009ABC6F /* Icon_ShowHUD_420x420.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_ShowHUD_420x420.png; path = images/Icon_ShowHUD_420x420.png; sourceTree = "<group>"; };
AB0F29A514BE7213009ABC6F /* Icon_Speaker_420x420.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Icon_Speaker_420x420.png; path = images/Icon_Speaker_420x420.png; sourceTree = "<group>"; };
AB11AE8E2E210BB400E8A516 /* GPU_Operations_AltiVec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GPU_Operations_AltiVec.cpp; sourceTree = "<group>"; };
AB11AE8F2E210BB400E8A516 /* GPU_Operations_AltiVec.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = GPU_Operations_AltiVec.h; sourceTree = "<group>"; };
AB126D06182ECB9500EBCF22 /* slot2_passme.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = slot2_passme.cpp; sourceTree = "<group>"; };
AB142025186E2CD80015D52F /* Image_MemoryExpansionPak.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; name = Image_MemoryExpansionPak.png; path = images/Image_MemoryExpansionPak.png; sourceTree = "<group>"; };
AB1B20AB2AD5ED59007CA7EB /* slot2_hcv1000.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = slot2_hcv1000.cpp; sourceTree = "<group>"; };
@@ -3436,6 +3438,7 @@
AB6FE66C26E6F7C2002B2106 /* GPU_Operations_SSE2.cpp */,
AB6FE66A26E6F7C2002B2106 /* GPU_Operations_AVX2.cpp */,
AB3B8DC62D8A35A000C9CBFD /* GPU_Operations_NEON.cpp */,
AB11AE8E2E210BB400E8A516 /* GPU_Operations_AltiVec.cpp */,
ABD1FEB81345AC8400AF11D1 /* lua-engine.cpp */,
ABD1FEB91345AC8400AF11D1 /* matrix.cpp */,
ABD1FEBA1345AC8400AF11D1 /* mc.cpp */,
@@ -3481,6 +3484,7 @@
AB6FE66D26E6F7C2002B2106 /* GPU_Operations_SSE2.h */,
AB6FE66B26E6F7C2002B2106 /* GPU_Operations_AVX2.h */,
AB3B8DC72D8A35A000C9CBFD /* GPU_Operations_NEON.h */,
AB11AE8F2E210BB400E8A516 /* GPU_Operations_AltiVec.h */,
ABBCE29D15ACB26100A2C965 /* instruction_attributes.h */,
ABBCE29E15ACB26100A2C965 /* instructions.h */,
ABD1FE841345AC8400AF11D1 /* lua-engine.h */,