x264 source for verification 2026-05-22
This commit is contained in:
51
.gitignore
vendored
Normal file
51
.gitignore
vendored
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
*~
|
||||||
|
*.a
|
||||||
|
*.d
|
||||||
|
*.diff
|
||||||
|
*.orig
|
||||||
|
*.rej
|
||||||
|
*.dll*
|
||||||
|
*.exe
|
||||||
|
*.def
|
||||||
|
*.lib
|
||||||
|
*.pdb
|
||||||
|
*.mo
|
||||||
|
*.o
|
||||||
|
*.patch
|
||||||
|
*.pc
|
||||||
|
*.pot
|
||||||
|
*.so*
|
||||||
|
*.dylib
|
||||||
|
.*.swp
|
||||||
|
.depend
|
||||||
|
.DS_Store
|
||||||
|
TAGS
|
||||||
|
config.h
|
||||||
|
config.mak
|
||||||
|
config.log
|
||||||
|
x264_config.h
|
||||||
|
x264
|
||||||
|
checkasm
|
||||||
|
|
||||||
|
*.264
|
||||||
|
*.h264
|
||||||
|
*.2pass
|
||||||
|
*.ffindex
|
||||||
|
*.avs
|
||||||
|
*.mkv
|
||||||
|
*.flv
|
||||||
|
*.mp4
|
||||||
|
*.y4m
|
||||||
|
*.yuv
|
||||||
|
*.log
|
||||||
|
*.mbtree
|
||||||
|
*.temp
|
||||||
|
*.pyc
|
||||||
|
*.pgd
|
||||||
|
*.pgc
|
||||||
|
|
||||||
|
.digress_x264
|
||||||
|
dataDec.txt
|
||||||
|
log.dec
|
||||||
|
common/oclobj.h
|
||||||
|
x264_lookahead.clbin
|
||||||
339
.gitlab-ci.yml
Normal file
339
.gitlab-ci.yml
Normal file
@@ -0,0 +1,339 @@
|
|||||||
|
stages:
|
||||||
|
- build
|
||||||
|
- test
|
||||||
|
- release
|
||||||
|
|
||||||
|
.variables-debian-amd64: &variables-debian-amd64
|
||||||
|
_TRIPLET: ""
|
||||||
|
_PLATFORMSUFFIX: ""
|
||||||
|
_WRAPPER: ""
|
||||||
|
|
||||||
|
.variables-debian-aarch64: &variables-debian-aarch64
|
||||||
|
_TRIPLET: ""
|
||||||
|
_PLATFORMSUFFIX: ""
|
||||||
|
_WRAPPER: ""
|
||||||
|
|
||||||
|
.variables-win32: &variables-win32
|
||||||
|
_TRIPLET: "i686-w64-mingw32"
|
||||||
|
_ARCH: "i686"
|
||||||
|
_OS: "mingw32"
|
||||||
|
_PLATFORMSUFFIX: ".exe"
|
||||||
|
_WRAPPER: "wine"
|
||||||
|
|
||||||
|
.variables-win64: &variables-win64
|
||||||
|
_TRIPLET: "x86_64-w64-mingw32"
|
||||||
|
_ARCH: "x86_64"
|
||||||
|
_OS: "mingw32"
|
||||||
|
_PLATFORMSUFFIX: ".exe"
|
||||||
|
_WRAPPER: "wine"
|
||||||
|
|
||||||
|
.variables-win-armv7: &variables-win-armv7
|
||||||
|
_TRIPLET: "armv7-w64-mingw32"
|
||||||
|
_PLATFORMSUFFIX: ".exe"
|
||||||
|
_WRAPPER: ""
|
||||||
|
|
||||||
|
.variables-win-aarch64: &variables-win-aarch64
|
||||||
|
_TRIPLET: "aarch64-w64-mingw32"
|
||||||
|
_PLATFORMSUFFIX: ".exe"
|
||||||
|
_WRAPPER: ""
|
||||||
|
|
||||||
|
.variables-macos-x86_64: &variables-macos-x86_64
|
||||||
|
_TRIPLET: "x86_64-apple-darwin19"
|
||||||
|
_ARCH: "x86_64"
|
||||||
|
_OS: "darwin"
|
||||||
|
_PLATFORMSUFFIX: ""
|
||||||
|
_WRAPPER: ""
|
||||||
|
_XCFLAGS: "-arch x86_64"
|
||||||
|
_XLDFLAGS: "-arch x86_64"
|
||||||
|
_BIN_PATH: /Users/videolanci/sandbox/bin
|
||||||
|
|
||||||
|
.variables-macos-arm64: &variables-macos-arm64
|
||||||
|
_TRIPLET: "aarch64-apple-darwin19"
|
||||||
|
_ARCH: "aarch64"
|
||||||
|
_OS: "darwin"
|
||||||
|
_PLATFORMSUFFIX: ""
|
||||||
|
_WRAPPER: ""
|
||||||
|
_XCFLAGS: "-arch arm64"
|
||||||
|
_XLDFLAGS: "-arch arm64"
|
||||||
|
_BIN_PATH: /Users/videolanci/sandbox/bin
|
||||||
|
|
||||||
|
.variables-android-arm: &variables-android-arm
|
||||||
|
_TRIPLET: "arm-linux-androideabi"
|
||||||
|
_CLANG_TRIPLET: "armv7a-linux-androideabi"
|
||||||
|
_ANDROID_VERSION: "21"
|
||||||
|
_PLATFORMSUFFIX: ""
|
||||||
|
_WRAPPER: ""
|
||||||
|
|
||||||
|
.variables-android-aarch64: &variables-android-aarch64
|
||||||
|
_TRIPLET: "aarch64-linux-android"
|
||||||
|
_CLANG_TRIPLET: "aarch64-linux-android"
|
||||||
|
_ANDROID_VERSION: "21"
|
||||||
|
_PLATFORMSUFFIX: ""
|
||||||
|
_WRAPPER: ""
|
||||||
|
|
||||||
|
.build:
|
||||||
|
stage: build
|
||||||
|
script: |
|
||||||
|
set -x
|
||||||
|
LOCAL_INSTALL_DIR=`pwd`/local_install
|
||||||
|
export PKG_CONFIG_LIBDIR=${LOCAL_INSTALL_DIR}/lib/pkgconfig
|
||||||
|
git clone --depth 1 --branch master https://git.ffmpeg.org/ffmpeg.git ffmpeg
|
||||||
|
cd ffmpeg
|
||||||
|
./configure --prefix="${LOCAL_INSTALL_DIR}" --enable-pic --disable-debug --disable-programs --disable-doc --disable-avdevice --disable-avfilter --disable-network --disable-encoders --disable-muxers --extra-ldflags="-static"
|
||||||
|
make -j$(getconf _NPROCESSORS_ONLN)
|
||||||
|
make -j$(getconf _NPROCESSORS_ONLN) install
|
||||||
|
cd ..
|
||||||
|
git clone --depth 1 --branch master https://github.com/l-smash/l-smash.git lsmash
|
||||||
|
cd lsmash
|
||||||
|
./configure --prefix="${LOCAL_INSTALL_DIR}" --extra-ldflags="-static"
|
||||||
|
make -j$(getconf _NPROCESSORS_ONLN)
|
||||||
|
make -j$(getconf _NPROCESSORS_ONLN) install
|
||||||
|
cd ..
|
||||||
|
./configure --enable-pic --enable-strip --extra-ldflags="-static"
|
||||||
|
make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm
|
||||||
|
artifacts:
|
||||||
|
name: "$CI_PROJECT_PATH_SLUG-$CI_JOB_NAME-$CI_COMMIT_SHORT_SHA"
|
||||||
|
paths:
|
||||||
|
- x264${_PLATFORMSUFFIX}
|
||||||
|
- checkasm8${_PLATFORMSUFFIX}
|
||||||
|
- checkasm10${_PLATFORMSUFFIX}
|
||||||
|
- config.log
|
||||||
|
expire_in: 1 week
|
||||||
|
|
||||||
|
build-debian-amd64:
|
||||||
|
extends: .build
|
||||||
|
image: registry.videolan.org/vlc-debian-unstable:20240212151604
|
||||||
|
tags:
|
||||||
|
- docker
|
||||||
|
- amd64
|
||||||
|
variables: *variables-debian-amd64
|
||||||
|
|
||||||
|
build-debian-aarch64:
|
||||||
|
extends: .build
|
||||||
|
image: registry.videolan.org/x264-debian-unstable-aarch64:20211206141032
|
||||||
|
tags:
|
||||||
|
- docker
|
||||||
|
- aarch64
|
||||||
|
variables: *variables-debian-aarch64
|
||||||
|
|
||||||
|
.build-win:
|
||||||
|
extends: build-debian-amd64
|
||||||
|
image: registry.videolan.org/vlc-debian-llvm-msvcrt:20240212151604
|
||||||
|
script: |
|
||||||
|
set -x
|
||||||
|
LOCAL_INSTALL_DIR=`pwd`/${_TRIPLET}
|
||||||
|
export PKGCONFIG=pkg-config
|
||||||
|
export PKG_CONFIG_LIBDIR=${LOCAL_INSTALL_DIR}/lib/pkgconfig
|
||||||
|
git clone --depth 1 --branch master https://git.ffmpeg.org/ffmpeg.git ffmpeg
|
||||||
|
cd ffmpeg
|
||||||
|
./configure --prefix="${LOCAL_INSTALL_DIR}" --enable-cross-compile --arch="${_ARCH}" --target-os="${_OS}" --cross-prefix="${_TRIPLET}-" --enable-pic --disable-debug --disable-programs --disable-doc --disable-avdevice --disable-avfilter --disable-network --disable-encoders --disable-muxers
|
||||||
|
make -j$(getconf _NPROCESSORS_ONLN)
|
||||||
|
make -j$(getconf _NPROCESSORS_ONLN) install
|
||||||
|
cd ..
|
||||||
|
git clone --depth 1 --branch master https://github.com/l-smash/l-smash.git lsmash
|
||||||
|
cd lsmash
|
||||||
|
./configure --prefix="${LOCAL_INSTALL_DIR}" --target-os="${_TRIPLET}" --cross-prefix="${_TRIPLET}-"
|
||||||
|
make -j$(getconf _NPROCESSORS_ONLN)
|
||||||
|
make -j$(getconf _NPROCESSORS_ONLN) install
|
||||||
|
cd ..
|
||||||
|
./configure --host="${_TRIPLET}" --cross-prefix="${_TRIPLET}-" --enable-pic --enable-strip
|
||||||
|
make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm
|
||||||
|
|
||||||
|
build-win32:
|
||||||
|
extends: .build-win
|
||||||
|
variables: *variables-win32
|
||||||
|
|
||||||
|
build-win64:
|
||||||
|
extends: .build-win
|
||||||
|
variables: *variables-win64
|
||||||
|
|
||||||
|
.build-llvm-mingw:
|
||||||
|
extends: .build
|
||||||
|
image: registry.videolan.org/vlc-debian-llvm-ucrt:20240212151604
|
||||||
|
tags:
|
||||||
|
- docker
|
||||||
|
- amd64
|
||||||
|
script: |
|
||||||
|
set -x
|
||||||
|
PKGCONFIG=pkg-config ./configure --host="${_TRIPLET}" --cross-prefix="${_TRIPLET}-" --enable-pic --enable-strip
|
||||||
|
make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm
|
||||||
|
|
||||||
|
build-llvm-mingw-armv7:
|
||||||
|
extends: .build-llvm-mingw
|
||||||
|
variables: *variables-win-armv7
|
||||||
|
|
||||||
|
build-llvm-mingw-aarch64:
|
||||||
|
extends: .build-llvm-mingw
|
||||||
|
variables: *variables-win-aarch64
|
||||||
|
|
||||||
|
.build-macos:
|
||||||
|
extends: .build
|
||||||
|
script: |
|
||||||
|
set -x
|
||||||
|
export PATH="${_BIN_PATH}:$PATH"
|
||||||
|
LOCAL_INSTALL_DIR=`pwd`/${_TRIPLET}
|
||||||
|
export PKG_CONFIG_LIBDIR=${LOCAL_INSTALL_DIR}/lib/pkgconfig
|
||||||
|
git clone --depth 1 --branch master https://git.ffmpeg.org/ffmpeg.git ffmpeg
|
||||||
|
cd ffmpeg
|
||||||
|
./configure --prefix="${LOCAL_INSTALL_DIR}" --enable-cross-compile --arch="${_ARCH}" --target-os="${_OS}" --extra-cflags="${_XCFLAGS}" --extra-ldflags="${_XLDFLAGS}" --enable-pic --disable-debug --disable-programs --disable-doc --disable-avdevice --disable-avfilter --disable-network --disable-encoders --disable-muxers
|
||||||
|
make -j$(getconf _NPROCESSORS_ONLN)
|
||||||
|
make -j$(getconf _NPROCESSORS_ONLN) install
|
||||||
|
cd ..
|
||||||
|
git clone --depth 1 --branch master https://github.com/l-smash/l-smash.git lsmash
|
||||||
|
cd lsmash
|
||||||
|
./configure --prefix="${LOCAL_INSTALL_DIR}" --target-os="${_TRIPLET}" --extra-cflags="${_XCFLAGS}" --extra-ldflags="${_XLDFLAGS}"
|
||||||
|
make -j$(getconf _NPROCESSORS_ONLN)
|
||||||
|
make -j$(getconf _NPROCESSORS_ONLN) install
|
||||||
|
cd ..
|
||||||
|
./configure --host="${_TRIPLET}" --enable-pic --enable-strip
|
||||||
|
make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm
|
||||||
|
|
||||||
|
build-macos-x86_64:
|
||||||
|
extends: .build-macos
|
||||||
|
tags:
|
||||||
|
- amd64
|
||||||
|
- monterey
|
||||||
|
variables: *variables-macos-x86_64
|
||||||
|
|
||||||
|
build-macos-arm64:
|
||||||
|
extends: .build-macos
|
||||||
|
tags:
|
||||||
|
- amd64
|
||||||
|
- monterey
|
||||||
|
variables: *variables-macos-arm64
|
||||||
|
|
||||||
|
.build-android:
|
||||||
|
extends: .build
|
||||||
|
image: registry.videolan.org/vlc-debian-android:20241118101328
|
||||||
|
tags:
|
||||||
|
- docker
|
||||||
|
- amd64
|
||||||
|
script: |
|
||||||
|
set -x
|
||||||
|
CC=${_CLANG_TRIPLET}${_ANDROID_VERSION}-clang AR=llvm-ar RANLIB=llvm-ranlib STRIP=llvm-strip PKGCONFIG=pkg-config ./configure --host="${_TRIPLET}" --enable-pic --enable-strip
|
||||||
|
make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm
|
||||||
|
|
||||||
|
build-android-arm:
|
||||||
|
extends: .build-android
|
||||||
|
variables: *variables-android-arm
|
||||||
|
|
||||||
|
build-android-aarch64:
|
||||||
|
extends: .build-android
|
||||||
|
variables: *variables-android-aarch64
|
||||||
|
|
||||||
|
.test: &test
|
||||||
|
stage: test
|
||||||
|
script: |
|
||||||
|
set -x
|
||||||
|
${_WRAPPER} ./checkasm8${_PLATFORMSUFFIX}
|
||||||
|
${_WRAPPER} ./checkasm10${_PLATFORMSUFFIX}
|
||||||
|
artifacts:
|
||||||
|
expire_in: 10 minutes
|
||||||
|
|
||||||
|
test-debian-amd64:
|
||||||
|
<<: *test
|
||||||
|
extends: build-debian-amd64
|
||||||
|
dependencies:
|
||||||
|
- build-debian-amd64
|
||||||
|
variables: *variables-debian-amd64
|
||||||
|
|
||||||
|
test-debian-aarch64:
|
||||||
|
<<: *test
|
||||||
|
extends: build-debian-aarch64
|
||||||
|
dependencies:
|
||||||
|
- build-debian-aarch64
|
||||||
|
variables: *variables-debian-aarch64
|
||||||
|
|
||||||
|
test-win32:
|
||||||
|
<<: *test
|
||||||
|
extends: build-win32
|
||||||
|
dependencies:
|
||||||
|
- build-win32
|
||||||
|
variables: *variables-win32
|
||||||
|
|
||||||
|
test-win64:
|
||||||
|
<<: *test
|
||||||
|
extends: build-win64
|
||||||
|
dependencies:
|
||||||
|
- build-win64
|
||||||
|
variables: *variables-win64
|
||||||
|
|
||||||
|
test-macos-x86_64:
|
||||||
|
<<: *test
|
||||||
|
extends: build-macos-x86_64
|
||||||
|
dependencies:
|
||||||
|
- build-macos-x86_64
|
||||||
|
variables: *variables-macos-x86_64
|
||||||
|
|
||||||
|
test-aarch64-qemu:
|
||||||
|
<<: *test
|
||||||
|
extends: build-debian-amd64
|
||||||
|
image: registry.videolan.org/x264-debian-unstable:20231113190916
|
||||||
|
dependencies:
|
||||||
|
- build-debian-aarch64
|
||||||
|
variables: *variables-debian-amd64
|
||||||
|
script: |
|
||||||
|
set -x
|
||||||
|
for size in 128 256 512 1024 2048; do
|
||||||
|
for tool in checkasm8 checkasm10; do
|
||||||
|
qemu-aarch64 -cpu max,sve-default-vector-length=256,sve$size=on -L /usr/aarch64-linux-gnu ./$tool
|
||||||
|
done
|
||||||
|
done
|
||||||
|
|
||||||
|
.release: &release
|
||||||
|
stage: release
|
||||||
|
script: |
|
||||||
|
set -x
|
||||||
|
_VERSION=$(./version.sh | grep _VERSION -| cut -d' ' -f4-| sed 's, ,-,g' | sed 's,",,')
|
||||||
|
mv x264${_PLATFORMSUFFIX} x264-${_VERSION}${_PLATFORMSUFFIX}
|
||||||
|
when: manual
|
||||||
|
only:
|
||||||
|
- master@videolan/x264
|
||||||
|
- stable@videolan/x264
|
||||||
|
artifacts:
|
||||||
|
name: "$CI_PROJECT_PATH_SLUG-$CI_JOB_NAME-$CI_COMMIT_SHORT_SHA"
|
||||||
|
paths:
|
||||||
|
- x264-*${_PLATFORMSUFFIX}
|
||||||
|
expire_in: '10 minutes'
|
||||||
|
|
||||||
|
release-debian-amd64:
|
||||||
|
<<: *release
|
||||||
|
extends: build-debian-amd64
|
||||||
|
dependencies:
|
||||||
|
- build-debian-amd64
|
||||||
|
variables: *variables-debian-amd64
|
||||||
|
|
||||||
|
release-debian-aarch64:
|
||||||
|
<<: *release
|
||||||
|
extends: build-debian-aarch64
|
||||||
|
dependencies:
|
||||||
|
- build-debian-aarch64
|
||||||
|
variables: *variables-debian-aarch64
|
||||||
|
|
||||||
|
release-win32:
|
||||||
|
<<: *release
|
||||||
|
extends: build-win32
|
||||||
|
dependencies:
|
||||||
|
- build-win32
|
||||||
|
variables: *variables-win32
|
||||||
|
|
||||||
|
release-win64:
|
||||||
|
<<: *release
|
||||||
|
extends: build-win64
|
||||||
|
dependencies:
|
||||||
|
- build-win64
|
||||||
|
variables: *variables-win64
|
||||||
|
|
||||||
|
release-macos-x86_64:
|
||||||
|
<<: *release
|
||||||
|
extends: build-macos-x86_64
|
||||||
|
dependencies:
|
||||||
|
- build-macos-x86_64
|
||||||
|
variables: *variables-macos-x86_64
|
||||||
|
|
||||||
|
release-macos-arm64:
|
||||||
|
<<: *release
|
||||||
|
extends: build-macos-arm64
|
||||||
|
dependencies:
|
||||||
|
- build-macos-arm64
|
||||||
|
variables: *variables-macos-arm64
|
||||||
99
AUTHORS
Normal file
99
AUTHORS
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
# Contributors to x264
|
||||||
|
#
|
||||||
|
# The format of this file was inspired by the Linux kernel CREDITS file.
|
||||||
|
# Authors are listed alphabetically.
|
||||||
|
#
|
||||||
|
# The fields are: name (N), email (E), web-address (W), CVS account login (C),
|
||||||
|
# PGP key ID and fingerprint (P), description (D), and snail-mail address (S).
|
||||||
|
|
||||||
|
N: Alex Izvorski
|
||||||
|
E: aizvorski AT gmail DOT com
|
||||||
|
D: x86 asm (sse2)
|
||||||
|
|
||||||
|
N: Alex Wright
|
||||||
|
E: alexw0885 AT gmail DOT com
|
||||||
|
D: Motion estimation (subpel and mixed refs)
|
||||||
|
D: B-RDO
|
||||||
|
|
||||||
|
N: bobololo
|
||||||
|
D: Avisynth input
|
||||||
|
D: MP4 muxing
|
||||||
|
|
||||||
|
N: Christian Heine
|
||||||
|
E: sennindemokrit AT gmx DOT net
|
||||||
|
D: x86 asm
|
||||||
|
|
||||||
|
N: David Wolstencroft
|
||||||
|
D: Altivec optimizations
|
||||||
|
|
||||||
|
N: Eric Petit
|
||||||
|
E: eric.petit AT lapsus DOT org
|
||||||
|
C: titer
|
||||||
|
D: Altivec asm
|
||||||
|
D: BeOS and MacOS X ports.
|
||||||
|
S: France
|
||||||
|
|
||||||
|
N: Fiona Glaser
|
||||||
|
E: fiona AT x264 DOT com
|
||||||
|
D: Maintainer
|
||||||
|
D: All areas of encoder analysis and algorithms
|
||||||
|
D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc
|
||||||
|
D: x86 asm
|
||||||
|
S: USA
|
||||||
|
|
||||||
|
N: Gabriel Bouvigne
|
||||||
|
E: bouvigne AT mp3-tech DOT org
|
||||||
|
D: 2pass VBV
|
||||||
|
|
||||||
|
N: Guillaume Poirier
|
||||||
|
E: gpoirier CHEZ mplayerhq POINT hu
|
||||||
|
D: Altivec optimizations
|
||||||
|
S: Brittany, France
|
||||||
|
|
||||||
|
N: Henrik Gramner
|
||||||
|
E: henrik AT gramner DOT com
|
||||||
|
D: 4:2:2 chroma subsampling, x86 asm, Windows improvements, bugfixes
|
||||||
|
S: Sweden
|
||||||
|
|
||||||
|
N: Laurent Aimar
|
||||||
|
E: fenrir AT videolan DOT org
|
||||||
|
C: fenrir
|
||||||
|
D: Initial import, former maintainer
|
||||||
|
D: x86 asm (mmx/mmx2)
|
||||||
|
S: France
|
||||||
|
|
||||||
|
N: Loren Merritt
|
||||||
|
E: pengvado AT akuvian DOT org
|
||||||
|
C: pengvado
|
||||||
|
D: Maintainer
|
||||||
|
D: All areas of encoder analysis and algorithms
|
||||||
|
D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc
|
||||||
|
D: Multithreading
|
||||||
|
D: x86 asm
|
||||||
|
S: USA
|
||||||
|
|
||||||
|
N: Mans Rullgard
|
||||||
|
E: mru AT mansr DOT com
|
||||||
|
C: mru
|
||||||
|
D: Rate control
|
||||||
|
S: Southampton, UK
|
||||||
|
|
||||||
|
N: Michael Niedermayer
|
||||||
|
E: michaelni AT gmx DOT at
|
||||||
|
D: Rate control
|
||||||
|
|
||||||
|
N: Mike Matsnev
|
||||||
|
E: mike AT po DOT cs DOT msu DOT su
|
||||||
|
D: Matroska muxing
|
||||||
|
|
||||||
|
N: Min Chen
|
||||||
|
E: chenm001 AT 163 DOT com
|
||||||
|
C: chenm001
|
||||||
|
D: Win32/VC 6.0 port
|
||||||
|
D: gcc asm to nasm conversion
|
||||||
|
S: China
|
||||||
|
|
||||||
|
N: Radek Czyz
|
||||||
|
E: radoslaw AT syskin DOT cjb DOT net
|
||||||
|
D: Cached motion compensation
|
||||||
|
|
||||||
340
COPYING
Normal file
340
COPYING
Normal file
@@ -0,0 +1,340 @@
|
|||||||
|
GNU GENERAL PUBLIC LICENSE
|
||||||
|
Version 2, June 1991
|
||||||
|
|
||||||
|
Copyright (C) 1989, 1991 Free Software Foundation, Inc.
|
||||||
|
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
Everyone is permitted to copy and distribute verbatim copies
|
||||||
|
of this license document, but changing it is not allowed.
|
||||||
|
|
||||||
|
Preamble
|
||||||
|
|
||||||
|
The licenses for most software are designed to take away your
|
||||||
|
freedom to share and change it. By contrast, the GNU General Public
|
||||||
|
License is intended to guarantee your freedom to share and change free
|
||||||
|
software--to make sure the software is free for all its users. This
|
||||||
|
General Public License applies to most of the Free Software
|
||||||
|
Foundation's software and to any other program whose authors commit to
|
||||||
|
using it. (Some other Free Software Foundation software is covered by
|
||||||
|
the GNU Library General Public License instead.) You can apply it to
|
||||||
|
your programs, too.
|
||||||
|
|
||||||
|
When we speak of free software, we are referring to freedom, not
|
||||||
|
price. Our General Public Licenses are designed to make sure that you
|
||||||
|
have the freedom to distribute copies of free software (and charge for
|
||||||
|
this service if you wish), that you receive source code or can get it
|
||||||
|
if you want it, that you can change the software or use pieces of it
|
||||||
|
in new free programs; and that you know you can do these things.
|
||||||
|
|
||||||
|
To protect your rights, we need to make restrictions that forbid
|
||||||
|
anyone to deny you these rights or to ask you to surrender the rights.
|
||||||
|
These restrictions translate to certain responsibilities for you if you
|
||||||
|
distribute copies of the software, or if you modify it.
|
||||||
|
|
||||||
|
For example, if you distribute copies of such a program, whether
|
||||||
|
gratis or for a fee, you must give the recipients all the rights that
|
||||||
|
you have. You must make sure that they, too, receive or can get the
|
||||||
|
source code. And you must show them these terms so they know their
|
||||||
|
rights.
|
||||||
|
|
||||||
|
We protect your rights with two steps: (1) copyright the software, and
|
||||||
|
(2) offer you this license which gives you legal permission to copy,
|
||||||
|
distribute and/or modify the software.
|
||||||
|
|
||||||
|
Also, for each author's protection and ours, we want to make certain
|
||||||
|
that everyone understands that there is no warranty for this free
|
||||||
|
software. If the software is modified by someone else and passed on, we
|
||||||
|
want its recipients to know that what they have is not the original, so
|
||||||
|
that any problems introduced by others will not reflect on the original
|
||||||
|
authors' reputations.
|
||||||
|
|
||||||
|
Finally, any free program is threatened constantly by software
|
||||||
|
patents. We wish to avoid the danger that redistributors of a free
|
||||||
|
program will individually obtain patent licenses, in effect making the
|
||||||
|
program proprietary. To prevent this, we have made it clear that any
|
||||||
|
patent must be licensed for everyone's free use or not licensed at all.
|
||||||
|
|
||||||
|
The precise terms and conditions for copying, distribution and
|
||||||
|
modification follow.
|
||||||
|
|
||||||
|
GNU GENERAL PUBLIC LICENSE
|
||||||
|
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||||
|
|
||||||
|
0. This License applies to any program or other work which contains
|
||||||
|
a notice placed by the copyright holder saying it may be distributed
|
||||||
|
under the terms of this General Public License. The "Program", below,
|
||||||
|
refers to any such program or work, and a "work based on the Program"
|
||||||
|
means either the Program or any derivative work under copyright law:
|
||||||
|
that is to say, a work containing the Program or a portion of it,
|
||||||
|
either verbatim or with modifications and/or translated into another
|
||||||
|
language. (Hereinafter, translation is included without limitation in
|
||||||
|
the term "modification".) Each licensee is addressed as "you".
|
||||||
|
|
||||||
|
Activities other than copying, distribution and modification are not
|
||||||
|
covered by this License; they are outside its scope. The act of
|
||||||
|
running the Program is not restricted, and the output from the Program
|
||||||
|
is covered only if its contents constitute a work based on the
|
||||||
|
Program (independent of having been made by running the Program).
|
||||||
|
Whether that is true depends on what the Program does.
|
||||||
|
|
||||||
|
1. You may copy and distribute verbatim copies of the Program's
|
||||||
|
source code as you receive it, in any medium, provided that you
|
||||||
|
conspicuously and appropriately publish on each copy an appropriate
|
||||||
|
copyright notice and disclaimer of warranty; keep intact all the
|
||||||
|
notices that refer to this License and to the absence of any warranty;
|
||||||
|
and give any other recipients of the Program a copy of this License
|
||||||
|
along with the Program.
|
||||||
|
|
||||||
|
You may charge a fee for the physical act of transferring a copy, and
|
||||||
|
you may at your option offer warranty protection in exchange for a fee.
|
||||||
|
|
||||||
|
2. You may modify your copy or copies of the Program or any portion
|
||||||
|
of it, thus forming a work based on the Program, and copy and
|
||||||
|
distribute such modifications or work under the terms of Section 1
|
||||||
|
above, provided that you also meet all of these conditions:
|
||||||
|
|
||||||
|
a) You must cause the modified files to carry prominent notices
|
||||||
|
stating that you changed the files and the date of any change.
|
||||||
|
|
||||||
|
b) You must cause any work that you distribute or publish, that in
|
||||||
|
whole or in part contains or is derived from the Program or any
|
||||||
|
part thereof, to be licensed as a whole at no charge to all third
|
||||||
|
parties under the terms of this License.
|
||||||
|
|
||||||
|
c) If the modified program normally reads commands interactively
|
||||||
|
when run, you must cause it, when started running for such
|
||||||
|
interactive use in the most ordinary way, to print or display an
|
||||||
|
announcement including an appropriate copyright notice and a
|
||||||
|
notice that there is no warranty (or else, saying that you provide
|
||||||
|
a warranty) and that users may redistribute the program under
|
||||||
|
these conditions, and telling the user how to view a copy of this
|
||||||
|
License. (Exception: if the Program itself is interactive but
|
||||||
|
does not normally print such an announcement, your work based on
|
||||||
|
the Program is not required to print an announcement.)
|
||||||
|
|
||||||
|
These requirements apply to the modified work as a whole. If
|
||||||
|
identifiable sections of that work are not derived from the Program,
|
||||||
|
and can be reasonably considered independent and separate works in
|
||||||
|
themselves, then this License, and its terms, do not apply to those
|
||||||
|
sections when you distribute them as separate works. But when you
|
||||||
|
distribute the same sections as part of a whole which is a work based
|
||||||
|
on the Program, the distribution of the whole must be on the terms of
|
||||||
|
this License, whose permissions for other licensees extend to the
|
||||||
|
entire whole, and thus to each and every part regardless of who wrote it.
|
||||||
|
|
||||||
|
Thus, it is not the intent of this section to claim rights or contest
|
||||||
|
your rights to work written entirely by you; rather, the intent is to
|
||||||
|
exercise the right to control the distribution of derivative or
|
||||||
|
collective works based on the Program.
|
||||||
|
|
||||||
|
In addition, mere aggregation of another work not based on the Program
|
||||||
|
with the Program (or with a work based on the Program) on a volume of
|
||||||
|
a storage or distribution medium does not bring the other work under
|
||||||
|
the scope of this License.
|
||||||
|
|
||||||
|
3. You may copy and distribute the Program (or a work based on it,
|
||||||
|
under Section 2) in object code or executable form under the terms of
|
||||||
|
Sections 1 and 2 above provided that you also do one of the following:
|
||||||
|
|
||||||
|
a) Accompany it with the complete corresponding machine-readable
|
||||||
|
source code, which must be distributed under the terms of Sections
|
||||||
|
1 and 2 above on a medium customarily used for software interchange; or,
|
||||||
|
|
||||||
|
b) Accompany it with a written offer, valid for at least three
|
||||||
|
years, to give any third party, for a charge no more than your
|
||||||
|
cost of physically performing source distribution, a complete
|
||||||
|
machine-readable copy of the corresponding source code, to be
|
||||||
|
distributed under the terms of Sections 1 and 2 above on a medium
|
||||||
|
customarily used for software interchange; or,
|
||||||
|
|
||||||
|
c) Accompany it with the information you received as to the offer
|
||||||
|
to distribute corresponding source code. (This alternative is
|
||||||
|
allowed only for noncommercial distribution and only if you
|
||||||
|
received the program in object code or executable form with such
|
||||||
|
an offer, in accord with Subsection b above.)
|
||||||
|
|
||||||
|
The source code for a work means the preferred form of the work for
|
||||||
|
making modifications to it. For an executable work, complete source
|
||||||
|
code means all the source code for all modules it contains, plus any
|
||||||
|
associated interface definition files, plus the scripts used to
|
||||||
|
control compilation and installation of the executable. However, as a
|
||||||
|
special exception, the source code distributed need not include
|
||||||
|
anything that is normally distributed (in either source or binary
|
||||||
|
form) with the major components (compiler, kernel, and so on) of the
|
||||||
|
operating system on which the executable runs, unless that component
|
||||||
|
itself accompanies the executable.
|
||||||
|
|
||||||
|
If distribution of executable or object code is made by offering
|
||||||
|
access to copy from a designated place, then offering equivalent
|
||||||
|
access to copy the source code from the same place counts as
|
||||||
|
distribution of the source code, even though third parties are not
|
||||||
|
compelled to copy the source along with the object code.
|
||||||
|
|
||||||
|
4. You may not copy, modify, sublicense, or distribute the Program
|
||||||
|
except as expressly provided under this License. Any attempt
|
||||||
|
otherwise to copy, modify, sublicense or distribute the Program is
|
||||||
|
void, and will automatically terminate your rights under this License.
|
||||||
|
However, parties who have received copies, or rights, from you under
|
||||||
|
this License will not have their licenses terminated so long as such
|
||||||
|
parties remain in full compliance.
|
||||||
|
|
||||||
|
5. You are not required to accept this License, since you have not
|
||||||
|
signed it. However, nothing else grants you permission to modify or
|
||||||
|
distribute the Program or its derivative works. These actions are
|
||||||
|
prohibited by law if you do not accept this License. Therefore, by
|
||||||
|
modifying or distributing the Program (or any work based on the
|
||||||
|
Program), you indicate your acceptance of this License to do so, and
|
||||||
|
all its terms and conditions for copying, distributing or modifying
|
||||||
|
the Program or works based on it.
|
||||||
|
|
||||||
|
6. Each time you redistribute the Program (or any work based on the
|
||||||
|
Program), the recipient automatically receives a license from the
|
||||||
|
original licensor to copy, distribute or modify the Program subject to
|
||||||
|
these terms and conditions. You may not impose any further
|
||||||
|
restrictions on the recipients' exercise of the rights granted herein.
|
||||||
|
You are not responsible for enforcing compliance by third parties to
|
||||||
|
this License.
|
||||||
|
|
||||||
|
7. If, as a consequence of a court judgment or allegation of patent
|
||||||
|
infringement or for any other reason (not limited to patent issues),
|
||||||
|
conditions are imposed on you (whether by court order, agreement or
|
||||||
|
otherwise) that contradict the conditions of this License, they do not
|
||||||
|
excuse you from the conditions of this License. If you cannot
|
||||||
|
distribute so as to satisfy simultaneously your obligations under this
|
||||||
|
License and any other pertinent obligations, then as a consequence you
|
||||||
|
may not distribute the Program at all. For example, if a patent
|
||||||
|
license would not permit royalty-free redistribution of the Program by
|
||||||
|
all those who receive copies directly or indirectly through you, then
|
||||||
|
the only way you could satisfy both it and this License would be to
|
||||||
|
refrain entirely from distribution of the Program.
|
||||||
|
|
||||||
|
If any portion of this section is held invalid or unenforceable under
|
||||||
|
any particular circumstance, the balance of the section is intended to
|
||||||
|
apply and the section as a whole is intended to apply in other
|
||||||
|
circumstances.
|
||||||
|
|
||||||
|
It is not the purpose of this section to induce you to infringe any
|
||||||
|
patents or other property right claims or to contest validity of any
|
||||||
|
such claims; this section has the sole purpose of protecting the
|
||||||
|
integrity of the free software distribution system, which is
|
||||||
|
implemented by public license practices. Many people have made
|
||||||
|
generous contributions to the wide range of software distributed
|
||||||
|
through that system in reliance on consistent application of that
|
||||||
|
system; it is up to the author/donor to decide if he or she is willing
|
||||||
|
to distribute software through any other system and a licensee cannot
|
||||||
|
impose that choice.
|
||||||
|
|
||||||
|
This section is intended to make thoroughly clear what is believed to
|
||||||
|
be a consequence of the rest of this License.
|
||||||
|
|
||||||
|
8. If the distribution and/or use of the Program is restricted in
|
||||||
|
certain countries either by patents or by copyrighted interfaces, the
|
||||||
|
original copyright holder who places the Program under this License
|
||||||
|
may add an explicit geographical distribution limitation excluding
|
||||||
|
those countries, so that distribution is permitted only in or among
|
||||||
|
countries not thus excluded. In such case, this License incorporates
|
||||||
|
the limitation as if written in the body of this License.
|
||||||
|
|
||||||
|
9. The Free Software Foundation may publish revised and/or new versions
|
||||||
|
of the General Public License from time to time. Such new versions will
|
||||||
|
be similar in spirit to the present version, but may differ in detail to
|
||||||
|
address new problems or concerns.
|
||||||
|
|
||||||
|
Each version is given a distinguishing version number. If the Program
|
||||||
|
specifies a version number of this License which applies to it and "any
|
||||||
|
later version", you have the option of following the terms and conditions
|
||||||
|
either of that version or of any later version published by the Free
|
||||||
|
Software Foundation. If the Program does not specify a version number of
|
||||||
|
this License, you may choose any version ever published by the Free Software
|
||||||
|
Foundation.
|
||||||
|
|
||||||
|
10. If you wish to incorporate parts of the Program into other free
|
||||||
|
programs whose distribution conditions are different, write to the author
|
||||||
|
to ask for permission. For software which is copyrighted by the Free
|
||||||
|
Software Foundation, write to the Free Software Foundation; we sometimes
|
||||||
|
make exceptions for this. Our decision will be guided by the two goals
|
||||||
|
of preserving the free status of all derivatives of our free software and
|
||||||
|
of promoting the sharing and reuse of software generally.
|
||||||
|
|
||||||
|
NO WARRANTY
|
||||||
|
|
||||||
|
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
|
||||||
|
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
|
||||||
|
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
|
||||||
|
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
|
||||||
|
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||||
|
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
|
||||||
|
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
|
||||||
|
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
|
||||||
|
REPAIR OR CORRECTION.
|
||||||
|
|
||||||
|
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||||
|
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
|
||||||
|
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
|
||||||
|
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
|
||||||
|
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
|
||||||
|
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
|
||||||
|
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
|
||||||
|
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
|
||||||
|
POSSIBILITY OF SUCH DAMAGES.
|
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS
|
||||||
|
|
||||||
|
How to Apply These Terms to Your New Programs
|
||||||
|
|
||||||
|
If you develop a new program, and you want it to be of the greatest
|
||||||
|
possible use to the public, the best way to achieve this is to make it
|
||||||
|
free software which everyone can redistribute and change under these terms.
|
||||||
|
|
||||||
|
To do so, attach the following notices to the program. It is safest
|
||||||
|
to attach them to the start of each source file to most effectively
|
||||||
|
convey the exclusion of warranty; and each file should have at least
|
||||||
|
the "copyright" line and a pointer to where the full notice is found.
|
||||||
|
|
||||||
|
<one line to give the program's name and a brief idea of what it does.>
|
||||||
|
Copyright (C) <year> <name of author>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program; if not, write to the Free Software
|
||||||
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
|
||||||
|
|
||||||
|
Also add information on how to contact you by electronic and paper mail.
|
||||||
|
|
||||||
|
If the program is interactive, make it output a short notice like this
|
||||||
|
when it starts in an interactive mode:
|
||||||
|
|
||||||
|
Gnomovision version 69, Copyright (C) year name of author
|
||||||
|
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
||||||
|
This is free software, and you are welcome to redistribute it
|
||||||
|
under certain conditions; type `show c' for details.
|
||||||
|
|
||||||
|
The hypothetical commands `show w' and `show c' should show the appropriate
|
||||||
|
parts of the General Public License. Of course, the commands you use may
|
||||||
|
be called something other than `show w' and `show c'; they could even be
|
||||||
|
mouse-clicks or menu items--whatever suits your program.
|
||||||
|
|
||||||
|
You should also get your employer (if you work as a programmer) or your
|
||||||
|
school, if any, to sign a "copyright disclaimer" for the program, if
|
||||||
|
necessary. Here is a sample; alter the names:
|
||||||
|
|
||||||
|
Yoyodyne, Inc., hereby disclaims all copyright interest in the program
|
||||||
|
`Gnomovision' (which makes passes at compilers) written by James Hacker.
|
||||||
|
|
||||||
|
<signature of Ty Coon>, 1 April 1989
|
||||||
|
Ty Coon, President of Vice
|
||||||
|
|
||||||
|
This General Public License does not permit incorporating your program into
|
||||||
|
proprietary programs. If your program is a subroutine library, you may
|
||||||
|
consider it more useful to permit linking proprietary applications with the
|
||||||
|
library. If this is what you want to do, use the GNU Library General
|
||||||
|
Public License instead of this License.
|
||||||
482
Makefile
Normal file
482
Makefile
Normal file
@@ -0,0 +1,482 @@
|
|||||||
|
# Makefile
|
||||||
|
|
||||||
|
include config.mak
|
||||||
|
|
||||||
|
vpath %.c $(SRCPATH)
|
||||||
|
vpath %.h $(SRCPATH)
|
||||||
|
vpath %.S $(SRCPATH)
|
||||||
|
vpath %.asm $(SRCPATH)
|
||||||
|
vpath %.rc $(SRCPATH)
|
||||||
|
vpath %.manifest $(SRCPATH)
|
||||||
|
|
||||||
|
CFLAGS += $(CFLAGSPROF)
|
||||||
|
LDFLAGS += $(LDFLAGSPROF)
|
||||||
|
|
||||||
|
GENERATED =
|
||||||
|
|
||||||
|
all: default
|
||||||
|
default:
|
||||||
|
|
||||||
|
SRCS = common/osdep.c common/base.c common/cpu.c common/tables.c \
|
||||||
|
encoder/api.c
|
||||||
|
|
||||||
|
SRCS_X = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
|
||||||
|
common/frame.c common/dct.c common/cabac.c \
|
||||||
|
common/common.c common/rectangle.c \
|
||||||
|
common/set.c common/quant.c common/deblock.c common/vlc.c \
|
||||||
|
common/mvpred.c common/bitstream.c \
|
||||||
|
encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
|
||||||
|
encoder/set.c encoder/macroblock.c encoder/cabac.c \
|
||||||
|
encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
|
||||||
|
|
||||||
|
SRCS_8 =
|
||||||
|
|
||||||
|
SRCCLI = x264.c autocomplete.c input/input.c input/timecode.c input/raw.c \
|
||||||
|
input/y4m.c output/raw.c output/matroska.c output/matroska_ebml.c \
|
||||||
|
output/flv.c output/flv_bytestream.c filters/filters.c \
|
||||||
|
filters/video/video.c filters/video/source.c filters/video/internal.c \
|
||||||
|
filters/video/resize.c filters/video/fix_vfr_pts.c \
|
||||||
|
filters/video/select_every.c filters/video/crop.c
|
||||||
|
|
||||||
|
SRCCLI_X = filters/video/cache.c filters/video/depth.c
|
||||||
|
|
||||||
|
SRCSO =
|
||||||
|
|
||||||
|
SRCCHK_X = tools/checkasm.c
|
||||||
|
|
||||||
|
SRCEXAMPLE = example.c
|
||||||
|
|
||||||
|
OBJS =
|
||||||
|
OBJASM =
|
||||||
|
OBJSO =
|
||||||
|
OBJCLI =
|
||||||
|
OBJCHK =
|
||||||
|
OBJCHK_8 =
|
||||||
|
OBJCHK_10 =
|
||||||
|
OBJEXAMPLE =
|
||||||
|
|
||||||
|
CONFIG := $(shell cat config.h)
|
||||||
|
|
||||||
|
# Optional module sources
|
||||||
|
ifneq ($(findstring HAVE_AVS 1, $(CONFIG)),)
|
||||||
|
SRCCLI += input/avs.c
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifneq ($(findstring HAVE_THREAD 1, $(CONFIG)),)
|
||||||
|
SRCS_X += common/threadpool.c
|
||||||
|
SRCCLI_X += input/thread.c
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifneq ($(findstring HAVE_WIN32THREAD 1, $(CONFIG)),)
|
||||||
|
SRCS += common/win32thread.c
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifneq ($(findstring HAVE_LAVF 1, $(CONFIG)),)
|
||||||
|
SRCCLI += input/lavf.c
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifneq ($(findstring HAVE_FFMS 1, $(CONFIG)),)
|
||||||
|
SRCCLI += input/ffms.c
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifneq ($(findstring HAVE_GPAC 1, $(CONFIG)),)
|
||||||
|
SRCCLI += output/mp4.c
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifneq ($(findstring HAVE_LSMASH 1, $(CONFIG)),)
|
||||||
|
SRCCLI += output/mp4_lsmash.c
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifneq ($(AS),)
|
||||||
|
|
||||||
|
# MMX/SSE optims
|
||||||
|
SRCASM_X =
|
||||||
|
ifeq ($(SYS_ARCH),X86)
|
||||||
|
ARCH_X86 = yes
|
||||||
|
SRCASM_X += common/x86/dct-32.asm \
|
||||||
|
common/x86/pixel-32.asm
|
||||||
|
endif
|
||||||
|
ifeq ($(SYS_ARCH),X86_64)
|
||||||
|
ARCH_X86 = yes
|
||||||
|
SRCASM_X += common/x86/dct-64.asm \
|
||||||
|
common/x86/trellis-64.asm
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifdef ARCH_X86
|
||||||
|
SRCASM_X += common/x86/bitstream-a.asm \
|
||||||
|
common/x86/const-a.asm \
|
||||||
|
common/x86/cabac-a.asm \
|
||||||
|
common/x86/dct-a.asm \
|
||||||
|
common/x86/deblock-a.asm \
|
||||||
|
common/x86/mc-a.asm \
|
||||||
|
common/x86/mc-a2.asm \
|
||||||
|
common/x86/pixel-a.asm \
|
||||||
|
common/x86/predict-a.asm \
|
||||||
|
common/x86/quant-a.asm
|
||||||
|
SRCS_X += common/x86/mc-c.c \
|
||||||
|
common/x86/predict-c.c
|
||||||
|
|
||||||
|
OBJASM += common/x86/cpu-a.o
|
||||||
|
ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
|
||||||
|
OBJASM += $(SRCASM_X:%.asm=%-8.o) common/x86/sad-a-8.o
|
||||||
|
endif
|
||||||
|
ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
|
||||||
|
OBJASM += $(SRCASM_X:%.asm=%-10.o) common/x86/sad16-a-10.o
|
||||||
|
endif
|
||||||
|
|
||||||
|
OBJCHK += tools/checkasm-a.o
|
||||||
|
endif
|
||||||
|
|
||||||
|
# AltiVec optims
|
||||||
|
ifeq ($(SYS_ARCH),PPC)
|
||||||
|
SRCS_X += common/ppc/dct.c \
|
||||||
|
common/ppc/deblock.c \
|
||||||
|
common/ppc/mc.c \
|
||||||
|
common/ppc/pixel.c \
|
||||||
|
common/ppc/predict.c \
|
||||||
|
common/ppc/quant.c
|
||||||
|
endif
|
||||||
|
|
||||||
|
# NEON optims
|
||||||
|
ifeq ($(SYS_ARCH),ARM)
|
||||||
|
SRCASM_X = common/arm/bitstream-a.S \
|
||||||
|
common/arm/dct-a.S \
|
||||||
|
common/arm/deblock-a.S \
|
||||||
|
common/arm/mc-a.S \
|
||||||
|
common/arm/pixel-a.S \
|
||||||
|
common/arm/predict-a.S \
|
||||||
|
common/arm/quant-a.S
|
||||||
|
SRCS_X += common/arm/mc-c.c \
|
||||||
|
common/arm/predict-c.c
|
||||||
|
|
||||||
|
OBJASM += common/arm/cpu-a.o
|
||||||
|
ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
|
||||||
|
OBJASM += $(SRCASM_X:%.S=%-8.o)
|
||||||
|
endif
|
||||||
|
ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
|
||||||
|
OBJASM += $(SRCASM_X:%.S=%-10.o)
|
||||||
|
endif
|
||||||
|
|
||||||
|
OBJCHK += tools/checkasm-arm.o
|
||||||
|
endif
|
||||||
|
|
||||||
|
# AArch64 NEON and SVE/SVE2 optims
|
||||||
|
ifeq ($(SYS_ARCH),AARCH64)
|
||||||
|
SRCASM_X = common/aarch64/bitstream-a.S \
|
||||||
|
common/aarch64/cabac-a.S \
|
||||||
|
common/aarch64/dct-a.S \
|
||||||
|
common/aarch64/deblock-a.S \
|
||||||
|
common/aarch64/mc-a.S \
|
||||||
|
common/aarch64/pixel-a.S \
|
||||||
|
common/aarch64/predict-a.S \
|
||||||
|
common/aarch64/quant-a.S
|
||||||
|
ifneq ($(findstring HAVE_SVE 1, $(CONFIG)),)
|
||||||
|
SRCASM_X += common/aarch64/dct-a-sve.S \
|
||||||
|
common/aarch64/deblock-a-sve.S \
|
||||||
|
common/aarch64/mc-a-sve.S \
|
||||||
|
common/aarch64/pixel-a-sve.S
|
||||||
|
endif
|
||||||
|
ifneq ($(findstring HAVE_SVE2 1, $(CONFIG)),)
|
||||||
|
SRCASM_X += common/aarch64/dct-a-sve2.S
|
||||||
|
endif
|
||||||
|
SRCS_X += common/aarch64/asm-offsets.c \
|
||||||
|
common/aarch64/mc-c.c \
|
||||||
|
common/aarch64/predict-c.c
|
||||||
|
|
||||||
|
OBJASM +=
|
||||||
|
ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
|
||||||
|
OBJASM += $(SRCASM_X:%.S=%-8.o)
|
||||||
|
endif
|
||||||
|
ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
|
||||||
|
OBJASM += $(SRCASM_X:%.S=%-10.o)
|
||||||
|
endif
|
||||||
|
|
||||||
|
OBJCHK += tools/checkasm-aarch64.o
|
||||||
|
endif
|
||||||
|
|
||||||
|
# RISCV64 RVV optims
|
||||||
|
ifeq ($(SYS_ARCH),RISCV64)
|
||||||
|
ifneq ($(findstring HAVE_RVV 1, $(CONFIG)),)
|
||||||
|
SRCASM_X =
|
||||||
|
|
||||||
|
SRCS_X +=
|
||||||
|
|
||||||
|
OBJASM +=
|
||||||
|
ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
|
||||||
|
OBJASM += $(SRCASM_X:%.S=%-8.o)
|
||||||
|
endif
|
||||||
|
ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
|
||||||
|
OBJASM += $(SRCASM_X:%.S=%-10.o)
|
||||||
|
endif
|
||||||
|
|
||||||
|
OBJCHK +=
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
# MSA optims
|
||||||
|
ifeq ($(SYS_ARCH),MIPS)
|
||||||
|
ifneq ($(findstring HAVE_MSA 1, $(CONFIG)),)
|
||||||
|
SRCS_X += common/mips/dct-c.c \
|
||||||
|
common/mips/deblock-c.c \
|
||||||
|
common/mips/mc-c.c \
|
||||||
|
common/mips/pixel-c.c \
|
||||||
|
common/mips/predict-c.c \
|
||||||
|
common/mips/quant-c.c
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
# LOONGARCH optimization
|
||||||
|
ifeq ($(SYS_ARCH),LOONGARCH)
|
||||||
|
ifneq ($(findstring HAVE_LSX 1, $(CONFIG)),)
|
||||||
|
SRCASM_X += common/loongarch/deblock-a.S \
|
||||||
|
common/loongarch/sad-a.S \
|
||||||
|
common/loongarch/predict-a.S \
|
||||||
|
common/loongarch/quant-a.S \
|
||||||
|
common/loongarch/mc-a.S \
|
||||||
|
common/loongarch/dct-a.S \
|
||||||
|
common/loongarch/pixel-a.S
|
||||||
|
|
||||||
|
SRCS_X += common/loongarch/predict-c.c \
|
||||||
|
common/loongarch/mc-c.c \
|
||||||
|
common/loongarch/pixel-c.c
|
||||||
|
|
||||||
|
OBJASM +=
|
||||||
|
ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
|
||||||
|
OBJASM += $(SRCASM_X:%.S=%-8.o)
|
||||||
|
endif
|
||||||
|
ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
|
||||||
|
OBJASM += $(SRCASM_X:%.S=%-10.o)
|
||||||
|
endif
|
||||||
|
|
||||||
|
OBJCHK += tools/checkasm-loongarch.o
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifneq ($(HAVE_GETOPT_LONG),1)
|
||||||
|
SRCCLI += extras/getopt.c
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(SYS),WINDOWS)
|
||||||
|
OBJCLI += $(if $(RC), x264res.o)
|
||||||
|
ifneq ($(SONAME),)
|
||||||
|
SRCSO += x264dll.c
|
||||||
|
OBJSO += $(if $(RC), x264res.dll.o)
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(HAVE_OPENCL),yes)
|
||||||
|
common/oclobj.h: common/opencl/x264-cl.h $(wildcard $(SRCPATH)/common/opencl/*.cl)
|
||||||
|
cat $^ | $(SRCPATH)/tools/cltostr.sh $@
|
||||||
|
GENERATED += common/oclobj.h
|
||||||
|
SRCS_8 += common/opencl.c encoder/slicetype-cl.c
|
||||||
|
endif
|
||||||
|
|
||||||
|
OBJS += $(SRCS:%.c=%.o)
|
||||||
|
OBJCLI += $(SRCCLI:%.c=%.o)
|
||||||
|
OBJSO += $(SRCSO:%.c=%.o)
|
||||||
|
OBJEXAMPLE += $(SRCEXAMPLE:%.c=%.o)
|
||||||
|
|
||||||
|
ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
|
||||||
|
OBJS += $(SRCS_X:%.c=%-8.o) $(SRCS_8:%.c=%-8.o)
|
||||||
|
OBJCLI += $(SRCCLI_X:%.c=%-8.o)
|
||||||
|
OBJCHK_8 += $(SRCCHK_X:%.c=%-8.o)
|
||||||
|
checkasm: checkasm8$(EXE)
|
||||||
|
endif
|
||||||
|
ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
|
||||||
|
OBJS += $(SRCS_X:%.c=%-10.o)
|
||||||
|
OBJCLI += $(SRCCLI_X:%.c=%-10.o)
|
||||||
|
OBJCHK_10 += $(SRCCHK_X:%.c=%-10.o)
|
||||||
|
checkasm: checkasm10$(EXE)
|
||||||
|
endif
|
||||||
|
|
||||||
|
# .PHONY takes literal target names and does not pattern-match them, so the
# install-* and lib-* targets defined in this file are listed explicitly.
.PHONY: all default fprofiled clean distclean install install-cli install-lib-dev install-lib-static install-lib-shared install-bashcompletion uninstall cli lib-static lib-shared checkasm etags
|
||||||
|
|
||||||
|
cli: x264$(EXE)
|
||||||
|
lib-static: $(LIBX264)
|
||||||
|
lib-shared: $(SONAME)
|
||||||
|
|
||||||
|
$(LIBX264): $(OBJS) $(OBJASM)
|
||||||
|
rm -f $(LIBX264)
|
||||||
|
$(AR)$@ $(OBJS) $(OBJASM)
|
||||||
|
$(if $(RANLIB), $(RANLIB) $@)
|
||||||
|
|
||||||
|
$(SONAME): $(OBJS) $(OBJASM) $(OBJSO)
|
||||||
|
$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
|
||||||
|
|
||||||
|
$(IMPLIBNAME): $(SONAME)
|
||||||
|
|
||||||
|
ifneq ($(EXE),)
|
||||||
|
.PHONY: x264 checkasm8 checkasm10 example
|
||||||
|
x264: x264$(EXE)
|
||||||
|
checkasm8: checkasm8$(EXE)
|
||||||
|
checkasm10: checkasm10$(EXE)
|
||||||
|
example: example$(EXE)
|
||||||
|
endif
|
||||||
|
|
||||||
|
x264$(EXE): $(OBJCLI) $(CLI_LIBX264)
|
||||||
|
$(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
|
||||||
|
|
||||||
|
checkasm8$(EXE): $(OBJCHK) $(OBJCHK_8) $(LIBX264)
|
||||||
|
$(LD)$@ $(OBJCHK) $(OBJCHK_8) $(LIBX264) $(LDFLAGS)
|
||||||
|
|
||||||
|
checkasm10$(EXE): $(OBJCHK) $(OBJCHK_10) $(LIBX264)
|
||||||
|
$(LD)$@ $(OBJCHK) $(OBJCHK_10) $(LIBX264) $(LDFLAGS)
|
||||||
|
|
||||||
|
example$(EXE): $(OBJEXAMPLE) $(LIBX264)
|
||||||
|
$(LD)$@ $(OBJEXAMPLE) $(LIBX264) $(LDFLAGS)
|
||||||
|
|
||||||
|
$(OBJS) $(OBJSO): CFLAGS += $(CFLAGSSO)
|
||||||
|
$(OBJCLI): CFLAGS += $(CFLAGSCLI)
|
||||||
|
|
||||||
|
ALLOBJS = $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK) $(OBJCHK_8) $(OBJCHK_10) $(OBJEXAMPLE)
|
||||||
|
$(ALLOBJS): $(GENERATED)
|
||||||
|
|
||||||
|
%.o: %.c
|
||||||
|
$(DEPCMD)
|
||||||
|
$(CC) $(CFLAGS) -c $< $(CC_O) $(DEPFLAGS)
|
||||||
|
|
||||||
|
%-8.o: %.c
|
||||||
|
$(DEPCMD)
|
||||||
|
$(CC) $(CFLAGS) -c $< $(CC_O) $(DEPFLAGS) -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8
|
||||||
|
|
||||||
|
%-10.o: %.c
|
||||||
|
$(DEPCMD)
|
||||||
|
$(CC) $(CFLAGS) -c $< $(CC_O) $(DEPFLAGS) -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10
|
||||||
|
|
||||||
|
%.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
|
||||||
|
$(AS) $(ASFLAGS) -o $@ $< -MD $(@:.o=.d)
|
||||||
|
-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile
|
||||||
|
|
||||||
|
%-8.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
|
||||||
|
$(AS) $(ASFLAGS) -o $@ $< -MD $(@:.o=.d) -DBIT_DEPTH=8 -Dprivate_prefix=x264_8
|
||||||
|
-@ $(if $(STRIP), $(STRIP) -x $@)
|
||||||
|
|
||||||
|
%-10.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
|
||||||
|
$(AS) $(ASFLAGS) -o $@ $< -MD $(@:.o=.d) -DBIT_DEPTH=10 -Dprivate_prefix=x264_10
|
||||||
|
-@ $(if $(STRIP), $(STRIP) -x $@)
|
||||||
|
|
||||||
|
%.o: %.S
|
||||||
|
$(DEPCMD)
|
||||||
|
$(AS) $(ASFLAGS) -o $@ $< $(DEPFLAGS)
|
||||||
|
-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile
|
||||||
|
|
||||||
|
%-8.o: %.S
|
||||||
|
$(DEPCMD)
|
||||||
|
$(AS) $(ASFLAGS) -o $@ $< $(DEPFLAGS) -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8
|
||||||
|
-@ $(if $(STRIP), $(STRIP) -x $@)
|
||||||
|
|
||||||
|
%-10.o: %.S
|
||||||
|
$(DEPCMD)
|
||||||
|
$(AS) $(ASFLAGS) -o $@ $< $(DEPFLAGS) -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10
|
||||||
|
-@ $(if $(STRIP), $(STRIP) -x $@)
|
||||||
|
|
||||||
|
%.dll.o: %.rc x264.h
|
||||||
|
$(RC) $(RCFLAGS)$@ -DDLL $<
|
||||||
|
|
||||||
|
%.o: %.rc x264.h x264res.manifest
|
||||||
|
$(RC) $(RCFLAGS)$@ $<
|
||||||
|
|
||||||
|
config.mak:
|
||||||
|
./configure
|
||||||
|
|
||||||
|
# This is kept as a no-op
|
||||||
|
depend:
|
||||||
|
@echo "make depend" is handled implicitly now
|
||||||
|
|
||||||
|
-include $(wildcard $(ALLOBJS:.o=.d))
|
||||||
|
|
||||||
|
# Dummy rule to avoid failing, if the dependency files specify dependencies on
|
||||||
|
# a removed .h file.
|
||||||
|
%.h:
|
||||||
|
@:
|
||||||
|
|
||||||
|
OBJPROF = $(OBJS) $(OBJSO) $(OBJCLI)
|
||||||
|
# These should cover most of the important codepaths
|
||||||
|
OPT0 = --crf 30 -b1 -m1 -r1 --me dia --no-cabac --direct temporal --ssim --no-weightb
|
||||||
|
OPT1 = --crf 16 -b2 -m3 -r3 --me hex --no-8x8dct --direct spatial --no-dct-decimate -t0 --slice-max-mbs 50
|
||||||
|
OPT2 = --crf 26 -b4 -m5 -r2 --me hex --cqm jvt --nr 100 --psnr --no-mixed-refs --b-adapt 2 --slice-max-size 1500
|
||||||
|
OPT3 = --crf 18 -b3 -m9 -r5 --me umh -t1 -A all --b-pyramid normal --direct auto --no-fast-pskip --no-mbtree
|
||||||
|
OPT4 = --crf 22 -b3 -m7 -r4 --me esa -t2 -A all --psy-rd 1.0:1.0 --slices 4
|
||||||
|
OPT5 = --frames 50 --crf 24 -b3 -m10 -r3 --me tesa -t2
|
||||||
|
OPT6 = --frames 50 -q0 -m9 -r2 --me hex -Aall
|
||||||
|
OPT7 = --frames 50 -q0 -m2 -r1 --me hex --no-cabac
|
||||||
|
|
||||||
|
ifeq (,$(VIDS))
|
||||||
|
fprofiled:
|
||||||
|
@echo 'usage: make fprofiled VIDS="infile1 infile2 ..."'
|
||||||
|
@echo 'where infiles are anything that x264 understands,'
|
||||||
|
@echo 'i.e. YUV with resolution in the filename, y4m, or avisynth.'
|
||||||
|
else
|
||||||
|
fprofiled: clean
|
||||||
|
$(MAKE) x264$(EXE) CFLAGSPROF="$(PROF_GEN_CC)" LDFLAGSPROF="$(PROF_GEN_LD)"
|
||||||
|
$(foreach V, $(VIDS), $(foreach I, 0 1 2 3 4 5 6 7, ./x264$(EXE) $(OPT$I) --threads 1 $(V) -o $(DEVNULL) ;))
|
||||||
|
ifeq ($(COMPILER),CL)
|
||||||
|
# Because Visual Studio timestamps the object files within the PGD, it fails to build if they change - only the executable should be deleted
|
||||||
|
rm -f x264$(EXE)
|
||||||
|
else
|
||||||
|
rm -f $(OBJPROF)
|
||||||
|
endif
|
||||||
|
$(MAKE) CFLAGSPROF="$(PROF_USE_CC)" LDFLAGSPROF="$(PROF_USE_LD)"
|
||||||
|
rm -f $(OBJPROF:%.o=%.gcda) $(OBJPROF:%.o=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc
|
||||||
|
endif
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(GENERATED) TAGS
|
||||||
|
rm -f $(SONAME) *.a *.lib *.exp *.pdb x264$(EXE) x264_lookahead.clbin
|
||||||
|
rm -f checkasm8$(EXE) checkasm10$(EXE) $(OBJCHK) $(OBJCHK_8) $(OBJCHK_10)
|
||||||
|
rm -f example$(EXE) $(OBJEXAMPLE)
|
||||||
|
rm -f $(OBJPROF:%.o=%.gcda) $(OBJPROF:%.o=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc
|
||||||
|
rm -f $(ALLOBJS:%.o=%.d)
|
||||||
|
|
||||||
|
distclean: clean
|
||||||
|
rm -f config.mak x264_config.h config.h config.log x264.pc x264.def
|
||||||
|
rm -rf conftest*
|
||||||
|
|
||||||
|
install-cli: cli
|
||||||
|
$(INSTALL) -d $(DESTDIR)$(bindir)
|
||||||
|
$(INSTALL) x264$(EXE) $(DESTDIR)$(bindir)
|
||||||
|
|
||||||
|
install-lib-dev:
|
||||||
|
$(INSTALL) -d $(DESTDIR)$(includedir)
|
||||||
|
$(INSTALL) -d $(DESTDIR)$(libdir)/pkgconfig
|
||||||
|
$(INSTALL) -m 644 $(SRCPATH)/x264.h x264_config.h $(DESTDIR)$(includedir)
|
||||||
|
$(INSTALL) -m 644 x264.pc $(DESTDIR)$(libdir)/pkgconfig
|
||||||
|
|
||||||
|
install-lib-static: lib-static install-lib-dev
|
||||||
|
$(INSTALL) -d $(DESTDIR)$(libdir)
|
||||||
|
$(INSTALL) -m 644 $(LIBX264) $(DESTDIR)$(libdir)
|
||||||
|
$(if $(RANLIB), $(RANLIB) $(DESTDIR)$(libdir)/$(LIBX264))
|
||||||
|
|
||||||
|
install-lib-shared: lib-shared install-lib-dev
|
||||||
|
$(INSTALL) -d $(DESTDIR)$(libdir)
|
||||||
|
ifneq ($(IMPLIBNAME),)
|
||||||
|
$(INSTALL) -d $(DESTDIR)$(bindir)
|
||||||
|
$(INSTALL) -m 755 $(SONAME) $(DESTDIR)$(bindir)
|
||||||
|
$(INSTALL) -m 644 $(IMPLIBNAME) $(DESTDIR)$(libdir)
|
||||||
|
else ifneq ($(SONAME),)
|
||||||
|
ln -f -s $(SONAME) $(DESTDIR)$(libdir)/libx264.$(SOSUFFIX)
|
||||||
|
$(INSTALL) -m 755 $(SONAME) $(DESTDIR)$(libdir)
|
||||||
|
endif
|
||||||
|
|
||||||
|
install-bashcompletion:
|
||||||
|
ifneq ($(BASHCOMPLETIONSDIR),)
|
||||||
|
$(INSTALL) -d $(DESTDIR)$(BASHCOMPLETIONSDIR)
|
||||||
|
$(INSTALL) -m 644 $(SRCPATH)/tools/bash-autocomplete.sh $(DESTDIR)$(BASHCOMPLETIONSDIR)/x264
|
||||||
|
endif
|
||||||
|
|
||||||
|
uninstall:
|
||||||
|
# Remove the static library under the same name install-lib-static used
# ($(LIBX264)); addprefix yields nothing if that variable is empty.
rm -f $(DESTDIR)$(includedir)/x264.h $(DESTDIR)$(includedir)/x264_config.h $(addprefix $(DESTDIR)$(libdir)/,$(LIBX264))
|
||||||
|
rm -f $(DESTDIR)$(bindir)/x264$(EXE) $(DESTDIR)$(libdir)/pkgconfig/x264.pc
|
||||||
|
ifneq ($(IMPLIBNAME),)
|
||||||
|
rm -f $(DESTDIR)$(bindir)/$(SONAME) $(DESTDIR)$(libdir)/$(IMPLIBNAME)
|
||||||
|
else ifneq ($(SONAME),)
|
||||||
|
rm -f $(DESTDIR)$(libdir)/$(SONAME) $(DESTDIR)$(libdir)/libx264.$(SOSUFFIX)
|
||||||
|
endif
|
||||||
|
ifneq ($(BASHCOMPLETIONSDIR),)
|
||||||
|
rm -f $(DESTDIR)$(BASHCOMPLETIONSDIR)/x264
|
||||||
|
endif
|
||||||
|
|
||||||
|
etags TAGS:
|
||||||
|
etags $(SRCS) $(SRCS_X) $(SRCS_8)
|
||||||
408
autocomplete.c
Normal file
408
autocomplete.c
Normal file
@@ -0,0 +1,408 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* autocomplete: x264cli shell autocomplete
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2018-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Henrik Gramner <henrik@gramner.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "x264cli.h"
|
||||||
|
#include "input/input.h"
|
||||||
|
|
||||||
|
#if HAVE_LAVF
|
||||||
|
#undef DECLARE_ALIGNED
|
||||||
|
#include <libavformat/avformat.h>
|
||||||
|
#include <libavutil/pixdesc.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* NULL-terminated table of level name strings, "1" through "6.2" plus "1b".
 * NOTE(review): presumably the values accepted for --level; confirm against
 * the code that consumes this table. */
static const char * const level_names[] =
|
||||||
|
{
|
||||||
|
"1", "1.1", "1.2", "1.3", "1b",
|
||||||
|
"2", "2.1", "2.2",
|
||||||
|
"3", "3.1", "3.2",
|
||||||
|
"4", "4.1", "4.2",
|
||||||
|
"5", "5.1", "5.2",
|
||||||
|
"6", "6.1", "6.2",
|
||||||
|
NULL
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Options requiring a value for which we provide suggestions.
 * The table is NULL-terminated; where an option has a short spelling it is
 * listed on the same line as the long one (e.g. "--pass", "-p"). */
|
||||||
|
static const char * const opts_suggest[] =
|
||||||
|
{
|
||||||
|
"--alternative-transfer",
|
||||||
|
"--aq-mode",
|
||||||
|
"--asm",
|
||||||
|
"--avcintra-class",
|
||||||
|
"--avcintra-flavor",
|
||||||
|
"--b-adapt",
|
||||||
|
"--b-pyramid",
|
||||||
|
"--colormatrix",
|
||||||
|
"--colorprim",
|
||||||
|
"--cqm",
|
||||||
|
"--demuxer",
|
||||||
|
"--direct",
|
||||||
|
"--frame-packing",
|
||||||
|
"--input-csp",
|
||||||
|
"--input-fmt",
|
||||||
|
"--input-range",
|
||||||
|
"--level",
|
||||||
|
"--log-level",
|
||||||
|
"--me",
|
||||||
|
"--muxer",
|
||||||
|
"--nal-hrd",
|
||||||
|
"--output-csp",
|
||||||
|
"--overscan",
|
||||||
|
"--pass", "-p",
|
||||||
|
"--preset",
|
||||||
|
"--profile",
|
||||||
|
"--pulldown",
|
||||||
|
"--range",
|
||||||
|
"--subme", "-m",
|
||||||
|
"--transfer",
|
||||||
|
"--trellis", "-t",
|
||||||
|
"--tune",
|
||||||
|
"--videoformat",
|
||||||
|
"--weightp",
|
||||||
|
NULL
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Options requiring a value for which we don't provide suggestions.
 * The table is NULL-terminated; alternate spellings of the same option share
 * a line (e.g. "--bframes", "-b" and "--video-filter", "--vf"). */
|
||||||
|
static const char * const opts_nosuggest[] =
|
||||||
|
{
|
||||||
|
"--b-bias",
|
||||||
|
"--bframes", "-b",
|
||||||
|
"--deblock", "-f",
|
||||||
|
"--bitrate", "-B",
|
||||||
|
"--chroma-qp-offset",
|
||||||
|
"--chromaloc",
|
||||||
|
"--cplxblur",
|
||||||
|
"--cqm4",
|
||||||
|
"--cqm4i",
|
||||||
|
"--cqm4ic",
|
||||||
|
"--cqm4iy",
|
||||||
|
"--cqm4p",
|
||||||
|
"--cqm4pc",
|
||||||
|
"--cqm4py",
|
||||||
|
"--cqm8",
|
||||||
|
"--cqm8i",
|
||||||
|
"--cqm8p",
|
||||||
|
"--crf",
|
||||||
|
"--crf-max",
|
||||||
|
"--crop-rect",
|
||||||
|
"--deadzone-inter",
|
||||||
|
"--deadzone-intra",
|
||||||
|
"--fps",
|
||||||
|
"--frames",
|
||||||
|
"--input-depth",
|
||||||
|
"--input-res",
|
||||||
|
"--ipratio",
|
||||||
|
"--keyint", "-I",
|
||||||
|
"--lookahead-threads",
|
||||||
|
"--mastering-display",
|
||||||
|
"--cll",
|
||||||
|
"--merange",
|
||||||
|
"--min-keyint", "-i",
|
||||||
|
"--mvrange",
|
||||||
|
"--mvrange-thread",
|
||||||
|
"--nr",
|
||||||
|
"--opencl-device",
|
||||||
|
"--output-depth",
|
||||||
|
"--partitions", "-A",
|
||||||
|
"--pbratio",
|
||||||
|
"--psy-rd",
|
||||||
|
"--qblur",
|
||||||
|
"--qcomp",
|
||||||
|
"--qp", "-q",
|
||||||
|
"--qpmax",
|
||||||
|
"--qpmin",
|
||||||
|
"--qpstep",
|
||||||
|
"--ratetol",
|
||||||
|
"--ref", "-r",
|
||||||
|
"--rc-lookahead",
|
||||||
|
"--sar",
|
||||||
|
"--scenecut",
|
||||||
|
"--seek",
|
||||||
|
"--slices",
|
||||||
|
"--slices-max",
|
||||||
|
"--slice-max-size",
|
||||||
|
"--slice-max-mbs",
|
||||||
|
"--slice-min-mbs",
|
||||||
|
"--sps-id",
|
||||||
|
"--sync-lookahead",
|
||||||
|
"--threads",
|
||||||
|
"--timebase",
|
||||||
|
"--vbv-bufsize",
|
||||||
|
"--vbv-init",
|
||||||
|
"--vbv-maxrate",
|
||||||
|
"--video-filter", "--vf",
|
||||||
|
"--zones",
|
||||||
|
NULL
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Options requiring a filename (NULL-terminated). */
static const char * const opts_filename[] =
{
    "--cqmfile",
    "--dump-yuv",
    "--index",
    "--opencl-clbin",
    "--output", "-o",
    "--qpfile",
    "--stats",
    "--tcfile-in",
    "--tcfile-out",
    NULL
};
|
||||||
|
|
||||||
|
/* Options without an associated value (NULL-terminated). */
static const char * const opts_standalone[] =
{
    "--8x8dct",
    "--aud",
    "--bff",
    "--bluray-compat",
    "--cabac",
    "--constrained-intra",
    "--cpu-independent",
    "--dts-compress",
    "--fake-interlaced",
    "--fast-pskip",
    "--filler",
    "--force-cfr",
    "--mbtree",
    "--mixed-refs",
    "--no-8x8dct",
    "--no-asm",
    "--no-cabac",
    "--no-chroma-me",
    "--no-dct-decimate",
    "--no-deblock",
    "--no-fast-pskip",
    "--no-mbtree",
    "--no-mixed-refs",
    "--no-progress",
    "--no-psy",
    "--no-scenecut",
    "--no-weightb",
    "--non-deterministic",
    "--open-gop",
    "--opencl",
    "--pic-struct",
    "--psnr",
    "--quiet",
    "--sliced-threads",
    "--slow-firstpass",
    "--ssim",
    "--stitchable",
    "--tff",
    "--thread-input",
    "--verbose", "-v",
    "--weightb",
    NULL
};
|
||||||
|
|
||||||
|
/* Options which shouldn't be suggested in combination with other options
 * (NULL-terminated). */
static const char * const opts_special[] =
{
    "--fullhelp",
    "--help", "-h",
    "--longhelp",
    "--version",
    NULL
};
|
||||||
|
|
||||||
|
/* Return 1 if `s` is exactly equal (strcmp) to one of the entries of the
 * NULL-terminated string array `list`, and 0 otherwise.
 * An empty `s` never matches; `s` itself must not be NULL. */
static int list_contains( const char * const *list, const char *s )
{
    if( *s )
        for( ; *list; list++ )
            if( !strcmp( *list, s ) )
                return 1;
    return 0;
}
|
||||||
|
|
||||||
|
/* Print `s` followed by a space to stdout when it is a non-empty string whose
 * first `cur_len` characters equal those of `cur` (case-sensitive).
 * A NULL or empty `s` prints nothing. */
static void suggest( const char *s, const char *cur, int cur_len )
{
    if( s && *s && !strncmp( s, cur, cur_len ) )
        printf( "%s ", s );
}
|
||||||
|
|
||||||
|
/* Like suggest(), but the prefix comparison is case-insensitive (strncasecmp)
 * and the candidate is printed with ASCII 'A'-'Z' folded to lower case.
 * Other characters are printed unchanged. A trailing space is appended. */
static void suggest_lower( const char *s, const char *cur, int cur_len )
{
    if( s && *s && !strncasecmp( s, cur, cur_len ) )
    {
        for( ; *s; s++ )
            putchar( *s < 'A' || *s > 'Z' ? *s : *s | 0x20 ); /* setting bit 0x20 maps 'A'-'Z' to 'a'-'z' */
        putchar( ' ' );
    }
}
|
||||||
|
|
||||||
|
/* Offer every integer in [start, end] (inclusive) as a candidate: each value
 * is formatted in decimal and passed through suggest(), so only values whose
 * text begins with `cur` are printed. */
static void suggest_num_range( int start, int end, const char *cur, int cur_len )
{
    char buf[16];
    for( int i = start; i <= end; i++ )
    {
        snprintf( buf, sizeof( buf ), "%d", i );
        suggest( buf, cur, cur_len );
    }
}
|
||||||
|
|
||||||
|
#if HAVE_LAVF
/* Suggest each token in a string separated by delimiters.
 * Every non-empty token of `s` (split at `delim`) that is at least `cur_len`
 * characters long and starts with `cur` is printed followed by a space.
 * The text after the last delimiter (or the whole string if there is none)
 * is handed to suggest(). */
static void suggest_token( const char *s, int delim, const char *cur, int cur_len )
{
    if( s && *s )
    {
        for( const char *tok_end; (tok_end = strchr( s, delim )); s = tok_end + 1 )
        {
            int tok_len = (int)(tok_end - s);
            /* The length check keeps strncmp from matching across the delimiter. */
            if( tok_len && tok_len >= cur_len && !strncmp( s, cur, cur_len ) )
                printf( "%.*s ", tok_len, s );
        }
        suggest( s, cur, cur_len );
    }
}
#endif
|
||||||
|
|
||||||
|
/* Dispatch helpers for x264_cli_autocomplete(). OPT/OPT2 expand to an
 * `else if` branch taken when the previous word `prev` equals the given
 * option name(s); OPT_TYPE tests `prev` against one of the opts_* tables. */
#define OPT( opt ) else if( !strcmp( prev, opt ) )
#define OPT2( opt1, opt2 ) else if( !strcmp( prev, opt1 ) || !strcmp( prev, opt2 ) )
#define OPT_TYPE( type ) list_contains( opts_##type, prev )

/* Shorthands that implicitly pass the local variables `cur` and `cur_len`.
 * They shadow the functions of the same name from here on. */
#define suggest( s ) suggest( s, cur, cur_len )
#define suggest_lower( s ) suggest_lower( s, cur, cur_len )
#define suggest_list( list ) for( const char * const *s = list; *s; s++ ) suggest( *s )
#define suggest_num_range( start, end ) suggest_num_range( start, end, cur, cur_len )
#define suggest_token( s, delim ) suggest_token( s, delim, cur, cur_len )
|
||||||
|
|
||||||
|
int x264_cli_autocomplete( const char *prev, const char *cur )
|
||||||
|
{
|
||||||
|
int cur_len = strlen( cur );
|
||||||
|
if( 0 );
|
||||||
|
OPT( "--alternative-transfer" )
|
||||||
|
suggest_list( x264_transfer_names );
|
||||||
|
OPT( "--aq-mode" )
|
||||||
|
suggest_num_range( 0, 3 );
|
||||||
|
OPT( "--asm" )
|
||||||
|
for( const x264_cpu_name_t *cpu = x264_cpu_names; cpu->flags; cpu++ )
|
||||||
|
suggest_lower( cpu->name );
|
||||||
|
OPT( "--avcintra-class" )
|
||||||
|
suggest_list( x264_avcintra_class_names );
|
||||||
|
OPT( "--avcintra-flavor" )
|
||||||
|
suggest_list( x264_avcintra_flavor_names );
|
||||||
|
OPT( "--b-adapt" )
|
||||||
|
suggest_num_range( 0, 2 );
|
||||||
|
OPT( "--b-pyramid" )
|
||||||
|
suggest_list( x264_b_pyramid_names );
|
||||||
|
OPT( "--colormatrix" )
|
||||||
|
suggest_list( x264_colmatrix_names );
|
||||||
|
OPT( "--colorprim" )
|
||||||
|
suggest_list( x264_colorprim_names );
|
||||||
|
OPT( "--cqm" )
|
||||||
|
suggest_list( x264_cqm_names );
|
||||||
|
OPT( "--demuxer" )
|
||||||
|
suggest_list( x264_demuxer_names );
|
||||||
|
OPT( "--direct" )
|
||||||
|
suggest_list( x264_direct_pred_names );
|
||||||
|
OPT( "--frame-packing" )
|
||||||
|
suggest_num_range( 0, 7 );
|
||||||
|
OPT( "--input-csp" )
|
||||||
|
{
|
||||||
|
for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ )
|
||||||
|
suggest( x264_cli_csps[i].name );
|
||||||
|
#if HAVE_LAVF
|
||||||
|
for( const AVPixFmtDescriptor *d = NULL; (d = av_pix_fmt_desc_next( d )); )
|
||||||
|
suggest( d->name );
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
OPT( "--input-fmt" )
|
||||||
|
{
|
||||||
|
#if HAVE_LAVF
|
||||||
|
void *i = NULL;
|
||||||
|
for( const AVInputFormat *f; (f = av_demuxer_iterate( &i )); )
|
||||||
|
suggest_token( f->name, ',' );
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
OPT( "--input-range" )
|
||||||
|
suggest_list( x264_range_names );
|
||||||
|
OPT( "--level" )
|
||||||
|
suggest_list( level_names );
|
||||||
|
OPT( "--log-level" )
|
||||||
|
suggest_list( x264_log_level_names );
|
||||||
|
OPT( "--me" )
|
||||||
|
suggest_list( x264_motion_est_names );
|
||||||
|
OPT( "--muxer" )
|
||||||
|
suggest_list( x264_muxer_names );
|
||||||
|
OPT( "--nal-hrd" )
|
||||||
|
suggest_list( x264_nal_hrd_names );
|
||||||
|
OPT( "--output-csp" )
|
||||||
|
suggest_list( x264_output_csp_names );
|
||||||
|
OPT( "--output-depth" )
|
||||||
|
{
|
||||||
|
#if HAVE_BITDEPTH8
|
||||||
|
suggest( "8" );
|
||||||
|
#endif
|
||||||
|
#if HAVE_BITDEPTH10
|
||||||
|
suggest( "10" );
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
OPT( "--overscan" )
|
||||||
|
suggest_list( x264_overscan_names );
|
||||||
|
OPT2( "--partitions", "-A" )
|
||||||
|
suggest_list( x264_partition_names );
|
||||||
|
OPT2( "--pass", "-p" )
|
||||||
|
suggest_num_range( 1, 3 );
|
||||||
|
OPT( "--preset" )
|
||||||
|
suggest_list( x264_preset_names );
|
||||||
|
OPT( "--profile" )
|
||||||
|
suggest_list( x264_valid_profile_names );
|
||||||
|
OPT( "--pulldown" )
|
||||||
|
suggest_list( x264_pulldown_names );
|
||||||
|
OPT( "--range" )
|
||||||
|
suggest_list( x264_range_names );
|
||||||
|
OPT2( "--subme", "-m" )
|
||||||
|
suggest_num_range( 0, 11 );
|
||||||
|
OPT( "--transfer" )
|
||||||
|
suggest_list( x264_transfer_names );
|
||||||
|
OPT2( "--trellis", "-t" )
|
||||||
|
suggest_num_range( 0, 2 );
|
||||||
|
OPT( "--tune" )
|
||||||
|
suggest_list( x264_tune_names );
|
||||||
|
OPT( "--videoformat" )
|
||||||
|
suggest_list( x264_vidformat_names );
|
||||||
|
OPT( "--weightp" )
|
||||||
|
suggest_num_range( 0, 2 );
|
||||||
|
else if( !OPT_TYPE( nosuggest ) && !OPT_TYPE( special ) )
|
||||||
|
{
|
||||||
|
if( OPT_TYPE( filename ) || strncmp( cur, "--", 2 ) )
|
||||||
|
return 1; /* Fall back to default shell filename autocomplete. */
|
||||||
|
|
||||||
|
/* Suggest options. */
|
||||||
|
suggest_list( opts_suggest );
|
||||||
|
suggest_list( opts_nosuggest );
|
||||||
|
suggest_list( opts_filename );
|
||||||
|
suggest_list( opts_standalone );
|
||||||
|
|
||||||
|
/* Only suggest special options if no other options have been specified. */
|
||||||
|
if( !*prev )
|
||||||
|
suggest_list( opts_special );
|
||||||
|
}
|
||||||
|
|
||||||
|
putchar( '\n' );
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
56
common/aarch64/asm-offsets.c
Normal file
56
common/aarch64/asm-offsets.c
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* asm-offsets.c: check asm offsets for aarch64
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2014-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Janne Grunau <janne-x264@jannau.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common/common.h"
|
||||||
|
#include "asm-offsets.h"
|
||||||
|
|
||||||
|
/* Declares an int array named assert_<name> whose size is 1 when x is true
 * and -1 (a compile error) when x is false. */
#define STATIC_ASSERT(name, x) int assert_##name[2 * !!(x) - 1]

/* Fails to compile unless member m of type s lies at byte offset o. */
#define X264_CHECK_OFFSET(s, m, o) struct check_##s##_##m \
{ \
    STATIC_ASSERT(offset_##m, offsetof(s, m) == o); \
}

/* Fails to compile unless member b of type s starts exactly sizeof(type)
 * bytes after member a, i.e. b directly follows an a of that type. */
#define X264_CHECK_REL_OFFSET(s, a, type, b) struct check_##s##_##a##_##b \
{ \
    STATIC_ASSERT(rel_offset_##a##_##b, offsetof(s, a) + sizeof(type) == offsetof(s, b)); \
}
|
||||||
|
|
||||||
|
|
||||||
|
X264_CHECK_OFFSET(x264_cabac_t, i_low, CABAC_I_LOW);
|
||||||
|
X264_CHECK_OFFSET(x264_cabac_t, i_range, CABAC_I_RANGE);
|
||||||
|
X264_CHECK_OFFSET(x264_cabac_t, i_queue, CABAC_I_QUEUE);
|
||||||
|
X264_CHECK_OFFSET(x264_cabac_t, i_bytes_outstanding, CABAC_I_BYTES_OUTSTANDING);
|
||||||
|
X264_CHECK_OFFSET(x264_cabac_t, p_start, CABAC_P_START);
|
||||||
|
X264_CHECK_OFFSET(x264_cabac_t, p, CABAC_P);
|
||||||
|
X264_CHECK_OFFSET(x264_cabac_t, p_end, CABAC_P_END);
|
||||||
|
X264_CHECK_OFFSET(x264_cabac_t, f8_bits_encoded, CABAC_F8_BITS_ENCODED);
|
||||||
|
X264_CHECK_OFFSET(x264_cabac_t, state, CABAC_STATE);
|
||||||
|
|
||||||
|
// the aarch64 asm makes following additional assumptions about the x264_cabac_t
|
||||||
|
// memory layout
|
||||||
|
|
||||||
|
X264_CHECK_REL_OFFSET(x264_cabac_t, i_low, int, i_range);
|
||||||
|
X264_CHECK_REL_OFFSET(x264_cabac_t, i_queue, int, i_bytes_outstanding);
|
||||||
39
common/aarch64/asm-offsets.h
Normal file
39
common/aarch64/asm-offsets.h
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* asm-offsets.h: asm offsets for aarch64
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2014-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Janne Grunau <janne-x264@jannau.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_AARCH64_ASM_OFFSETS_H
#define X264_AARCH64_ASM_OFFSETS_H

/* Byte offsets of x264_cabac_t members for use from assembly.
 * asm-offsets.c verifies each value against offsetof() at compile time. */
#define CABAC_I_LOW                 0x00
#define CABAC_I_RANGE               0x04
#define CABAC_I_QUEUE               0x08
#define CABAC_I_BYTES_OUTSTANDING   0x0c
#define CABAC_P_START               0x10
#define CABAC_P                     0x18
#define CABAC_P_END                 0x20
#define CABAC_F8_BITS_ENCODED       0x30
#define CABAC_STATE                 0x34

#endif
|
||||||
291
common/aarch64/asm.S
Normal file
291
common/aarch64/asm.S
Normal file
@@ -0,0 +1,291 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* asm.S: AArch64 utility macros
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2008-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Mans Rullgard <mans@mansr.com>
|
||||||
|
* David Conrad <lessen42@gmail.com>
|
||||||
|
* Janne Grunau <janne-x264@jannau.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "config.h"
|
||||||
|
|
||||||
|
/* Token pasting with one level of indirection so that macro arguments are
 * expanded before being joined. */
#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)

/* Symbol prefixes: when PREFIX is defined every symbol gets a leading
 * underscore. */
#ifdef PREFIX
#   define BASE _x264_
#   define SYM_PREFIX _
#else
#   define BASE x264_
#   define SYM_PREFIX
#endif

/* Prefix for exported asm functions: BASE, followed by "<BIT_DEPTH>_" when
 * BIT_DEPTH is defined. */
#ifdef BIT_DEPTH
#   define EXTERN_ASM JOIN(JOIN(BASE, BIT_DEPTH), _)
#else
#   define EXTERN_ASM BASE
#endif

/* X(s):    s with the exported-asm prefix (bit-depth specific if applicable)
 * X264(s): s with the plain BASE prefix
 * EXT(s):  s with only the platform symbol prefix */
#define X(s) JOIN(EXTERN_ASM, s)
#define X264(s) JOIN(BASE, s)
#define EXT(s) JOIN(SYM_PREFIX, s)
|
||||||
|
|
||||||
|
/* Line prefixes for directives that only apply to some targets. Each expands
 * to nothing where the directive is wanted and to `#` otherwise.
 * NOTE(review): presumably the leading `#` makes the assembler treat the rest
 * of the line as a comment - confirm for every supported assembler. */
#ifdef __ELF__
#   define ELF
#else
#   define ELF #
#endif

#ifdef __MACH__
#   define MACH
#else
#   define MACH #
#endif

#if HAVE_AS_FUNC
#   define FUNC
#else
#   define FUNC #
#endif
|
||||||
|
|
||||||
|
/* Base architecture level; AS_ARCH_LEVEL is not defined in this file
 * (presumably supplied by config.h). */
.arch AS_ARCH_LEVEL

/* ENABLE_x / DISABLE_x switch an ISA extension on or off with
 * .arch_extension. They expand to nothing when the corresponding
 * HAVE_AS_ARCHEXT_*_DIRECTIVE flag is not set. */
#if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE
#define ENABLE_DOTPROD  .arch_extension dotprod
#define DISABLE_DOTPROD .arch_extension nodotprod
#else
#define ENABLE_DOTPROD
#define DISABLE_DOTPROD
#endif
#if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE
#define ENABLE_I8MM  .arch_extension i8mm
#define DISABLE_I8MM .arch_extension noi8mm
#else
#define ENABLE_I8MM
#define DISABLE_I8MM
#endif
#if HAVE_AS_ARCHEXT_SVE_DIRECTIVE
#define ENABLE_SVE  .arch_extension sve
#define DISABLE_SVE .arch_extension nosve
#else
#define ENABLE_SVE
#define DISABLE_SVE
#endif
#if HAVE_AS_ARCHEXT_SVE2_DIRECTIVE
#define ENABLE_SVE2  .arch_extension sve2
#define DISABLE_SVE2 .arch_extension nosve2
#else
#define ENABLE_SVE2
#define DISABLE_SVE2
#endif
|
||||||
|
|
||||||
|
/* If we do support the .arch_extension directives, disable support for all
|
||||||
|
* the extensions that we may use, in case they were implicitly enabled by
|
||||||
|
* the .arch level. This makes it clear if we try to assemble an instruction
|
||||||
|
 * from an unintended extension set; we only allow assembling such instructions
|
||||||
|
* within regions where we explicitly enable those extensions. */
|
||||||
|
/* Start with every optional extension switched off. */
DISABLE_DOTPROD
DISABLE_I8MM
DISABLE_SVE
DISABLE_SVE2
|
||||||
|
|
||||||
|
/* function name, export=0, align=2
 *
 * Opens a function in .text, aligned with `.align align`, and emits its label.
 * With export set, the label is EXTERN_ASM<name> and is declared .global;
 * otherwise the label is the bare name and stays local to the file.
 * A one-shot `endfunc` macro is defined as a side effect: it emits the ELF
 * .size for the same symbol and then purges itself, so every `function`
 * must be closed by exactly one `endfunc`. */
.macro function name, export=0, align=2
    .macro endfunc
.if \export
ELF     .size   EXTERN_ASM\name, . - EXTERN_ASM\name
.else
ELF     .size   \name, . - \name
.endif
FUNC    .endfunc
        .purgem endfunc
    .endm
        .text
        .align          \align
    .if \export
        .global EXTERN_ASM\name
ELF     .type   EXTERN_ASM\name, %function
FUNC    .func   EXTERN_ASM\name
EXTERN_ASM\name:
    .else
ELF     .type   \name, %function
FUNC    .func   \name
\name:
    .endif
.endm
|
||||||
|
|
||||||
|
/* const name, align=2
 *
 * Opens a data object: selects .rodata on ELF or .const_data on Mach-O,
 * applies `.align align` and emits the label. A one-shot `endconst` macro is
 * defined as a side effect: it emits the ELF .size and then purges itself. */
.macro const name, align=2
    .macro endconst
ELF     .size   \name, . - \name
        .purgem endconst
    .endm
ELF     .section        .rodata
MACH    .const_data
        .align          \align
\name:
.endm
|
||||||
|
|
||||||
|
/* movrel rd, val, offset=0
 *
 * Loads the address of symbol `val` plus `offset` into register `rd`.
 *   Apple:            adrp/add with @PAGE / @PAGEOFF
 *   PIC on Windows:   adrp/add with :lo12:
 *   other PIC:        adrp/add with :lo12:
 *   non-PIC:          literal load (ldr rd, =val+offset)
 * On Apple and PIC Windows a negative offset is not folded into the symbol
 * expression; the symbol address is formed first and the magnitude of the
 * offset is subtracted afterwards.
 * NOTE(review): presumably because negative addends cannot be expressed in
 * the relocations used there - confirm. */
.macro  movrel rd, val, offset=0
#if defined(__APPLE__)
  .if \offset < 0
        adrp        \rd, \val@PAGE
        add         \rd, \rd, \val@PAGEOFF
        sub         \rd, \rd, -(\offset)
  .else
        adrp        \rd, \val+(\offset)@PAGE
        add         \rd, \rd, \val+(\offset)@PAGEOFF
  .endif
#elif defined(PIC) && defined(_WIN32)
  .if \offset < 0
        adrp        \rd, \val
        add         \rd, \rd, :lo12:\val
        sub         \rd, \rd, -(\offset)
  .else
        adrp        \rd, \val+(\offset)
        add         \rd, \rd, :lo12:\val+(\offset)
  .endif
#elif defined(PIC)
        adrp        \rd, \val+(\offset)
        add         \rd, \rd, :lo12:\val+(\offset)
#else
        ldr         \rd, =\val+\offset
#endif
.endm
|
||||||
|
|
||||||
|
/* Stride constants shared by the AArch64 assembly.
 * NOTE(review): presumably the row strides of the decoded (FDEC) and encoded
 * (FENC) macroblock buffers; units are not visible here - confirm. */
#define FDEC_STRIDE 32
#define FENC_STRIDE 16
|
||||||
|
|
||||||
|
|
||||||
|
/* SUMSUB_AB sum, sub, a, b:  sum = a + b,  sub = a - b.
 * sum is written first, so it must not alias a or b. */
.macro SUMSUB_AB sum, sub, a, b
    add         \sum,  \a,  \b
    sub         \sub,  \a,  \b
.endm
|
||||||
|
|
||||||
|
/* unzip t1, t2, s1, s2:  t1 = uzp1(s1, s2),  t2 = uzp2(s1, s2).
 * t1 is written first, so it must not alias s1 or s2. */
.macro unzip t1, t2, s1, s2
    uzp1        \t1,  \s1,  \s2
    uzp2        \t2,  \s1,  \s2
.endm
|
||||||
|
|
||||||
|
/* transpose t1, t2, s1, s2:  t1 = trn1(s1, s2),  t2 = trn2(s1, s2).
 * t1 is written first, so it must not alias s1 or s2. */
.macro transpose t1, t2, s1, s2
    trn1        \t1,  \s1,  \s2
    trn2        \t2,  \s1,  \s2
.endm
|
||||||
|
|
||||||
|
/* transpose4x4.h v0, v1, v2, v3, t0, t1, t2, t3
 * Transposes the 4x4 matrix of halfwords whose rows are the low 64 bits of
 * v0..v3: first 32-bit pairs are interleaved into t0..t3, then 16-bit
 * elements back into v0..v3. Arguments are bare register names (for example
 * v16); t0..t3 are clobbered. */
.macro transpose4x4.h v0, v1, v2, v3, t0, t1, t2, t3
    transpose   \t0\().2s,  \t2\().2s,  \v0\().2s,  \v2\().2s
    transpose   \t1\().2s,  \t3\().2s,  \v1\().2s,  \v3\().2s
    transpose   \v0\().4h,  \v1\().4h,  \t0\().4h,  \t1\().4h
    transpose   \v2\().4h,  \v3\().4h,  \t2\().4h,  \t3\().4h
.endm
|
||||||
|
|
||||||
|
/* transpose4x8.h v0, v1, v2, v3, t0, t1, t2, t3
 * Full-width (128-bit) variant of transpose4x4.h: the same two interleave
 * steps on .4s and .8h, so each 64-bit half of v0..v3 is transposed as its
 * own 4x4 halfword block. t0..t3 are clobbered. */
.macro transpose4x8.h v0, v1, v2, v3, t0, t1, t2, t3
    transpose   \t0\().4s,  \t2\().4s,  \v0\().4s,  \v2\().4s
    transpose   \t1\().4s,  \t3\().4s,  \v1\().4s,  \v3\().4s
    transpose   \v0\().8h,  \v1\().8h,  \t0\().8h,  \t1\().8h
    transpose   \v2\().8h,  \v3\().8h,  \t2\().8h,  \t3\().8h
.endm
|
||||||
|
|
||||||
|
|
||||||
|
/* transpose8x8.h r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
 * Transposes the 8x8 matrix of halfwords held in r0..r7 in place, in three
 * interleave stages (16-bit, 32-bit, 64-bit elements). r8 and r9 are scratch
 * registers and are clobbered. The statement order matters: registers are
 * reused as temporaries as soon as their old contents are dead. */
.macro transpose8x8.h r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
    trn1        \r8\().8h,  \r0\().8h,  \r1\().8h
    trn2        \r9\().8h,  \r0\().8h,  \r1\().8h
    trn1        \r1\().8h,  \r2\().8h,  \r3\().8h
    trn2        \r3\().8h,  \r2\().8h,  \r3\().8h
    trn1        \r0\().8h,  \r4\().8h,  \r5\().8h
    trn2        \r5\().8h,  \r4\().8h,  \r5\().8h
    trn1        \r2\().8h,  \r6\().8h,  \r7\().8h
    trn2        \r7\().8h,  \r6\().8h,  \r7\().8h

    trn1        \r4\().4s,  \r0\().4s,  \r2\().4s
    trn2        \r2\().4s,  \r0\().4s,  \r2\().4s
    trn1        \r6\().4s,  \r5\().4s,  \r7\().4s
    trn2        \r7\().4s,  \r5\().4s,  \r7\().4s
    trn1        \r5\().4s,  \r9\().4s,  \r3\().4s
    trn2        \r9\().4s,  \r9\().4s,  \r3\().4s
    trn1        \r3\().4s,  \r8\().4s,  \r1\().4s
    trn2        \r8\().4s,  \r8\().4s,  \r1\().4s

    trn1        \r0\().2d,  \r3\().2d,  \r4\().2d
    trn2        \r4\().2d,  \r3\().2d,  \r4\().2d

    trn1        \r1\().2d,  \r5\().2d,  \r6\().2d
    trn2        \r5\().2d,  \r5\().2d,  \r6\().2d

    trn2        \r6\().2d,  \r8\().2d,  \r2\().2d
    trn1        \r2\().2d,  \r8\().2d,  \r2\().2d

    trn1        \r3\().2d,  \r9\().2d,  \r7\().2d
    trn2        \r7\().2d,  \r9\().2d,  \r7\().2d
.endm
|
||||||
|
|
||||||
|
/* transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
 * Byte transpose of the eight 16-byte registers r0..r7 in place, in three
 * interleave stages (8-bit, 16-bit, 32-bit elements). There is no 64-bit
 * stage, so each 64-bit half is transposed as an independent 8x8 byte block.
 * t0 and t1 are scratch registers and are clobbered. The statement order
 * matters: registers are reused as soon as their old contents are dead. */
.macro transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
    trn1        \t0\().16b, \r0\().16b, \r1\().16b
    trn2        \t1\().16b, \r0\().16b, \r1\().16b
    trn1        \r1\().16b, \r2\().16b, \r3\().16b
    trn2        \r3\().16b, \r2\().16b, \r3\().16b
    trn1        \r0\().16b, \r4\().16b, \r5\().16b
    trn2        \r5\().16b, \r4\().16b, \r5\().16b
    trn1        \r2\().16b, \r6\().16b, \r7\().16b
    trn2        \r7\().16b, \r6\().16b, \r7\().16b

    trn1        \r4\().8h,  \r0\().8h,  \r2\().8h
    trn2        \r2\().8h,  \r0\().8h,  \r2\().8h
    trn1        \r6\().8h,  \r5\().8h,  \r7\().8h
    trn2        \r7\().8h,  \r5\().8h,  \r7\().8h
    trn1        \r5\().8h,  \t1\().8h,  \r3\().8h
    trn2        \t1\().8h,  \t1\().8h,  \r3\().8h
    trn1        \r3\().8h,  \t0\().8h,  \r1\().8h
    trn2        \t0\().8h,  \t0\().8h,  \r1\().8h

    trn1        \r0\().4s,  \r3\().4s,  \r4\().4s
    trn2        \r4\().4s,  \r3\().4s,  \r4\().4s

    trn1        \r1\().4s,  \r5\().4s,  \r6\().4s
    trn2        \r5\().4s,  \r5\().4s,  \r6\().4s

    trn2        \r6\().4s,  \t0\().4s,  \r2\().4s
    trn1        \r2\().4s,  \t0\().4s,  \r2\().4s

    trn1        \r3\().4s,  \t1\().4s,  \r7\().4s
    trn2        \r7\().4s,  \t1\().4s,  \r7\().4s
.endm
|
||||||
|
|
||||||
|
/* transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7
 * Byte transpose of the four 16-byte registers r0..r3 in place, in two
 * interleave stages (8-bit, then 16-bit elements), so each 32-bit group of
 * lanes is transposed as an independent 4x4 byte block.
 * t4..t7 are scratch registers and are clobbered. */
.macro transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7
    trn1        \t4\().16b, \r0\().16b, \r1\().16b
    trn2        \t5\().16b, \r0\().16b, \r1\().16b
    trn1        \t6\().16b, \r2\().16b, \r3\().16b
    trn2        \t7\().16b, \r2\().16b, \r3\().16b

    trn1        \r0\().8h,  \t4\().8h,  \t6\().8h
    trn2        \r2\().8h,  \t4\().8h,  \t6\().8h
    trn1        \r1\().8h,  \t5\().8h,  \t7\().8h
    trn2        \r3\().8h,  \t5\().8h,  \t7\().8h
.endm
|
||||||
|
|
||||||
|
/* transpose_4x8.b r0, r1, r2, r3, t4, t5, t6, t7
 * 64-bit variant of transpose_4x16.b: the same two interleave stages on .8b
 * and .4h, operating on the low 8 bytes of r0..r3.
 * t4..t7 are scratch registers and are clobbered. */
.macro transpose_4x8.b r0, r1, r2, r3, t4, t5, t6, t7
    trn1        \t4\().8b,  \r0\().8b,  \r1\().8b
    trn2        \t5\().8b,  \r0\().8b,  \r1\().8b
    trn1        \t6\().8b,  \r2\().8b,  \r3\().8b
    trn2        \t7\().8b,  \r2\().8b,  \r3\().8b

    trn1        \r0\().4h,  \t4\().4h,  \t6\().4h
    trn2        \r2\().4h,  \t4\().4h,  \t6\().4h
    trn1        \r1\().4h,  \t5\().4h,  \t7\().4h
    trn2        \r3\().4h,  \t5\().4h,  \t7\().4h
.endm
|
||||||
82
common/aarch64/bitstream-a.S
Normal file
82
common/aarch64/bitstream-a.S
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* bitstream-a.S: aarch64 bitstream functions
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2014-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Janne Grunau <janne-x264@jannau.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "asm.S"
|
||||||
|
|
||||||
|
// uint8_t *nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end )
//
// Copies the bytes in [src, end) to dst, writing an extra 0x03 byte in front
// of every byte <= 3 that comes directly after two zero bytes.
// Returns the advanced dst pointer in x0.
//
// In:    x0 = dst, x1 = src, x2 = end
// Out:   x0 = dst after the last byte written
// Clobb: x1, w3-w7, w9, v0-v7, flags
//
// Register roles:
//   w3  constant 3, the inserted escape byte
//   w4  current input byte (scalar path)
//   w5  history of the bytes written so far, newest in bits 0-7
//   x6  negative offset from x1 to the next byte to process
//   v0  previous 16-byte block (initially all 0xff); lanes 14 and 15 carry
//       the two-byte history between the scalar and the vector path
//   v1  current 16-byte block
//   v4  constant 4 in every byte lane
function nal_escape_neon, export=1
    movi        v0.16b,  #0xff
    movi        v4.16b,  #4
    mov         w3,  #3
    subs        x6,  x1,  x2                // x6 = src - end (<= 0)
    cbz         x6,  99f                    // empty input
0:
    cmn         x6,  #15
    b.lt        16f                         // at least 16 bytes left: vector path
    mov         x1,  x2                     // tail: index from end with negative x6
    b           100f
16:
    ld1         {v1.16b}, [x1], #16
    ext         v2.16b,  v0.16b,  v1.16b, #14   // byte stream shifted by two
    ext         v3.16b,  v0.16b,  v1.16b, #15   // byte stream shifted by one
    cmhi        v7.16b,  v4.16b,  v1.16b        // current byte < 4
    cmeq        v5.16b,  v2.16b,  #0            // byte two back == 0
    cmeq        v6.16b,  v3.16b,  #0            // byte one back == 0
    and         v5.16b,  v5.16b,  v7.16b
    and         v5.16b,  v5.16b,  v6.16b
    shrn        v7.8b,   v5.8h,   #4            // narrow the lane mask to 64 bits
    mov         x7,  v7.d[0]
    cbz         x7,  16f                    // nothing to escape: store block as is
    mov         x6,  #-16                   // redo these 16 bytes one at a time
100:
    umov        w5,  v0.b[14]
    umov        w4,  v0.b[15]
    orr         w5,  w4,  w5,  lsl #8       // seed history with the previous two bytes
101:
    ldrb        w4,  [x1, x6]
    orr         w9,  w4,  w5,  lsl #16      // (two history bytes << 16) | current byte
    cmp         w9,  #3
    b.hi        102f                        // no escape unless 0x00 0x00 then <= 3
    strb        w3,  [x0], #1               // insert the 0x03 byte
    orr         w5,  w3,  w5,  lsl #8
102:
    adds        x6,  x6,  #1                // sets the flags tested by b.lt below
    strb        w4,  [x0], #1
    orr         w5,  w4,  w5,  lsl #8
    b.lt        101b
    subs        x6,  x1,  x2
    lsr         w9,  w5,  #8
    mov         v0.b[14], w9                // hand the history back to the vector path
    mov         v0.b[15], w5
    b.lt        0b

    ret
16:
    subs        x6,  x1,  x2
    st1         {v1.16b}, [x0], #16
    mov         v0.16b, v1.16b
    b.lt        0b
99:
    ret
endfunc
|
||||||
32
common/aarch64/bitstream.h
Normal file
32
common/aarch64/bitstream.h
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* bitstream.h: aarch64 bitstream functions
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2017-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_AARCH64_BITSTREAM_H
|
||||||
|
#define X264_AARCH64_BITSTREAM_H
|
||||||
|
|
||||||
|
#define x264_nal_escape_neon x264_template(nal_escape_neon)
|
||||||
|
uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end );
|
||||||
|
|
||||||
|
#endif
|
||||||
131
common/aarch64/cabac-a.S
Normal file
131
common/aarch64/cabac-a.S
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* cabac-a.S: aarch64 cabac
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2014-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Janne Grunau <janne-x264@jannau.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "asm.S"
|
||||||
|
#include "asm-offsets.h"
|
||||||
|
|
||||||
|
// w11 holds x264_cabac_t.i_low
|
||||||
|
// w12 holds x264_cabac_t.i_range
|
||||||
|
|
||||||
|
// cabac_encode_decision_asm
//   x0 = base of the cabac context (fields addressed through CABAC_* offsets)
//   w1 = context index, added to CABAC_STATE to reach the state byte
//   w2 = decision bit b
// Register roles kept across the shared tails below:
//   w11 = i_low, w12 = i_range, w2 = i_queue (from cabac_encode_renorm on)
// Clobbers w3-w10, w13, w14 and the flags.
// The labels cabac_encode_renorm and cabac_putbyte are plain (non numeric)
// labels so that the bypass and terminal encoders can branch into them.
function cabac_encode_decision_asm, export=1
    ldr         w12, [x0,  #CABAC_I_RANGE]
    add         w10, w1,  #CABAC_STATE          // byte offset of the state entry
    ldrb        w3,  [x0,  w10, uxtw]           // i_state
    movrel      x9,  X264(cabac_transition)
    movrel      x8,  X264(cabac_range_lps), -4  // table base biased by -4 bytes
    ldr         w11, [x0,  #CABAC_I_LOW]
    ubfx        x4,  x3,  #1, #7                // i_state >> 1
    asr         w5,  w12, #6                    // i_range >> 6
    orr         w14, w2,  w3, lsl #1            // (i_state << 1) | b
    eor         w6,  w2,  w3                    // b ^ i_state
    add         x8,  x8,  x4, lsl #2            // 4 table bytes per (i_state >> 1)
    ldrb        w9,  [x9,  w14, uxtw]           // next state
    ldrb        w4,  [x8,  w5, uxtw]            // i_range_lps
    sub         w12, w12, w4                    // i_range - i_range_lps
    add         w7,  w11, w12                   // i_low + reduced i_range
    tst         w6,  #1                         // (b ^ i_state) & 1
    csel        w12, w4,  w12, ne               // bit set: i_range = i_range_lps
    csel        w11, w7,  w11, ne               // bit set: i_low += reduced range
    strb        w9,  [x0,  w10, uxtw]           // write back i_state

// Renormalisation: shift i_low and i_range left by clz(i_range) - 23 and
// add the same amount to i_queue. A non negative queue means a byte is due.
cabac_encode_renorm:
    clz         w5,  w12
    ldr         w2,  [x0,  #CABAC_I_QUEUE]
    sub         w5,  w5,  #23
    lsl         w12, w12, w5
    lsl         w11, w11, w5
    adds        w2,  w2,  w5
    b.ge        cabac_putbyte

    stp         w11, w12, [x0, #CABAC_I_LOW]    // store i_low, i_range
    str         w2,  [x0,  #CABAC_I_QUEUE]
    ret

// Byte output. Expects w11 = i_low, w12 = i_range, w2 = i_queue (>= 0).
//   out = i_low >> (i_queue + 10), those bits are then cleared from i_low
//   and i_queue drops by 8.
//   out == 0xff : only i_bytes_outstanding is incremented.
//   otherwise   : carry = out >> 8 is added to the byte before p, every
//                 outstanding byte is written as carry - 1, then the low
//                 byte of out is written and p is stored back.
.align 5
cabac_putbyte:
    add         w14, w2,  #10                   // shift amount i_queue + 10
    mov         w13, #-1
    ldr         w6,  [x0,  #CABAC_I_BYTES_OUTSTANDING]
    sub         w2,  w2,  #8
    lsl         w13, w13, w14                   // mask of the bits moved to out
    asr         w4,  w11, w14                   // out
    bic         w11, w11, w13                   // keep only the low bits of i_low
    subs        w5,  w4,  #0xff
    cinc        w6,  w6,  eq                    // out == 0xff: one more outstanding
    b.eq        0f

1:
    ldr         x7,  [x0,  #CABAC_P]
    ldurb       w8,  [x7,  #-1]
    asr         w5,  w4,  #8                    // carry
    add         w8,  w8,  w5
    sturb       w8,  [x7,  #-1]                 // propagate carry into previous byte
    sub         w5,  w5,  #1                    // fill value carry - 1
    cbz         w6,  3f
2:
    subs        w6,  w6,  #1                    // strb leaves the flags untouched
    strb        w5,  [x7], #1
    b.gt        2b
3:
    strb        w4,  [x7], #1                   // low byte of out
    str         x7,  [x0,  #CABAC_P]
0:
    stp         w11, w12, [x0, #CABAC_I_LOW]    // store i_low, i_range
    stp         w2,  w6,  [x0, #CABAC_I_QUEUE]  // store i_queue, i_bytes_outstanding
    ret
endfunc
|
||||||
|
|
||||||
|
// cabac_encode_bypass_asm
//   x0 = base of the cabac context, w1 = value that is ANDed with i_range
//   (NOTE(review): presumably 0 or -1 so the AND yields 0 or i_range -
//   confirm against the C caller).
// Computes i_low = (i_low << 1) + (w1 & i_range) and bumps i_queue by one.
// When the queue becomes non negative the shared cabac_putbyte tail is
// entered with w11 = i_low, w12 = i_range, w2 = i_queue.
function cabac_encode_bypass_asm, export=1, align=5
    ldr         w11, [x0, #CABAC_I_LOW]
    ldr         w2,  [x0, #CABAC_I_QUEUE]
    ldr         w12, [x0, #CABAC_I_RANGE]
    and         w1,  w1,  w12
    add         w11, w1,  w11, lsl #1
    adds        w2,  w2,  #1
    b.ge        cabac_putbyte
    // no byte due yet: i_range is unchanged, only low and queue are stored
    str         w11, [x0, #CABAC_I_LOW]
    str         w2,  [x0, #CABAC_I_QUEUE]
    ret
endfunc
|
||||||
|
|
||||||
|
// cabac_encode_terminal_asm
//   x0 = base of the cabac context
// Subtracts 2 from i_range. If bit 8 of the result is still set only the
// new range is stored. Otherwise i_low and i_range are doubled, i_queue is
// incremented and, once the queue is non negative, the shared
// cabac_putbyte tail finishes the job (w11 = i_low, w12 = i_range,
// w2 = i_queue).
function cabac_encode_terminal_asm, export=1, align=5
    ldr         w12, [x0, #CABAC_I_RANGE]
    sub         w12, w12, #2
    tbz         w12, #8, 1f

    // fast path: no renormalisation required
    str         w12, [x0, #CABAC_I_RANGE]
    ret
1:
    ldr         w11, [x0, #CABAC_I_LOW]
    ldr         w2,  [x0, #CABAC_I_QUEUE]
    lsl         w11, w11, #1
    lsl         w12, w12, #1
    adds        w2,  w2,  #1
    b.ge        cabac_putbyte

    stp         w11, w12, [x0, #CABAC_I_LOW]    // store i_low, i_range
    str         w2,  [x0, #CABAC_I_QUEUE]
    ret
endfunc
|
||||||
40
common/aarch64/dct-a-common.S
Normal file
40
common/aarch64/dct-a-common.S
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
/****************************************************************************
|
||||||
|
* dct-a-common.S: aarch64 transform and zigzag
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
* Janne Grunau <janne-x264@jannau.net>
|
||||||
|
* David Chen <david.chen@myais.com.cn>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
// This file contains the NEON macros that are intended to be used by
|
||||||
|
// the SVE/SVE2 functions as well
|
||||||
|
|
||||||
|
// DCT_1D: one dimensional 4 point forward transform.
//   inputs : v4 v5 v6 v7 (all four are overwritten)
//   outputs: v0 v1 v2 v3
// Assuming SUMSUB_AB sum, sub, a, b yields sum = a + b and sub = a - b
// (defined outside this file - confirm), the results are
//   v0 = (in0 + in3) + (in1 + in2)
//   v1 = 2*(in0 - in3) + (in1 - in2)
//   v2 = (in0 + in3) - (in1 + in2)
//   v3 = (in0 - in3) - 2*(in1 - in2)
// The instruction order matters: v1 is read by the v0/v2 computations
// before it is overwritten with its final value.
.macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7
    SUMSUB_AB   \v1, \v6, \v5, \v6
    SUMSUB_AB   \v3, \v7, \v4, \v7
    add         \v0, \v3, \v1
    add         \v4, \v7, \v7
    add         \v5, \v6, \v6
    sub         \v2, \v3, \v1
    add         \v1, \v4, \v6
    sub         \v3, \v7, \v5
.endm
|
||||||
88
common/aarch64/dct-a-sve.S
Normal file
88
common/aarch64/dct-a-sve.S
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
/****************************************************************************
|
||||||
|
* dct-a-sve.S: aarch64 transform and zigzag
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Chen <david.chen@myais.com.cn>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "asm.S"
|
||||||
|
#include "dct-a-common.S"
|
||||||
|
|
||||||
|
ENABLE_SVE
|
||||||
|
|
||||||
|
// sub4x4_dct_sve
//   x0 = output, receives four 4h vectors (16 coefficients)
//   x1 = first pixel block, rows FENC_STRIDE bytes apart
//   x2 = second pixel block, rows FDEC_STRIDE bytes apart
// Each row is loaded with a predicated ld1b that widens 4 bytes into
// 16 bit lanes, the rows are subtracted, and the difference goes through
// DCT_1D, a 4x4 transpose and DCT_1D again.
// On exit x1 and x2 have been advanced by three rows.
function sub4x4_dct_sve, export=1
    ptrue       p0.h, vl4                       // four active halfword lanes
    mov         x3, #FENC_STRIDE
    mov         x4, #FDEC_STRIDE

    // row 0
    ld1b        {z0.h}, p0/z, [x1]
    ld1b        {z1.h}, p0/z, [x2]
    add         x1, x1, x3
    add         x2, x2, x4
    sub         v16.4h, v0.4h, v1.4h
    // row 1
    ld1b        {z2.h}, p0/z, [x1]
    ld1b        {z3.h}, p0/z, [x2]
    add         x1, x1, x3
    add         x2, x2, x4
    sub         v17.4h, v2.4h, v3.4h
    // row 2
    ld1b        {z4.h}, p0/z, [x1]
    ld1b        {z5.h}, p0/z, [x2]
    add         x1, x1, x3
    add         x2, x2, x4
    sub         v18.4h, v4.4h, v5.4h
    // row 3
    ld1b        {z6.h}, p0/z, [x1]
    ld1b        {z7.h}, p0/z, [x2]
    sub         v19.4h, v6.4h, v7.4h

    DCT_1D      v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
    transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
    DCT_1D      v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
    st1         {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
    ret
endfunc
|
||||||
|
|
||||||
|
// zigzag_interleave_8x8_cavlc_sve
//   x0 = destination coefficients (128 bytes written, pointer advanced)
//   x1 = source coefficients (128 bytes read, pointer advanced)
//   x2 = flag bytes: offsets 0, 1, 8 and 9 are written
// Two ld4 loads de-interleave the 64 halfwords into four groups (every
// fourth coefficient). The groups are stored back to back, and an unsigned
// max reduction produces one 0/1 flag per 32 bit lane telling whether any
// contributing coefficient was non zero.
function zigzag_interleave_8x8_cavlc_sve, export=1
    mov         z31.s, #1
    ptrue       p2.s, vl2                       // two active word lanes
    ld4         {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64
    ld4         {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64

    // store the four de-interleaved groups, 32 bytes each
    st1         {v0.8h}, [x0], #16
    st1         {v4.8h}, [x0], #16
    st1         {v1.8h}, [x0], #16
    st1         {v5.8h}, [x0], #16
    st1         {v2.8h}, [x0], #16
    st1         {v6.8h}, [x0], #16
    st1         {v3.8h}, [x0], #16
    st1         {v7.8h}, [x0], #16

    // reduce to one word per flag
    umax        v16.8h, v0.8h,  v4.8h
    umax        v17.8h, v1.8h,  v5.8h
    umax        v18.8h, v2.8h,  v6.8h
    umax        v19.8h, v3.8h,  v7.8h
    umaxp       v16.8h, v16.8h, v17.8h
    umaxp       v18.8h, v18.8h, v19.8h
    umaxp       v16.8h, v16.8h, v18.8h
    cmhs        v16.4s, v16.4s, v31.4s          // all ones where the word is >= 1
    and         v16.16b, v16.16b, v31.16b       // turn the mask into 0 or 1

    // low byte of word lanes 0 and 1, then of lanes 2 and 3 at offset 8
    st1b        {z16.s}, p2, [x2]
    add         x2, x2, #8
    mov         v16.d[0], v16.d[1]
    st1b        {z16.s}, p2, [x2]
    ret
endfunc
|
||||||
90
common/aarch64/dct-a-sve2.S
Normal file
90
common/aarch64/dct-a-sve2.S
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
/****************************************************************************
|
||||||
|
* dct-a-sve2.S: aarch64 transform and zigzag
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Chen <david.chen@myais.com.cn>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "asm.S"
|
||||||
|
#include "dct-a-common.S"
|
||||||
|
|
||||||
|
ENABLE_SVE
|
||||||
|
ENABLE_SVE2
|
||||||
|
|
||||||
|
// add4x4_idct_sve2
//   x0 = destination pixels, rows FDEC_STRIDE bytes apart (read and written)
//   x1 = 16 input coefficients (16 bit each)
// Performs two 1-D inverse transform passes with a transpose in between,
// rounds with a shift by 6, adds the existing pixels and stores 4 saturated
// bytes per row.
// Rows 2 and 3 live in swapped registers after the transform: v3 carries
// row 2 and v2 carries row 3, which is why the pixel rows are loaded into
// z31/z30 and stored from z3/z2 in that order.
// On exit x0 points at the last of the four rows.
function add4x4_idct_sve2, export=1
    mov         x2, #FDEC_STRIDE
    mov         x11, x0                         // read cursor over the pixel rows
    ptrue       p0.h, vl8
    ptrue       p1.h, vl4
    ld1         {v0.8h, v1.8h}, [x1]

    // fetch the four destination rows up front, widened to 16 bit lanes
    ld1b        {z28.h}, p0/z, [x11]
    add         x11, x11, x2
    ld1b        {z29.h}, p0/z, [x11]
    add         x11, x11, x2
    ld1b        {z31.h}, p0/z, [x11]
    add         x11, x11, x2
    ld1b        {z30.h}, p0/z, [x11]

    // first pass, computed on full 8h vectors
    SUMSUB_AB   v4.8h, v5.8h, v0.8h, v1.8h
    sshr        v7.8h, v0.8h, #1
    sshr        v6.8h, v1.8h, #1
    sub         v7.8h, v7.8h, v1.8h
    add         v6.8h, v6.8h, v0.8h
    mov         v7.d[0], v7.d[1]                // bring the upper halves down
    mov         v6.d[0], v6.d[1]
    SUMSUB_AB   v0.8h, v2.8h, v4.8h, v6.8h
    SUMSUB_AB   v1.8h, v3.8h, v5.8h, v7.8h

    transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19

    // second pass on 4h vectors
    SUMSUB_AB   v4.4h, v5.4h, v0.4h, v3.4h
    sshr        v7.4h, v1.4h, #1
    sshr        v6.4h, v2.4h, #1
    sub         v7.4h, v7.4h, v2.4h
    add         v6.4h, v6.4h, v1.4h
    SUMSUB_AB   v0.4h, v2.4h, v4.4h, v6.4h
    SUMSUB_AB   v1.4h, v3.4h, v5.4h, v7.4h

    // rounding shift on the four active lanes
    srshr       z0.h, p1/m, z0.h, #6
    srshr       z1.h, p1/m, z1.h, #6
    srshr       z2.h, p1/m, z2.h, #6
    srshr       z3.h, p1/m, z3.h, #6

    // add the prediction and narrow with unsigned saturation
    add         v0.8h, v0.8h, v28.8h
    add         v1.8h, v1.8h, v29.8h
    add         v2.8h, v2.8h, v30.8h
    add         v3.8h, v3.8h, v31.8h
    sqxtunb     z0.b, z0.h
    sqxtunb     z1.b, z1.h
    sqxtunb     z2.b, z2.h
    sqxtunb     z3.b, z3.h

    st1b        {z0.h}, p1, [x0]
    add         x0, x0, x2
    st1b        {z1.h}, p1, [x0]
    add         x0, x0, x2
    st1b        {z3.h}, p1, [x0]
    add         x0, x0, x2
    st1b        {z2.h}, p1, [x0]
    ret
endfunc
|
||||||
998
common/aarch64/dct-a.S
Normal file
998
common/aarch64/dct-a.S
Normal file
@@ -0,0 +1,998 @@
|
|||||||
|
/****************************************************************************
|
||||||
|
* dct-a.S: aarch64 transform and zigzag
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
* Janne Grunau <janne-x264@jannau.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "asm.S"
|
||||||
|
#include "dct-a-common.S"
|
||||||
|
|
||||||
|
// 32 byte index table. Every entry is a byte pair (2k, 2k+1), i.e. it
// selects the 16 bit element k of a 32 byte source. In element terms the
// order is 0 4 1 2 5 8 12 9 6 3 7 10 13 14 11 15.
// NOTE(review): presumably consumed by a tbl based frame zigzag scan later
// in this file - the user is not visible here.
const scan4x4_frame, align=4
.byte    0,1,    8,9,    2,3,    4,5
.byte   10,11,  16,17,  24,25,  18,19
.byte   12,13,   6,7,   14,15,  20,21
.byte   26,27,  28,29,  22,23,  30,31
endconst
|
||||||
|
|
||||||
|
// 16 byte index table made of byte pairs (2k, 2k+1), selecting the 16 bit
// elements 0 1 4 2 3 5 6 7 in that order.
// NOTE(review): presumably the permutation for the first half of a field
// scan - the user is not visible here.
const scan4x4_field, align=4
.byte    0,1,    2,3,    8,9,    4,5
.byte    6,7,   10,11,  12,13,  14,15
endconst
|
||||||
|
|
||||||
|
// 16 single byte indices, a permutation of 0..15.
// NOTE(review): presumably the byte level frame scan order used by the
// zigzag_sub routines - the user is not visible here.
const sub4x4_frame, align=4
.byte    0,  1,  4,  8
.byte    5,  2,  3,  6
.byte    9, 12, 13, 10
.byte    7, 11, 14, 15
endconst
|
||||||
|
|
||||||
|
// 16 single byte indices, a permutation of 0..15.
// NOTE(review): presumably the byte level field scan order used by the
// zigzag_sub routines - the user is not visible here.
const sub4x4_field, align=4
.byte    0,  4,  1,  8
.byte   12,  5,  9, 13
.byte    2,  6, 10, 14
.byte    3,  7, 11, 15
endconst
|
||||||
|
|
||||||
|
// SUMSUB_SHR shift, sum, sub, a, b, t0, t1
//   sum = a + (b >> shift)
//   sub = (a >> shift) - b
// Shifts are arithmetic (sshr). t0 and t1 are scratch registers.
// Both shifted values are formed before sum or sub is written, and sum is
// written before sub, so a caller may pass the same register as sub and a.
.macro SUMSUB_SHR shift sum sub a b t0 t1
    sshr        \t0,  \b, #\shift
    sshr        \t1,  \a, #\shift
    add         \sum, \a, \t0
    sub         \sub, \t1, \b
.endm
|
||||||
|
|
||||||
|
// SUMSUB_SHR2 shift, sum, sub, a, b, t0, t1
//   sum = (a >> shift) + b
//   sub = a - (b >> shift)
// Shifts are arithmetic (sshr). t0 and t1 are scratch registers and both
// shifted values are formed before sum or sub is written.
.macro SUMSUB_SHR2 shift sum sub a b t0 t1
    sshr        \t0,  \a, #\shift
    sshr        \t1,  \b, #\shift
    add         \sum, \t0, \b
    sub         \sub, \a, \t1
.endm
|
||||||
|
|
||||||
|
// SUMSUB_15 a, b, ma, mb, t0, t1
//   a += ma + (ma >> 1)
//   b -= mb + (mb >> 1)
// i.e. roughly 1.5 times ma and mb, with the halving done by an
// arithmetic shift. t0 and t1 are scratch registers.
.macro SUMSUB_15 a b ma mb t0 t1
    sshr        \t0, \ma, #1
    sshr        \t1, \mb, #1
    add         \t0, \t0, \ma
    add         \t1, \t1, \mb
    add         \a,  \a,  \t0
    sub         \b,  \b,  \t1
.endm
|
||||||
|
|
||||||
|
|
||||||
|
// dct4x4dc_neon
//   x0 = 4x4 block of 16 bit values, transformed in place
// Two butterfly passes built from SUMSUB_AB and the transpose macro,
// followed by a final stage that halves with rounding:
//   srhadd gives (a + b + 1) >> 1
//   shsub on (a + 1) and b gives (a + 1 - b) >> 1
function dct4x4dc_neon, export=1
    movi        v31.4h, #1                      // rounding constant for the shsub lanes
    ld1         {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]

    SUMSUB_AB   v4.4h,  v5.4h,  v0.4h,  v1.4h
    SUMSUB_AB   v6.4h,  v7.4h,  v2.4h,  v3.4h
    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
    SUMSUB_AB   v3.4h,  v1.4h,  v5.4h,  v7.4h

    transpose   v4.4h,  v6.4h,  v0.4h,  v2.4h
    transpose   v5.4h,  v7.4h,  v1.4h,  v3.4h
    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
    SUMSUB_AB   v1.4h,  v3.4h,  v5.4h,  v7.4h
    transpose   v4.2s,  v5.2s,  v0.2s,  v1.2s
    transpose   v6.2s,  v7.2s,  v2.2s,  v3.2s

    // final butterflies with the rounding halving folded in
    srhadd      v0.4h,  v4.4h,  v5.4h
    srhadd      v3.4h,  v6.4h,  v7.4h
    add         v16.4h, v4.4h,  v31.4h
    shsub       v1.4h,  v16.4h, v5.4h
    add         v17.4h, v6.4h,  v31.4h
    shsub       v2.4h,  v17.4h, v7.4h

    st1         {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
    ret
endfunc
|
||||||
|
|
||||||
|
// idct4x4dc_neon
//   x0 = 4x4 block of 16 bit values, transformed in place
// Same butterfly and transpose structure as the forward DC transform, but
// the last stage is a plain SUMSUB_AB pair without any rounding or halving.
function idct4x4dc_neon, export=1
    ld1         {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]

    // first pass (the two butterflies of a stage are independent)
    SUMSUB_AB   v6.4h,  v7.4h,  v2.4h,  v3.4h
    SUMSUB_AB   v4.4h,  v5.4h,  v0.4h,  v1.4h
    SUMSUB_AB   v3.4h,  v1.4h,  v5.4h,  v7.4h
    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h

    transpose   v4.4h,  v6.4h,  v0.4h,  v2.4h
    transpose   v5.4h,  v7.4h,  v1.4h,  v3.4h

    // second pass
    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
    SUMSUB_AB   v1.4h,  v3.4h,  v5.4h,  v7.4h
    transpose   v4.2s,  v5.2s,  v0.2s,  v1.2s
    transpose   v6.2s,  v7.2s,  v2.2s,  v3.2s
    SUMSUB_AB   v3.4h,  v2.4h,  v6.4h,  v7.4h
    SUMSUB_AB   v0.4h,  v1.4h,  v4.4h,  v5.4h

    st1         {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
    ret
endfunc
|
||||||
|
|
||||||
|
// sub4x4_dct_neon
//   x0 = output, receives four 4h vectors (16 coefficients)
//   x1 = first pixel block, rows FENC_STRIDE bytes apart
//   x2 = second pixel block, rows FDEC_STRIDE bytes apart
// Four bytes per row are loaded into lane 0 of a vector; usubl widens and
// subtracts all eight bytes, but only the low 4h of each difference feeds
// DCT_1D. The transform is DCT_1D, a 4x4 transpose, then DCT_1D again.
// On exit x1 and x2 have been advanced by four rows.
function sub4x4_dct_neon, export=1
    mov         x4, #FDEC_STRIDE
    mov         x3, #FENC_STRIDE

    // row 0
    ld1         {v0.s}[0], [x1], x3
    ld1         {v1.s}[0], [x2], x4
    usubl       v16.8h, v0.8b,  v1.8b
    // row 1
    ld1         {v2.s}[0], [x1], x3
    ld1         {v3.s}[0], [x2], x4
    usubl       v17.8h, v2.8b,  v3.8b
    // row 2
    ld1         {v4.s}[0], [x1], x3
    ld1         {v5.s}[0], [x2], x4
    usubl       v18.8h, v4.8b,  v5.8b
    // row 3
    ld1         {v6.s}[0], [x1], x3
    ld1         {v7.s}[0], [x2], x4
    usubl       v19.8h, v6.8b,  v7.8b

    DCT_1D      v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
    transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
    DCT_1D      v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
    st1         {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
    ret
endfunc
|
||||||
|
|
||||||
|
// sub8x4_dct_neon (file local helper, not exported)
//   x0 = output cursor, 64 bytes are written and x0 advances by 64
//   x1 = first pixel block, row stride taken from x3
//   x2 = second pixel block, row stride taken from x4
// x3 and x4 must be set by the caller; this routine never initialises them.
// Processes an 8 wide, 4 high strip: the row differences go through DCT_1D
// on 8h vectors, a 4x8 transpose and a second hand written 1-D pass.
// The zip1/zip2 on 64 bit halves then separates the low halves from the
// high halves, so the first 32 bytes stored come from the low halves and
// the next 32 bytes from the high halves.
// On exit x1 and x2 have been advanced by four rows.
// Uses v0-v7, v16-v19, v21 and v22 only.
function sub8x4_dct_neon
    // four rows of the first block
    ld1         {v0.8b}, [x1], x3
    ld1         {v2.8b}, [x1], x3
    ld1         {v4.8b}, [x1], x3
    ld1         {v6.8b}, [x1], x3
    // four rows of the second block
    ld1         {v1.8b}, [x2], x4
    ld1         {v3.8b}, [x2], x4
    ld1         {v5.8b}, [x2], x4
    ld1         {v7.8b}, [x2], x4
    // widened differences
    usubl       v16.8h, v0.8b,  v1.8b
    usubl       v17.8h, v2.8b,  v3.8b
    usubl       v18.8h, v4.8b,  v5.8b
    usubl       v19.8h, v6.8b,  v7.8b

    DCT_1D      v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
    transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7

    // second 1-D pass
    SUMSUB_AB   v16.8h, v19.8h, v0.8h,  v3.8h
    SUMSUB_AB   v17.8h, v18.8h, v1.8h,  v2.8h
    add         v0.8h,  v16.8h, v17.8h
    sub         v1.8h,  v16.8h, v17.8h
    add         v22.8h, v19.8h, v19.8h
    add         v21.8h, v18.8h, v18.8h
    add         v2.8h,  v22.8h, v18.8h
    sub         v3.8h,  v19.8h, v21.8h

    // split low and high 64 bit halves
    zip1        v4.2d,  v0.2d,  v2.2d
    zip1        v5.2d,  v1.2d,  v3.2d
    zip2        v6.2d,  v0.2d,  v2.2d
    zip2        v7.2d,  v1.2d,  v3.2d

    st1         {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], #64
    ret
endfunc
|
||||||
|
|
||||||
|
// sub8x8_dct_neon
//   x0 = output, x1 / x2 = the two pixel blocks
// Runs sub8x4_dct_neon twice: the upper 8x4 strip through a call, the
// lower one through a tail branch. The helper advances x0, x1 and x2
// itself, so no pointer fix-up is needed in between.
// The return address is parked in x5, which the helper does not touch.
function sub8x8_dct_neon, export=1
    mov         x3, #FENC_STRIDE
    mov         x4, #FDEC_STRIDE
    mov         x5, x30                         // keep the return address across bl
    bl          sub8x4_dct_neon
    mov         x30, x5
    b           sub8x4_dct_neon                 // tail call returns to our caller
endfunc
|
||||||
|
|
||||||
|
// sub16x16_dct_neon
//   x0 = output, x1 / x2 = the two pixel blocks
// Covers the 16x16 area as four 8x8 quadrants, each handled by two
// sub8x4_dct_neon calls (every call moves x1 and x2 down by four rows and
// x0 forward by 64 bytes). Between quadrants the pixel pointers are moved:
//   up 8 rows and right 8 columns, then left 8 columns, then up 8 rows
//   and right 8 columns again.
// The return address is parked in x5; the last helper is tail called.
function sub16x16_dct_neon, export=1
    mov         x3, #FENC_STRIDE
    mov         x4, #FDEC_STRIDE
    mov         x5, x30

    // first quadrant
    bl          sub8x4_dct_neon
    bl          sub8x4_dct_neon
    // up 8 rows, right 8 columns
    sub         x2, x2, #8*FDEC_STRIDE-8
    sub         x1, x1, #8*FENC_STRIDE-8
    bl          sub8x4_dct_neon
    bl          sub8x4_dct_neon
    // back to column 0, already 8 rows further down
    sub         x2, x2, #8
    sub         x1, x1, #8
    bl          sub8x4_dct_neon
    bl          sub8x4_dct_neon
    // up 8 rows, right 8 columns
    sub         x2, x2, #8*FDEC_STRIDE-8
    sub         x1, x1, #8*FENC_STRIDE-8
    bl          sub8x4_dct_neon
    mov         x30, x5
    b           sub8x4_dct_neon
endfunc
|
||||||
|
|
||||||
|
|
||||||
|
// DCT8_1D: one dimensional 8 point forward transform on fixed registers.
//   inputs : v0-v7 (8h each)
//   outputs: v0-v7
//   scratch: v16-v31
// The type argument (row or col at the call sites) is not referenced
// anywhere in the body, so both passes expand to the same code.
.macro DCT8_1D type
    // sums and differences of mirrored inputs
    SUMSUB_AB   v23.8h, v20.8h, v0.8h,  v7.8h   // s07/d07
    SUMSUB_AB   v22.8h, v21.8h, v1.8h,  v6.8h   // s16/d16
    SUMSUB_AB   v19.8h, v16.8h, v2.8h,  v5.8h   // s25/d25
    SUMSUB_AB   v18.8h, v17.8h, v3.8h,  v4.8h   // s34/d34

    SUMSUB_AB   v24.8h, v26.8h, v23.8h, v18.8h  // a0/a2
    SUMSUB_AB   v25.8h, v27.8h, v22.8h, v19.8h  // a1/a3

    SUMSUB_AB   v30.8h, v29.8h, v20.8h, v17.8h  // a6/a5
    sshr        v23.8h, v21.8h, #1
    sshr        v18.8h, v16.8h, #1
    add         v23.8h, v23.8h, v21.8h          // d16 + (d16 >> 1)
    add         v18.8h, v18.8h, v16.8h          // d25 + (d25 >> 1)
    sub         v30.8h, v30.8h, v23.8h
    sub         v29.8h, v29.8h, v18.8h

    SUMSUB_AB   v28.8h, v31.8h, v21.8h, v16.8h  // a4/a7
    sshr        v22.8h, v20.8h, #1
    sshr        v19.8h, v17.8h, #1
    add         v22.8h, v22.8h, v20.8h          // d07 + (d07 >> 1)
    add         v19.8h, v19.8h, v17.8h          // d34 + (d34 >> 1)
    add         v22.8h, v28.8h, v22.8h
    add         v31.8h, v31.8h, v19.8h

    // final combination into the eight outputs
    SUMSUB_AB   v0.8h,  v4.8h,  v24.8h, v25.8h
    SUMSUB_SHR  2, v1.8h,  v7.8h,  v22.8h, v31.8h, v16.8h, v17.8h
    SUMSUB_SHR  1, v2.8h,  v6.8h,  v26.8h, v27.8h, v18.8h, v19.8h
    SUMSUB_SHR2 2, v3.8h,  v5.8h,  v30.8h, v29.8h, v20.8h, v21.8h
.endm
|
||||||
|
|
||||||
|
// sub8x8_dct8_neon
//   x0 = output, 128 bytes written (x0 advances by 128)
//   x1 = first pixel block, rows FENC_STRIDE bytes apart
//   x2 = second pixel block, rows FDEC_STRIDE bytes apart
// Loads eight rows of eight pixels from each block, forms the widened
// differences in v0-v7 and applies DCT8_1D, an 8x8 transpose and DCT8_1D
// again. On exit x1 and x2 have been advanced by eight rows.
// Sets x3 and x4 and uses v0-v7 and v16-v31; no other general purpose
// register is written.
function sub8x8_dct8_neon, export=1
    mov         x4, #FDEC_STRIDE
    mov         x3, #FENC_STRIDE

    // row 0
    ld1         {v16.8b}, [x1], x3
    ld1         {v17.8b}, [x2], x4
    usubl       v0.8h,  v16.8b, v17.8b
    // row 1
    ld1         {v18.8b}, [x1], x3
    ld1         {v19.8b}, [x2], x4
    usubl       v1.8h,  v18.8b, v19.8b
    // row 2
    ld1         {v20.8b}, [x1], x3
    ld1         {v21.8b}, [x2], x4
    usubl       v2.8h,  v20.8b, v21.8b
    // row 3
    ld1         {v22.8b}, [x1], x3
    ld1         {v23.8b}, [x2], x4
    usubl       v3.8h,  v22.8b, v23.8b
    // row 4
    ld1         {v24.8b}, [x1], x3
    ld1         {v25.8b}, [x2], x4
    usubl       v4.8h,  v24.8b, v25.8b
    // row 5
    ld1         {v26.8b}, [x1], x3
    ld1         {v27.8b}, [x2], x4
    usubl       v5.8h,  v26.8b, v27.8b
    // row 6
    ld1         {v28.8b}, [x1], x3
    ld1         {v29.8b}, [x2], x4
    usubl       v6.8h,  v28.8b, v29.8b
    // row 7
    ld1         {v30.8b}, [x1], x3
    ld1         {v31.8b}, [x2], x4
    usubl       v7.8h,  v30.8b, v31.8b

    DCT8_1D row
    transpose8x8.h v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
    DCT8_1D col

    st1         {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], #64
    st1         {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], #64
    ret
endfunc
|
||||||
|
|
||||||
|
// sub16x16_dct8_neon
//   x0 = output, x1 / x2 = the two pixel blocks
// Applies sub8x8_dct8_neon to the four 8x8 quadrants. Each call moves x1
// and x2 down by eight rows and x0 forward by 128 bytes, so between calls
// the pixel pointers are moved up 8 rows and right 8 columns, then left
// 8 columns, then up 8 rows and right 8 columns again.
// The return address is parked in x7, which the 8x8 routine leaves alone;
// the last quadrant is a tail call.
function sub16x16_dct8_neon, export=1
    mov         x7, x30
    bl          X(sub8x8_dct8_neon)
    // up 8 rows, right 8 columns
    sub         x2, x2, #FDEC_STRIDE*8 - 8
    sub         x1, x1, #FENC_STRIDE*8 - 8
    bl          X(sub8x8_dct8_neon)
    // back to column 0, 8 rows further down
    sub         x2, x2, #8
    sub         x1, x1, #8
    bl          X(sub8x8_dct8_neon)
    // up 8 rows, right 8 columns
    sub         x2, x2, #FDEC_STRIDE*8 - 8
    sub         x1, x1, #FENC_STRIDE*8 - 8
    mov         x30, x7
    b           X(sub8x8_dct8_neon)
endfunc
|
||||||
|
|
||||||
|
|
||||||
|
// IDCT_1D: first half of a 4 point inverse transform.
//   inputs : d0 d1 d2 d3 (left unmodified)
//   outputs: d4 = d0 + d2
//            d5 = d0 - d2
//            d6 = d1 + (d3 >> 1)
//            d7 = (d1 >> 1) - d3
// (d4/d5 as produced by SUMSUB_AB, assumed to be sum = a + b, sub = a - b.)
// The closing butterflies are not part of the macro: the callers in this
// file follow it with a pair of SUMSUB_AB invocations.
.macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
    SUMSUB_AB   \d4, \d5, \d0, \d2
    sshr        \d7, \d1, #1
    sshr        \d6, \d3, #1
    sub         \d7, \d7, \d3
    add         \d6, \d6, \d1
.endm
|
||||||
|
|
||||||
|
// add4x4_idct_neon
//   x0 = destination pixels, rows FDEC_STRIDE bytes apart (read and written)
//   x1 = 16 input coefficients (16 bit each)
// Two inverse transform passes (IDCT_1D plus closing SUMSUB_AB pair) with a
// 4x4 transpose in between, a rounding shift by 6, addition of the existing
// pixels and a saturating narrow back to bytes.
// Rows 2 and 3 live in swapped registers after the transform: v3 carries
// row 2 and v2 carries row 3, matching the v31/v30 pixel loads and the
// v3/v2 store order.
// On exit x0 has been advanced by four rows.
function add4x4_idct_neon, export=1
    mov         x2, #FDEC_STRIDE
    ld1         {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]

    // read the four destination rows, then rewind x0
    ld1         {v28.s}[0], [x0], x2
    ld1         {v29.s}[0], [x0], x2
    ld1         {v31.s}[0], [x0], x2
    ld1         {v30.s}[0], [x0], x2
    sub         x0, x0, x2, lsl #2

    // first pass
    IDCT_1D     v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
    SUMSUB_AB   v0.4h, v2.4h, v4.4h, v6.4h
    SUMSUB_AB   v1.4h, v3.4h, v5.4h, v7.4h

    transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19

    // second pass
    IDCT_1D     v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v3.4h, v2.4h
    SUMSUB_AB   v0.4h, v2.4h, v4.4h, v6.4h
    SUMSUB_AB   v1.4h, v3.4h, v5.4h, v7.4h

    // round, add prediction, saturate to bytes
    srshr       v0.4h, v0.4h, #6
    srshr       v1.4h, v1.4h, #6
    srshr       v2.4h, v2.4h, #6
    srshr       v3.4h, v3.4h, #6
    uaddw       v0.8h, v0.8h, v28.8b
    uaddw       v1.8h, v1.8h, v29.8b
    uaddw       v2.8h, v2.8h, v30.8b
    uaddw       v3.8h, v3.8h, v31.8b
    sqxtun      v0.8b, v0.8h
    sqxtun      v1.8b, v1.8h
    sqxtun      v2.8b, v2.8h
    sqxtun      v3.8b, v3.8h

    st1         {v0.s}[0], [x0], x2
    st1         {v1.s}[0], [x0], x2
    st1         {v3.s}[0], [x0], x2
    st1         {v2.s}[0], [x0], x2
    ret
endfunc
|
||||||
|
|
||||||
|
// add8x4_idct_neon
//   x0 = destination pixels (read and written), advanced by four rows
//   x1 = input coefficients, 64 bytes consumed and x1 advanced by 64
//   x2 = destination row stride; it is NOT set here, the callers in this
//        file load FDEC_STRIDE into it before calling
// Inverse transforms an 8 wide, 4 high strip: the 64 bit halves of the
// four input vectors are regrouped with the transpose macro, followed by
// two IDCT_1D passes with a 4x8 transpose in between, a rounding shift by
// 6, addition of the existing pixels and a saturating narrow to bytes.
// Uses v0-v7, v16-v23 and v28-v31; no general purpose register besides
// x0 and x1 is written.
function add8x4_idct_neon, export=1
    ld1         {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64
    transpose   v20.2d, v21.2d, v0.2d,  v2.2d
    transpose   v22.2d, v23.2d, v1.2d,  v3.2d

    // first pass
    IDCT_1D     v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
    SUMSUB_AB   v0.8h,  v3.8h,  v16.8h, v18.8h
    SUMSUB_AB   v1.8h,  v2.8h,  v17.8h, v19.8h

    transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7

    // second pass
    IDCT_1D     v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h
    SUMSUB_AB   v0.8h,  v3.8h,  v16.8h, v18.8h
    SUMSUB_AB   v1.8h,  v2.8h,  v17.8h, v19.8h

    // rounding shift
    srshr       v0.8h,  v0.8h,  #6
    srshr       v1.8h,  v1.8h,  #6
    srshr       v2.8h,  v2.8h,  #6
    srshr       v3.8h,  v3.8h,  #6

    // read the four destination rows, then rewind x0
    ld1         {v28.8b}, [x0], x2
    ld1         {v29.8b}, [x0], x2
    ld1         {v30.8b}, [x0], x2
    ld1         {v31.8b}, [x0], x2
    sub         x0,  x0,  x2, lsl #2

    // add prediction and saturate to bytes
    uaddw       v0.8h,  v0.8h,  v28.8b
    uaddw       v1.8h,  v1.8h,  v29.8b
    uaddw       v2.8h,  v2.8h,  v30.8b
    uaddw       v3.8h,  v3.8h,  v31.8b
    sqxtun      v0.8b,  v0.8h
    sqxtun      v1.8b,  v1.8h
    sqxtun      v2.8b,  v2.8h
    sqxtun      v3.8b,  v3.8h

    st1         {v0.8b}, [x0], x2
    st1         {v1.8b}, [x0], x2
    st1         {v2.8b}, [x0], x2
    st1         {v3.8b}, [x0], x2
    ret
endfunc
|
||||||
|
|
||||||
|
// add8x8_idct_neon
//   x0 = destination pixels, x1 = input coefficients
// Runs add8x4_idct_neon on the upper and then the lower 8x4 strip. The
// helper advances x0 and x1 itself, so nothing is adjusted in between.
// x2 is loaded with FDEC_STRIDE for the helper and the return address is
// parked in x5, which the helper does not touch.
function add8x8_idct_neon, export=1
    mov         x5, x30
    mov         x2, #FDEC_STRIDE
    bl          X(add8x4_idct_neon)
    mov         x30, x5
    b           X(add8x4_idct_neon)             // tail call returns to our caller
endfunc
|
||||||
|
|
||||||
|
// add16x16_idct_neon
//   x0 = destination pixels, x1 = input coefficients
// Covers the 16x16 area as four 8x8 quadrants, two add8x4_idct_neon calls
// each (every call moves x0 down by four rows and x1 forward by 64 bytes).
// Between quadrants x0 is moved up 8 rows and right 8 columns, then left
// 8 columns, then up 8 rows and right 8 columns again.
// x2 holds FDEC_STRIDE for the helper; the return address is parked in x5
// and the final helper is tail called.
function add16x16_idct_neon, export=1
    mov         x5, x30
    mov         x2, #FDEC_STRIDE

    // first quadrant
    bl          X(add8x4_idct_neon)
    bl          X(add8x4_idct_neon)
    // up 8 rows, right 8 columns
    sub         x0, x0, #8*FDEC_STRIDE-8
    bl          X(add8x4_idct_neon)
    bl          X(add8x4_idct_neon)
    // back to column 0, 8 rows further down
    sub         x0, x0, #8
    bl          X(add8x4_idct_neon)
    bl          X(add8x4_idct_neon)
    // up 8 rows, right 8 columns
    sub         x0, x0, #8*FDEC_STRIDE-8
    bl          X(add8x4_idct_neon)
    mov         x30, x5
    b           X(add8x4_idct_neon)
endfunc
|
||||||
|
|
||||||
|
// IDCT8_1D: one dimensional 8 point inverse transform on fixed registers.
//   inputs : v16-v23 (8h each)
//   outputs: v16-v23
//   scratch: v0-v3
// When the type argument is the word row, v22 and v23 are not expected to
// be loaded yet: they are fetched from [x1] (post incremented by 32)
// right after the first butterfly. For any other type the macro touches
// no memory.
// Several invocations below pass the same register as an output and an
// input, so the statement order must not be changed.
.macro IDCT8_1D type
    SUMSUB_AB   v0.8h,  v1.8h,  v16.8h, v20.8h                      // a0/a2
.ifc \type, row
    ld1         {v22.8h,v23.8h}, [x1], #32
.endif
    SUMSUB_SHR  1, v2.8h,  v3.8h,  v18.8h, v22.8h, v16.8h, v20.8h   // a6/a4
    SUMSUB_AB   v16.8h, v18.8h, v21.8h, v19.8h
    SUMSUB_15   v16.8h, v18.8h, v17.8h, v23.8h, v20.8h, v22.8h      // a7/a1
    SUMSUB_AB   v22.8h, v23.8h, v23.8h, v17.8h
    SUMSUB_15   v23.8h, v22.8h, v21.8h, v19.8h, v20.8h, v17.8h      // a5/a3

    SUMSUB_SHR  2, v21.8h, v22.8h, v22.8h, v23.8h, v19.8h, v17.8h   // b3/b5
    SUMSUB_SHR2 2, v20.8h, v23.8h, v16.8h, v18.8h, v19.8h, v17.8h   // b1/b7

    SUMSUB_AB   v18.8h, v2.8h,  v0.8h,  v2.8h                       // b0/b6
    SUMSUB_AB   v19.8h, v3.8h,  v1.8h,  v3.8h                       // b2/b4

    // closing butterflies produce the eight outputs
    SUMSUB_AB   v16.8h, v23.8h, v18.8h, v23.8h
    SUMSUB_AB   v17.8h, v22.8h, v19.8h, v22.8h
    SUMSUB_AB   v18.8h, v21.8h, v3.8h,  v21.8h
    SUMSUB_AB   v19.8h, v20.8h, v2.8h,  v20.8h
.endm
|
||||||
|
|
||||||
|
// add8x8_idct8_neon
//   x0 = destination pixels, rows FDEC_STRIDE bytes apart (read and
//        written), advanced by eight rows on exit
//   x1 = input coefficients, 128 bytes consumed and x1 advanced by 128
// Six coefficient vectors are loaded here; the remaining two (v22, v23)
// are loaded inside the row variant of IDCT8_1D. The block goes through
// IDCT8_1D, an 8x8 transpose and IDCT8_1D again, then each row is rounded
// with a shift by 6, added to the existing pixels and narrowed with
// unsigned saturation.
// Sets x2 and uses v0-v7, v16-v23, v30 and v31; no other general purpose
// register is written.
function add8x8_idct8_neon, export=1
    mov         x2,  #FDEC_STRIDE
    ld1         {v16.8h,v17.8h,v18.8h,v19.8h}, [x1], #64
    ld1         {v20.8h,v21.8h}, [x1], #32

    IDCT8_1D row

    transpose8x8.h v16, v17, v18, v19, v20, v21, v22, v23, v30, v31

    IDCT8_1D col

    // rounding shift on all eight rows
    srshr       v16.8h, v16.8h, #6
    srshr       v17.8h, v17.8h, #6
    srshr       v18.8h, v18.8h, #6
    srshr       v19.8h, v19.8h, #6
    srshr       v20.8h, v20.8h, #6
    srshr       v21.8h, v21.8h, #6
    srshr       v22.8h, v22.8h, #6
    srshr       v23.8h, v23.8h, #6

    // read the eight destination rows, then rewind x0
    ld1         {v0.8b}, [x0], x2
    ld1         {v1.8b}, [x0], x2
    ld1         {v2.8b}, [x0], x2
    ld1         {v3.8b}, [x0], x2
    ld1         {v4.8b}, [x0], x2
    ld1         {v5.8b}, [x0], x2
    ld1         {v6.8b}, [x0], x2
    ld1         {v7.8b}, [x0], x2
    sub         x0,  x0,  x2, lsl #3

    // per row: add prediction, saturate to bytes, store
    uaddw       v16.8h, v16.8h, v0.8b
    sqxtun      v0.8b,  v16.8h
    st1         {v0.8b}, [x0], x2
    uaddw       v17.8h, v17.8h, v1.8b
    sqxtun      v1.8b,  v17.8h
    st1         {v1.8b}, [x0], x2
    uaddw       v18.8h, v18.8h, v2.8b
    sqxtun      v2.8b,  v18.8h
    st1         {v2.8b}, [x0], x2
    uaddw       v19.8h, v19.8h, v3.8b
    sqxtun      v3.8b,  v19.8h
    st1         {v3.8b}, [x0], x2
    uaddw       v20.8h, v20.8h, v4.8b
    sqxtun      v4.8b,  v20.8h
    st1         {v4.8b}, [x0], x2
    uaddw       v21.8h, v21.8h, v5.8b
    sqxtun      v5.8b,  v21.8h
    st1         {v5.8b}, [x0], x2
    uaddw       v22.8h, v22.8h, v6.8b
    sqxtun      v6.8b,  v22.8h
    st1         {v6.8b}, [x0], x2
    uaddw       v23.8h, v23.8h, v7.8b
    sqxtun      v7.8b,  v23.8h
    st1         {v7.8b}, [x0], x2
    ret
endfunc
|
||||||
|
|
||||||
|
// void add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] )
// x0 = p_dst, x1 = dct.
// Applies add8x8_idct8_neon to the four 8x8 quadrants of a 16x16 block in the
// order top-left, top-right, bottom-left, bottom-right. Each call returns
// with x0 advanced by 8*FDEC_STRIDE, so only x0 is fixed up between calls.
// NOTE(review): assumes the callee leaves x1 at the next 64-coefficient block
// and never touches x7; part of the callee is a macro not visible here.
function add16x16_idct8_neon, export=1
|
||||||
|
mov x7, x30 // park the return address; bl overwrites x30
|
||||||
|
bl X(add8x8_idct8_neon) // top-left; x0 -> p_dst + 8*FDEC_STRIDE
|
||||||
|
sub x0, x0, #8*FDEC_STRIDE-8 // back up 8 rows, right 8 pixels: top-right
|
||||||
|
bl X(add8x8_idct8_neon)
|
||||||
|
sub x0, x0, #8 // already 8 rows down; left 8 pixels: bottom-left
|
||||||
|
bl X(add8x8_idct8_neon)
|
||||||
|
sub x0, x0, #8*FDEC_STRIDE-8 // back up 8 rows, right 8 pixels: bottom-right
|
||||||
|
mov x30, x7 // restore the return address so the callee's ret returns to our caller
|
||||||
|
b X(add8x8_idct8_neon) // tail call for the last quadrant
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
function add8x8_idct_dc_neon, export=1
|
||||||
|
mov x2, #FDEC_STRIDE
|
||||||
|
ld1 {v16.4h}, [x1]
|
||||||
|
ld1 {v0.8b}, [x0], x2
|
||||||
|
srshr v16.4h, v16.4h, #6
|
||||||
|
ld1 {v1.8b}, [x0], x2
|
||||||
|
dup v20.8h, v16.h[0]
|
||||||
|
dup v21.8h, v16.h[1]
|
||||||
|
ld1 {v2.8b}, [x0], x2
|
||||||
|
dup v22.8h, v16.h[2]
|
||||||
|
dup v23.8h, v16.h[3]
|
||||||
|
ld1 {v3.8b}, [x0], x2
|
||||||
|
trn1 v20.2d, v20.2d, v21.2d
|
||||||
|
ld1 {v4.8b}, [x0], x2
|
||||||
|
trn1 v21.2d, v22.2d, v23.2d
|
||||||
|
ld1 {v5.8b}, [x0], x2
|
||||||
|
neg v22.8h, v20.8h
|
||||||
|
ld1 {v6.8b}, [x0], x2
|
||||||
|
neg v23.8h, v21.8h
|
||||||
|
ld1 {v7.8b}, [x0], x2
|
||||||
|
|
||||||
|
sub x0, x0, #8*FDEC_STRIDE
|
||||||
|
|
||||||
|
sqxtun v20.8b, v20.8h
|
||||||
|
sqxtun v21.8b, v21.8h
|
||||||
|
sqxtun v22.8b, v22.8h
|
||||||
|
sqxtun v23.8b, v23.8h
|
||||||
|
|
||||||
|
uqadd v0.8b, v0.8b, v20.8b
|
||||||
|
uqadd v1.8b, v1.8b, v20.8b
|
||||||
|
uqadd v2.8b, v2.8b, v20.8b
|
||||||
|
uqadd v3.8b, v3.8b, v20.8b
|
||||||
|
uqadd v4.8b, v4.8b, v21.8b
|
||||||
|
uqadd v5.8b, v5.8b, v21.8b
|
||||||
|
uqadd v6.8b, v6.8b, v21.8b
|
||||||
|
uqadd v7.8b, v7.8b, v21.8b
|
||||||
|
uqsub v0.8b, v0.8b, v22.8b
|
||||||
|
uqsub v1.8b, v1.8b, v22.8b
|
||||||
|
uqsub v2.8b, v2.8b, v22.8b
|
||||||
|
uqsub v3.8b, v3.8b, v22.8b
|
||||||
|
uqsub v4.8b, v4.8b, v23.8b
|
||||||
|
uqsub v5.8b, v5.8b, v23.8b
|
||||||
|
uqsub v6.8b, v6.8b, v23.8b
|
||||||
|
uqsub v7.8b, v7.8b, v23.8b
|
||||||
|
|
||||||
|
st1 {v0.8b}, [x0], x2
|
||||||
|
st1 {v1.8b}, [x0], x2
|
||||||
|
st1 {v2.8b}, [x0], x2
|
||||||
|
st1 {v3.8b}, [x0], x2
|
||||||
|
st1 {v4.8b}, [x0], x2
|
||||||
|
st1 {v5.8b}, [x0], x2
|
||||||
|
st1 {v6.8b}, [x0], x2
|
||||||
|
st1 {v7.8b}, [x0], x2
|
||||||
|
ret
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
.macro ADD16x4_IDCT_DC dc
|
||||||
|
ld1 {v4.16b}, [x0], x3
|
||||||
|
dup v24.8h, \dc[0]
|
||||||
|
dup v25.8h, \dc[1]
|
||||||
|
ld1 {v5.16b}, [x0], x3
|
||||||
|
dup v26.8h, \dc[2]
|
||||||
|
dup v27.8h, \dc[3]
|
||||||
|
ld1 {v6.16b}, [x0], x3
|
||||||
|
trn1 v24.2d, v24.2d, v25.2d
|
||||||
|
ld1 {v7.16b}, [x0], x3
|
||||||
|
trn1 v25.2d, v26.2d, v27.2d
|
||||||
|
neg v26.8h, v24.8h
|
||||||
|
neg v27.8h, v25.8h
|
||||||
|
|
||||||
|
sqxtun v20.8b, v24.8h
|
||||||
|
sqxtun v21.8b, v26.8h
|
||||||
|
sqxtun2 v20.16b, v25.8h
|
||||||
|
sqxtun2 v21.16b, v27.8h
|
||||||
|
|
||||||
|
uqadd v4.16b, v4.16b, v20.16b
|
||||||
|
uqadd v5.16b, v5.16b, v20.16b
|
||||||
|
uqadd v6.16b, v6.16b, v20.16b
|
||||||
|
uqadd v7.16b, v7.16b, v20.16b
|
||||||
|
|
||||||
|
uqsub v4.16b, v4.16b, v21.16b
|
||||||
|
uqsub v5.16b, v5.16b, v21.16b
|
||||||
|
uqsub v6.16b, v6.16b, v21.16b
|
||||||
|
st1 {v4.16b}, [x2], x3
|
||||||
|
uqsub v7.16b, v7.16b, v21.16b
|
||||||
|
st1 {v5.16b}, [x2], x3
|
||||||
|
st1 {v6.16b}, [x2], x3
|
||||||
|
st1 {v7.16b}, [x2], x3
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// void add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] )
// x0 = p_dst (read pointer), x1 = dct (16 DC values).
// Adds a rounded DC term to a 16x16 pixel block. Each ADD16x4_IDCT_DC
// expansion handles four 16-pixel rows using four DC values: it reads rows
// through x0 and writes them back through x2, both stepping by x3.
function add16x16_idct_dc_neon, export=1
|
||||||
|
mov x2, x0 // x2 = separate write pointer over the same rows
|
||||||
|
mov x3, #FDEC_STRIDE // row step used by the macro
|
||||||
|
|
||||||
|
ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1] // all 16 DC values, four per register
|
||||||
|
srshr v0.4h, v0.4h, #6 // rounding shift right by 6
|
||||||
|
srshr v1.4h, v1.4h, #6
|
||||||
|
|
||||||
|
// rows 0-3 use DC values 0-3
ADD16x4_IDCT_DC v0.h
|
||||||
|
srshr v2.4h, v2.4h, #6 // scale the next DC group between the macro expansions
|
||||||
|
// rows 4-7 use DC values 4-7
ADD16x4_IDCT_DC v1.h
|
||||||
|
srshr v3.4h, v3.4h, #6
|
||||||
|
// rows 8-11 use DC values 8-11
ADD16x4_IDCT_DC v2.h
|
||||||
|
// rows 12-15 use DC values 12-15
ADD16x4_IDCT_DC v3.h
|
||||||
|
ret
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// sub4x4x2_dct_dc dst, t0..t7
// Reads four 8-pixel rows from [x1] (step x3) and four from [x2] (step x4),
// widens the per-pixel differences (x1 row minus x2 row) to 16 bits and sums
// the four rows, leaving eight per-column sums in \dst.
// Clobbers t0-t7 and advances x1 and x2 by four rows each.
.macro sub4x4x2_dct_dc, dst, t0, t1, t2, t3, t4, t5, t6, t7
|
||||||
|
ld1 {\t0\().8b}, [x1], x3
|
||||||
|
ld1 {\t1\().8b}, [x2], x4
|
||||||
|
ld1 {\t2\().8b}, [x1], x3
|
||||||
|
ld1 {\t3\().8b}, [x2], x4
|
||||||
|
usubl \t0\().8h, \t0\().8b, \t1\().8b // row 0 difference, widened to 16 bits
|
||||||
|
ld1 {\t4\().8b}, [x1], x3
|
||||||
|
ld1 {\t5\().8b}, [x2], x4
|
||||||
|
usubl \t1\().8h, \t2\().8b, \t3\().8b // row 1 difference
|
||||||
|
ld1 {\t6\().8b}, [x1], x3
|
||||||
|
ld1 {\t7\().8b}, [x2], x4
|
||||||
|
add \dst\().8h, \t0\().8h, \t1\().8h // dst = row 0 + row 1
|
||||||
|
usubl \t2\().8h, \t4\().8b, \t5\().8b // row 2 difference
|
||||||
|
usubl \t3\().8h, \t6\().8b, \t7\().8b // row 3 difference
|
||||||
|
add \dst\().8h, \dst\().8h, \t2\().8h
|
||||||
|
add \dst\().8h, \dst\().8h, \t3\().8h // dst = sum of all four row differences
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// void sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
// x0 = dct, x1 = pix1 (stepped by FENC_STRIDE), x2 = pix2 (stepped by
// FDEC_STRIDE).
// Sums the pix1 - pix2 differences of an 8x8 block, four rows at a time, then
// combines the partial sums into the four 16-bit values written to dct.
// NOTE(review): the transpose and SUMSUB_AB helper macros are defined outside
// this view; the combine step is described only as far as the code here shows.
function sub8x8_dct_dc_neon, export=1
|
||||||
|
mov x3, #FENC_STRIDE // row step for pix1
|
||||||
|
mov x4, #FDEC_STRIDE // row step for pix2
|
||||||
|
|
||||||
|
// v0 = per-column difference sums of rows 0-3
sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23
|
||||||
|
// v1 = per-column difference sums of rows 4-7
sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31
|
||||||
|
|
||||||
|
transpose v2.2d, v3.2d, v0.2d, v1.2d
|
||||||
|
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
|
||||||
|
transpose v2.2d, v3.2d, v0.2d, v1.2d
|
||||||
|
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
|
||||||
|
transpose v2.2d, v3.2d, v0.2d, v1.2d
|
||||||
|
|
||||||
|
addp v0.8h, v2.8h, v3.8h // pairwise add: 16 lanes -> 8
|
||||||
|
addp v0.8h, v0.8h, v0.8h // pairwise add again: the low four lanes hold the results
|
||||||
|
|
||||||
|
st1 {v0.4h}, [x0] // write the four 16-bit outputs
|
||||||
|
ret
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
function sub8x16_dct_dc_neon, export=1
|
||||||
|
mov x3, #FENC_STRIDE
|
||||||
|
mov x4, #FDEC_STRIDE
|
||||||
|
sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23
|
||||||
|
sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31
|
||||||
|
sub4x4x2_dct_dc v2, v16, v17, v18, v19, v20, v21, v22, v23
|
||||||
|
sub4x4x2_dct_dc v3, v24, v25, v26, v27, v28, v29, v30, v31
|
||||||
|
|
||||||
|
addp v4.8h, v0.8h, v2.8h
|
||||||
|
addp v5.8h, v1.8h, v3.8h
|
||||||
|
|
||||||
|
transpose v2.4s, v3.4s, v4.4s, v5.4s
|
||||||
|
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
|
||||||
|
|
||||||
|
transpose v2.4s, v3.4s, v0.4s, v1.4s
|
||||||
|
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
|
||||||
|
|
||||||
|
transpose v2.2d, v3.2d, v0.2d, v1.2d
|
||||||
|
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
|
||||||
|
|
||||||
|
trn1 v2.2d, v0.2d, v1.2d
|
||||||
|
trn2 v3.2d, v1.2d, v0.2d
|
||||||
|
|
||||||
|
addp v0.8h, v2.8h, v3.8h
|
||||||
|
|
||||||
|
st1 {v0.8h}, [x0]
|
||||||
|
ret
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
function zigzag_interleave_8x8_cavlc_neon, export=1
|
||||||
|
mov x3, #7
|
||||||
|
movi v31.4s, #1
|
||||||
|
ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64
|
||||||
|
ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64
|
||||||
|
umax v16.8h, v0.8h, v4.8h
|
||||||
|
umax v17.8h, v1.8h, v5.8h
|
||||||
|
umax v18.8h, v2.8h, v6.8h
|
||||||
|
umax v19.8h, v3.8h, v7.8h
|
||||||
|
st1 {v0.8h}, [x0], #16
|
||||||
|
st1 {v4.8h}, [x0], #16
|
||||||
|
umaxp v16.8h, v16.8h, v17.8h
|
||||||
|
umaxp v18.8h, v18.8h, v19.8h
|
||||||
|
st1 {v1.8h}, [x0], #16
|
||||||
|
st1 {v5.8h}, [x0], #16
|
||||||
|
umaxp v16.8h, v16.8h, v18.8h
|
||||||
|
st1 {v2.8h}, [x0], #16
|
||||||
|
st1 {v6.8h}, [x0], #16
|
||||||
|
cmhs v16.4s, v16.4s, v31.4s
|
||||||
|
st1 {v3.8h}, [x0], #16
|
||||||
|
and v16.16b, v16.16b, v31.16b
|
||||||
|
st1 {v7.8h}, [x0], #16
|
||||||
|
st1 {v16.b}[0], [x2], #1
|
||||||
|
st1 {v16.b}[4], [x2], x3
|
||||||
|
st1 {v16.b}[8], [x2], #1
|
||||||
|
st1 {v16.b}[12], [x2]
|
||||||
|
ret
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// void zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] )
// x0 = level (output), x1 = dct (input).
// Reorders the 16 coefficients with two byte-table lookups. The 32 byte
// indices come from the scan4x4_frame table, which is defined outside this
// view.
function zigzag_scan_4x4_frame_neon, export=1
|
||||||
|
movrel x2, scan4x4_frame // x2 = address of the byte-index table
|
||||||
|
ld1 {v0.16b,v1.16b}, [x1] // 32 bytes of input coefficients
|
||||||
|
ld1 {v16.16b,v17.16b}, [x2] // 32 byte indices
|
||||||
|
tbl v2.16b, {v0.16b,v1.16b}, v16.16b // first 16 output bytes
|
||||||
|
tbl v3.16b, {v0.16b,v1.16b}, v17.16b // last 16 output bytes
|
||||||
|
st1 {v2.16b,v3.16b}, [x0]
|
||||||
|
ret
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
.macro zigzag_sub_4x4 f ac
|
||||||
|
function zigzag_sub_4x4\ac\()_\f\()_neon, export=1
|
||||||
|
mov x9, #FENC_STRIDE
|
||||||
|
mov x4, #FDEC_STRIDE
|
||||||
|
movrel x5, sub4x4_\f
|
||||||
|
mov x6, x2
|
||||||
|
ld1 {v0.s}[0], [x1], x9
|
||||||
|
ld1 {v0.s}[1], [x1], x9
|
||||||
|
ld1 {v0.s}[2], [x1], x9
|
||||||
|
ld1 {v0.s}[3], [x1], x9
|
||||||
|
ld1 {v16.16b}, [x5]
|
||||||
|
ld1 {v1.s}[0], [x2], x4
|
||||||
|
ld1 {v1.s}[1], [x2], x4
|
||||||
|
ld1 {v1.s}[2], [x2], x4
|
||||||
|
ld1 {v1.s}[3], [x2], x4
|
||||||
|
tbl v2.16b, {v0.16b}, v16.16b
|
||||||
|
tbl v3.16b, {v1.16b}, v16.16b
|
||||||
|
st1 {v0.s}[0], [x6], x4
|
||||||
|
usubl v4.8h, v2.8b, v3.8b
|
||||||
|
.ifc \ac, ac
|
||||||
|
dup h7, v4.h[0]
|
||||||
|
ins v4.h[0], wzr
|
||||||
|
fmov w5, s7
|
||||||
|
strh w5, [x3]
|
||||||
|
.endif
|
||||||
|
usubl2 v5.8h, v2.16b, v3.16b
|
||||||
|
st1 {v0.s}[1], [x6], x4
|
||||||
|
umax v6.8h, v4.8h, v5.8h
|
||||||
|
umaxv h6, v6.8h
|
||||||
|
st1 {v0.s}[2], [x6], x4
|
||||||
|
fmov w7, s6
|
||||||
|
st1 {v0.s}[3], [x6], x4
|
||||||
|
cmp w7, #0
|
||||||
|
st1 {v4.8h,v5.8h}, [x0]
|
||||||
|
cset w0, ne
|
||||||
|
ret
|
||||||
|
endfunc
|
||||||
|
.endm
|
||||||
|
|
||||||
|
zigzag_sub_4x4 field
|
||||||
|
zigzag_sub_4x4 field, ac
|
||||||
|
zigzag_sub_4x4 frame
|
||||||
|
zigzag_sub_4x4 frame, ac
|
||||||
|
|
||||||
|
// void zigzag_scan_4x4_field_neon( int16_t level[16], int16_t dct[16] )
// x0 = level (output), x1 = dct (input).
// Only the first eight coefficients (v0) are permuted, using the 16-byte
// scan4x4_field index table defined outside this view. The last eight (v1)
// are copied through unchanged.
function zigzag_scan_4x4_field_neon, export=1
|
||||||
|
movrel x2, scan4x4_field // x2 = address of the byte-index table
|
||||||
|
ld1 {v0.8h,v1.8h}, [x1] // all 16 input coefficients
|
||||||
|
ld1 {v16.16b}, [x2] // 16 byte indices
|
||||||
|
tbl v0.16b, {v0.16b}, v16.16b // shuffle within the first 8 coefficients
|
||||||
|
st1 {v0.8h,v1.8h}, [x0] // v1 is stored exactly as loaded
|
||||||
|
ret
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
function zigzag_scan_8x8_frame_neon, export=1
|
||||||
|
movrel x2, scan8x8_frame
|
||||||
|
ld1 {v0.8h,v1.8h}, [x1], #32
|
||||||
|
ld1 {v2.8h,v3.8h}, [x1], #32
|
||||||
|
ld1 {v4.8h,v5.8h}, [x1], #32
|
||||||
|
ld1 {v6.8h,v7.8h}, [x1]
|
||||||
|
ld1 {v16.16b,v17.16b}, [x2], #32
|
||||||
|
ld1 {v18.16b,v19.16b}, [x2], #32
|
||||||
|
ld1 {v20.16b,v21.16b}, [x2], #32
|
||||||
|
ld1 {v22.16b,v23.16b}, [x2], #32
|
||||||
|
tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
|
||||||
|
tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
|
||||||
|
tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
|
||||||
|
tbl v27.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v19.16b
|
||||||
|
tbl v28.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v20.16b
|
||||||
|
tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v21.16b
|
||||||
|
tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v22.16b
|
||||||
|
tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v23.16b
|
||||||
|
mov v25.h[6], v4.h[0]
|
||||||
|
mov v25.h[7], v5.h[0]
|
||||||
|
mov v26.h[0], v4.h[1]
|
||||||
|
mov v27.h[4], v7.h[0]
|
||||||
|
mov v28.h[7], v4.h[4]
|
||||||
|
mov v29.h[7], v3.h[6]
|
||||||
|
mov v30.h[0], v2.h[7]
|
||||||
|
mov v30.h[1], v3.h[7]
|
||||||
|
st1 {v24.8h,v25.8h}, [x0], #32
|
||||||
|
st1 {v26.8h,v27.8h}, [x0], #32
|
||||||
|
st1 {v28.8h,v29.8h}, [x0], #32
|
||||||
|
st1 {v30.8h,v31.8h}, [x0]
|
||||||
|
ret
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
#define Z(z) 2*(z), 2*(z)+1
|
||||||
|
#define T(x,y) Z(x*8+y)
|
||||||
|
const scan8x8_frame, align=5
|
||||||
|
.byte T(0,0), T(1,0), T(0,1), T(0,2)
|
||||||
|
.byte T(1,1), T(2,0), T(3,0), T(2,1)
|
||||||
|
.byte T(1,2), T(0,3), T(0,4), T(1,3)
|
||||||
|
.byte T(2,2), T(3,1), T(4,0), T(5,0)
|
||||||
|
.byte T(4,1), T(3,2), T(2,3), T(1,4)
|
||||||
|
.byte T(0,5), T(0,6), T(1,5), T(2,4)
|
||||||
|
#undef T
|
||||||
|
#define T(x,y) Z((x-3)*8+y)
|
||||||
|
.byte T(3,3), T(4,2), T(5,1), T(6,0)
|
||||||
|
.byte T(7,0), T(6,1), T(5,2), T(4,3)
|
||||||
|
#undef T
|
||||||
|
#define T(x,y) Z((x-0)*8+y)
|
||||||
|
.byte T(3,4), T(2,5), T(1,6), T(0,7)
|
||||||
|
.byte T(1,7), T(2,6), T(3,5), T(4,4)
|
||||||
|
#undef T
|
||||||
|
#define T(x,y) Z((x-4)*8+y)
|
||||||
|
.byte T(5,3), T(6,2), T(7,1), T(7,2)
|
||||||
|
.byte T(6,3), T(5,4), T(4,5), T(3,6)
|
||||||
|
.byte T(2,7), T(3,7), T(4,6), T(5,5)
|
||||||
|
.byte T(6,4), T(7,3), T(7,4), T(6,5)
|
||||||
|
.byte T(5,6), T(4,7), T(5,7), T(6,6)
|
||||||
|
.byte T(7,5), T(7,6), T(6,7), T(7,7)
|
||||||
|
endconst
|
||||||
|
|
||||||
|
function zigzag_scan_8x8_field_neon, export=1
|
||||||
|
movrel x2, scan8x8_field
|
||||||
|
ld1 {v0.8h,v1.8h}, [x1], #32
|
||||||
|
ld1 {v2.8h,v3.8h}, [x1], #32
|
||||||
|
ld1 {v4.8h,v5.8h}, [x1], #32
|
||||||
|
ld1 {v6.8h,v7.8h}, [x1]
|
||||||
|
ld1 {v16.16b,v17.16b}, [x2], #32
|
||||||
|
ld1 {v18.16b,v19.16b}, [x2], #32
|
||||||
|
ld1 {v20.16b,v21.16b}, [x2], #32
|
||||||
|
ld1 {v22.16b}, [x2]
|
||||||
|
ext v31.16b, v7.16b, v7.16b, #4
|
||||||
|
tbl v24.16b, {v0.16b,v1.16b}, v16.16b
|
||||||
|
tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
|
||||||
|
tbl v26.16b, {v1.16b,v2.16b,v3.16b,v4.16b}, v18.16b
|
||||||
|
tbl v27.16b, {v2.16b,v3.16b,v4.16b,v5.16b}, v19.16b
|
||||||
|
tbl v28.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v20.16b
|
||||||
|
tbl v29.16b, {v4.16b,v5.16b,v6.16b}, v21.16b
|
||||||
|
tbl v30.16b, {v5.16b,v6.16b,v7.16b}, v22.16b
|
||||||
|
ext v31.16b, v6.16b, v31.16b, #12
|
||||||
|
st1 {v24.8h,v25.8h}, [x0], #32
|
||||||
|
st1 {v26.8h,v27.8h}, [x0], #32
|
||||||
|
st1 {v28.8h,v29.8h}, [x0], #32
|
||||||
|
st1 {v30.8h,v31.8h}, [x0]
|
||||||
|
ret
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
.macro zigzag_sub8x8 f
|
||||||
|
function zigzag_sub_8x8_\f\()_neon, export=1
|
||||||
|
movrel x4, sub8x8_\f
|
||||||
|
mov x5, #FENC_STRIDE
|
||||||
|
mov x6, #FDEC_STRIDE
|
||||||
|
mov x7, x2
|
||||||
|
ld1 {v0.d}[0], [x1], x5
|
||||||
|
ld1 {v0.d}[1], [x1], x5
|
||||||
|
ld1 {v1.d}[0], [x1], x5
|
||||||
|
ld1 {v1.d}[1], [x1], x5
|
||||||
|
ld1 {v2.d}[0], [x1], x5
|
||||||
|
ld1 {v2.d}[1], [x1], x5
|
||||||
|
ld1 {v3.d}[0], [x1], x5
|
||||||
|
ld1 {v3.d}[1], [x1]
|
||||||
|
ld1 {v4.d}[0], [x2], x6
|
||||||
|
ld1 {v4.d}[1], [x2], x6
|
||||||
|
ld1 {v5.d}[0], [x2], x6
|
||||||
|
ld1 {v5.d}[1], [x2], x6
|
||||||
|
ld1 {v6.d}[0], [x2], x6
|
||||||
|
ld1 {v6.d}[1], [x2], x6
|
||||||
|
ld1 {v7.d}[0], [x2], x6
|
||||||
|
ld1 {v7.d}[1], [x2]
|
||||||
|
ld1 {v16.16b,v17.16b}, [x4], #32
|
||||||
|
ld1 {v18.16b,v19.16b}, [x4], #32
|
||||||
|
tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
|
||||||
|
tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
|
||||||
|
tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
|
||||||
|
tbl v27.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v19.16b
|
||||||
|
tbl v28.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v16.16b
|
||||||
|
tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v17.16b
|
||||||
|
tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v18.16b
|
||||||
|
tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v19.16b
|
||||||
|
usubl v4.8h, v24.8b, v28.8b
|
||||||
|
usubl2 v5.8h, v24.16b, v28.16b
|
||||||
|
usubl v6.8h, v25.8b, v29.8b
|
||||||
|
usubl2 v7.8h, v25.16b, v29.16b
|
||||||
|
usubl v16.8h, v26.8b, v30.8b
|
||||||
|
usubl2 v17.8h, v26.16b, v30.16b
|
||||||
|
usubl v18.8h, v27.8b, v31.8b
|
||||||
|
usubl2 v19.8h, v27.16b, v31.16b
|
||||||
|
umax v20.8h, v4.8h, v5.8h
|
||||||
|
umax v21.8h, v6.8h, v7.8h
|
||||||
|
umax v22.8h, v16.8h, v17.8h
|
||||||
|
umax v23.8h, v18.8h, v19.8h
|
||||||
|
umax v20.8h, v20.8h, v21.8h
|
||||||
|
umax v21.8h, v22.8h, v23.8h
|
||||||
|
umax v20.8h, v20.8h, v21.8h
|
||||||
|
umaxv h22, v20.8h
|
||||||
|
st1 {v0.d}[0], [x7], x6
|
||||||
|
st1 {v0.d}[1], [x7], x6
|
||||||
|
st1 {v1.d}[0], [x7], x6
|
||||||
|
st1 {v1.d}[1], [x7], x6
|
||||||
|
st1 {v2.d}[0], [x7], x6
|
||||||
|
st1 {v2.d}[1], [x7], x6
|
||||||
|
st1 {v3.d}[0], [x7], x6
|
||||||
|
st1 {v3.d}[1], [x7]
|
||||||
|
st1 {v4.8h,v5.8h}, [x0], #32
|
||||||
|
st1 {v6.8h,v7.8h}, [x0], #32
|
||||||
|
st1 {v16.8h,v17.8h}, [x0], #32
|
||||||
|
st1 {v18.8h,v19.8h}, [x0]
|
||||||
|
fmov w9, s22
|
||||||
|
cmp w9, #0
|
||||||
|
cset w0, ne
|
||||||
|
ret
|
||||||
|
endfunc
|
||||||
|
.endm
|
||||||
|
|
||||||
|
zigzag_sub8x8 field
|
||||||
|
zigzag_sub8x8 frame
|
||||||
|
|
||||||
|
#undef T
|
||||||
|
#define T(x,y) Z(x*8+y)
|
||||||
|
const scan8x8_field, align=5
|
||||||
|
.byte T(0,0), T(0,1), T(0,2), T(1,0)
|
||||||
|
.byte T(1,1), T(0,3), T(0,4), T(1,2)
|
||||||
|
.byte T(2,0), T(1,3), T(0,5), T(0,6)
|
||||||
|
.byte T(0,7), T(1,4), T(2,1), T(3,0)
|
||||||
|
#undef T
|
||||||
|
#define T(x,y) Z((x-1)*8+y)
|
||||||
|
.byte T(2,2), T(1,5), T(1,6), T(1,7)
|
||||||
|
.byte T(2,3), T(3,1), T(4,0), T(3,2)
|
||||||
|
#undef T
|
||||||
|
#define T(x,y) Z((x-2)*8+y)
|
||||||
|
.byte T(2,4), T(2,5), T(2,6), T(2,7)
|
||||||
|
.byte T(3,3), T(4,1), T(5,0), T(4,2)
|
||||||
|
#undef T
|
||||||
|
#define T(x,y) Z((x-3)*8+y)
|
||||||
|
.byte T(3,4), T(3,5), T(3,6), T(3,7)
|
||||||
|
.byte T(4,3), T(5,1), T(6,0), T(5,2)
|
||||||
|
#undef T
|
||||||
|
#define T(x,y) Z((x-4)*8+y)
|
||||||
|
.byte T(4,4), T(4,5), T(4,6), T(4,7)
|
||||||
|
.byte T(5,3), T(6,1), T(6,2), T(5,4)
|
||||||
|
#undef T
|
||||||
|
#define T(x,y) Z((x-5)*8+y)
|
||||||
|
.byte T(5,5), T(5,6), T(5,7), T(6,3)
|
||||||
|
.byte T(7,0), T(7,1), T(6,4), T(6,5)
|
||||||
|
endconst
|
||||||
|
|
||||||
|
|
||||||
|
#undef T
|
||||||
|
#define T(y,x) x*8+y
|
||||||
|
const sub8x8_frame, align=5
|
||||||
|
.byte T(0,0), T(1,0), T(0,1), T(0,2)
|
||||||
|
.byte T(1,1), T(2,0), T(3,0), T(2,1)
|
||||||
|
.byte T(1,2), T(0,3), T(0,4), T(1,3)
|
||||||
|
.byte T(2,2), T(3,1), T(4,0), T(5,0)
|
||||||
|
.byte T(4,1), T(3,2), T(2,3), T(1,4)
|
||||||
|
.byte T(0,5), T(0,6), T(1,5), T(2,4)
|
||||||
|
.byte T(3,3), T(4,2), T(5,1), T(6,0)
|
||||||
|
.byte T(7,0), T(6,1), T(5,2), T(4,3)
|
||||||
|
.byte T(3,4), T(2,5), T(1,6), T(0,7)
|
||||||
|
.byte T(1,7), T(2,6), T(3,5), T(4,4)
|
||||||
|
.byte T(5,3), T(6,2), T(7,1), T(7,2)
|
||||||
|
.byte T(6,3), T(5,4), T(4,5), T(3,6)
|
||||||
|
.byte T(2,7), T(3,7), T(4,6), T(5,5)
|
||||||
|
.byte T(6,4), T(7,3), T(7,4), T(6,5)
|
||||||
|
.byte T(5,6), T(4,7), T(5,7), T(6,6)
|
||||||
|
.byte T(7,5), T(7,6), T(6,7), T(7,7)
|
||||||
|
endconst
|
||||||
|
|
||||||
|
const sub8x8_field, align=5
|
||||||
|
.byte T(0,0), T(0,1), T(0,2), T(1,0)
|
||||||
|
.byte T(1,1), T(0,3), T(0,4), T(1,2)
|
||||||
|
.byte T(2,0), T(1,3), T(0,5), T(0,6)
|
||||||
|
.byte T(0,7), T(1,4), T(2,1), T(3,0)
|
||||||
|
.byte T(2,2), T(1,5), T(1,6), T(1,7)
|
||||||
|
.byte T(2,3), T(3,1), T(4,0), T(3,2)
|
||||||
|
.byte T(2,4), T(2,5), T(2,6), T(2,7)
|
||||||
|
.byte T(3,3), T(4,1), T(5,0), T(4,2)
|
||||||
|
.byte T(3,4), T(3,5), T(3,6), T(3,7)
|
||||||
|
.byte T(4,3), T(5,1), T(6,0), T(5,2)
|
||||||
|
.byte T(4,4), T(4,5), T(4,6), T(4,7)
|
||||||
|
.byte T(5,3), T(6,1), T(6,2), T(5,4)
|
||||||
|
.byte T(5,5), T(5,6), T(5,7), T(6,3)
|
||||||
|
.byte T(7,0), T(7,1), T(6,4), T(6,5)
|
||||||
|
.byte T(6,6), T(6,7), T(7,2), T(7,3)
|
||||||
|
.byte T(7,4), T(7,5), T(7,6), T(7,7)
|
||||||
|
endconst
|
||||||
103
common/aarch64/dct.h
Normal file
103
common/aarch64/dct.h
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* dct.h: aarch64 transform and zigzag
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
* Janne Grunau <janne-x264@jannau.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_AARCH64_DCT_H
|
||||||
|
#define X264_AARCH64_DCT_H
|
||||||
|
|
||||||
|
#define x264_dct4x4dc_neon x264_template(dct4x4dc_neon)
|
||||||
|
void x264_dct4x4dc_neon( int16_t d[16] );
|
||||||
|
#define x264_idct4x4dc_neon x264_template(idct4x4dc_neon)
|
||||||
|
void x264_idct4x4dc_neon( int16_t d[16] );
|
||||||
|
|
||||||
|
#define x264_sub4x4_dct_neon x264_template(sub4x4_dct_neon)
|
||||||
|
void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
|
||||||
|
#define x264_sub8x8_dct_neon x264_template(sub8x8_dct_neon)
|
||||||
|
void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
|
||||||
|
#define x264_sub16x16_dct_neon x264_template(sub16x16_dct_neon)
|
||||||
|
void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
|
||||||
|
|
||||||
|
#define x264_add4x4_idct_neon x264_template(add4x4_idct_neon)
|
||||||
|
void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
|
||||||
|
#define x264_add8x8_idct_neon x264_template(add8x8_idct_neon)
|
||||||
|
void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
|
||||||
|
#define x264_add16x16_idct_neon x264_template(add16x16_idct_neon)
|
||||||
|
void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
|
||||||
|
|
||||||
|
#define x264_add8x8_idct_dc_neon x264_template(add8x8_idct_dc_neon)
|
||||||
|
void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
|
||||||
|
#define x264_add16x16_idct_dc_neon x264_template(add16x16_idct_dc_neon)
|
||||||
|
void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
|
||||||
|
#define x264_sub8x8_dct_dc_neon x264_template(sub8x8_dct_dc_neon)
|
||||||
|
void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
|
||||||
|
#define x264_sub8x16_dct_dc_neon x264_template(sub8x16_dct_dc_neon)
|
||||||
|
void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 );
|
||||||
|
|
||||||
|
#define x264_sub8x8_dct8_neon x264_template(sub8x8_dct8_neon)
|
||||||
|
void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
|
||||||
|
#define x264_sub16x16_dct8_neon x264_template(sub16x16_dct8_neon)
|
||||||
|
void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
|
||||||
|
|
||||||
|
#define x264_add8x8_idct8_neon x264_template(add8x8_idct8_neon)
|
||||||
|
void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
|
||||||
|
#define x264_add16x16_idct8_neon x264_template(add16x16_idct8_neon)
|
||||||
|
void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );
|
||||||
|
|
||||||
|
#define x264_zigzag_scan_4x4_frame_neon x264_template(zigzag_scan_4x4_frame_neon)
|
||||||
|
void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
|
||||||
|
#define x264_zigzag_scan_4x4_field_neon x264_template(zigzag_scan_4x4_field_neon)
|
||||||
|
void x264_zigzag_scan_4x4_field_neon( int16_t level[16], int16_t dct[16] );
|
||||||
|
#define x264_zigzag_scan_8x8_frame_neon x264_template(zigzag_scan_8x8_frame_neon)
|
||||||
|
void x264_zigzag_scan_8x8_frame_neon( int16_t level[64], int16_t dct[64] );
|
||||||
|
#define x264_zigzag_scan_8x8_field_neon x264_template(zigzag_scan_8x8_field_neon)
|
||||||
|
void x264_zigzag_scan_8x8_field_neon( int16_t level[64], int16_t dct[64] );
|
||||||
|
|
||||||
|
#define x264_zigzag_sub_4x4_field_neon x264_template(zigzag_sub_4x4_field_neon)
|
||||||
|
int x264_zigzag_sub_4x4_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
|
||||||
|
#define x264_zigzag_sub_4x4ac_field_neon x264_template(zigzag_sub_4x4ac_field_neon)
|
||||||
|
int x264_zigzag_sub_4x4ac_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
|
||||||
|
#define x264_zigzag_sub_4x4_frame_neon x264_template(zigzag_sub_4x4_frame_neon)
|
||||||
|
int x264_zigzag_sub_4x4_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
|
||||||
|
#define x264_zigzag_sub_4x4ac_frame_neon x264_template(zigzag_sub_4x4ac_frame_neon)
|
||||||
|
int x264_zigzag_sub_4x4ac_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
|
||||||
|
|
||||||
|
/* The 8x8 variants write 64 coefficients to level (the NEON implementation
 * issues eight 16-byte stores), so the documented bound is 64, not 16. */
#define x264_zigzag_sub_8x8_field_neon x264_template(zigzag_sub_8x8_field_neon)
|
||||||
|
int x264_zigzag_sub_8x8_field_neon( dctcoef level[64], const pixel *p_src, pixel *p_dst );
|
||||||
|
#define x264_zigzag_sub_8x8_frame_neon x264_template(zigzag_sub_8x8_frame_neon)
|
||||||
|
int x264_zigzag_sub_8x8_frame_neon( dctcoef level[64], const pixel *p_src, pixel *p_dst );
|
||||||
|
|
||||||
|
#define x264_zigzag_interleave_8x8_cavlc_neon x264_template(zigzag_interleave_8x8_cavlc_neon)
|
||||||
|
void x264_zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz );
|
||||||
|
|
||||||
|
#define x264_sub4x4_dct_sve x264_template(sub4x4_dct_sve)
|
||||||
|
void x264_sub4x4_dct_sve( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
|
||||||
|
|
||||||
|
#define x264_add4x4_idct_sve2 x264_template(add4x4_idct_sve2)
|
||||||
|
void x264_add4x4_idct_sve2( uint8_t *p_dst, int16_t dct[16] );
|
||||||
|
|
||||||
|
#define x264_zigzag_interleave_8x8_cavlc_sve x264_template(zigzag_interleave_8x8_cavlc_sve)
|
||||||
|
void x264_zigzag_interleave_8x8_cavlc_sve( dctcoef *dst, dctcoef *src, uint8_t *nnz );
|
||||||
|
|
||||||
|
#endif
|
||||||
43
common/aarch64/deblock-a-common.S
Normal file
43
common/aarch64/deblock-a-common.S
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* deblock-a-common.S: aarch64 deblocking
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Mans Rullgard <mans@mansr.com>
|
||||||
|
* Janne Grunau <janne-x264@jannau.net>
|
||||||
|
* David Chen <david.chen@myais.com.cn>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
// This file contains the NEON macros that are intended to be used by
|
||||||
|
// the SVE/SVE2 functions as well
|
||||||
|
|
||||||
|
// h264_loop_filter_start
// Common entry check for the loop filters. Inputs: w2 and w3 (used as alpha
// and beta by the filter macros) and x4, which points to a 32-bit word.
// Effects: loads that word into w6 and into v24.s[0], clobbers w8 and the
// flags, and executes `ret` from the enclosing function when
//   - w2 != 0 and w3 == 0, or
//   - all four bytes of the loaded word have their top bit set.
// Otherwise execution continues after the macro at label 2.
// NOTE(review): the word at [x4] is presumably four signed tc0 bytes -- confirm
// against the callers.
// NOTE(review): when w2 == 0 the ccmp writes NZCV = 0 (Z clear), so b.eq is
// not taken and that case does not return early. If an early return for
// alpha == 0 was intended, the immediate would have to be #4. Confirm.
.macro h264_loop_filter_start
|
||||||
|
cmp w2, #0
|
||||||
|
ldr w6, [x4]
|
||||||
|
ccmp w3, #0, #0, ne // w2 != 0: flags = compare(w3, 0); else NZCV = 0
|
||||||
|
mov v24.s[0], w6 // keep the loaded word for the filter macros
|
||||||
|
and w8, w6, w6, lsl #16 // plain and: flags from ccmp stay intact
|
||||||
|
b.eq 1f // taken only when w2 != 0 and w3 == 0
|
||||||
|
ands w8, w8, w8, lsl #8 // bit 31 = AND of the top bits of all four bytes
|
||||||
|
b.ge 2f // ands clears V, so this continues when bit 31 is clear
|
||||||
|
1:
|
||||||
|
ret
|
||||||
|
2:
|
||||||
|
.endm
|
||||||
98
common/aarch64/deblock-a-sve.S
Normal file
98
common/aarch64/deblock-a-sve.S
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* deblock-a-sve.S: aarch64 deblocking
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Chen <david.chen@myais.com.cn>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "asm.S"
|
||||||
|
#include "deblock-a-common.S"
|
||||||
|
|
||||||
|
ENABLE_SVE
|
||||||
|
|
||||||
|
.macro h264_loop_filter_chroma_sve
|
||||||
|
ptrue p0.b, vl16
|
||||||
|
|
||||||
|
dup v22.16b, w2 // alpha
|
||||||
|
uxtl v24.8h, v24.8b
|
||||||
|
uabd v26.16b, v16.16b, v0.16b // abs(p0 - q0)
|
||||||
|
uxtl v4.8h, v0.8b
|
||||||
|
uxtl2 v5.8h, v0.16b
|
||||||
|
uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
|
||||||
|
usubw v4.8h, v4.8h, v16.8b
|
||||||
|
usubw2 v5.8h, v5.8h, v16.16b
|
||||||
|
sli v24.8h, v24.8h, #8
|
||||||
|
shl v4.8h, v4.8h, #2
|
||||||
|
shl v5.8h, v5.8h, #2
|
||||||
|
uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
|
||||||
|
uxtl v24.4s, v24.4h
|
||||||
|
uaddw v4.8h, v4.8h, v18.8b
|
||||||
|
uaddw2 v5.8h, v5.8h, v18.16b
|
||||||
|
|
||||||
|
cmphi p1.b, p0/z, z22.b, z26.b
|
||||||
|
usubw v4.8h, v4.8h, v2.8b
|
||||||
|
usubw2 v5.8h, v5.8h, v2.16b
|
||||||
|
sli v24.4s, v24.4s, #16
|
||||||
|
dup v22.16b, w3 // beta
|
||||||
|
rshrn v4.8b, v4.8h, #3
|
||||||
|
rshrn2 v4.16b, v5.8h, #3
|
||||||
|
cmphi p2.b, p0/z, z22.b, z28.b
|
||||||
|
cmphi p3.b, p0/z, z22.b, z30.b
|
||||||
|
smin v4.16b, v4.16b, v24.16b
|
||||||
|
neg v25.16b, v24.16b
|
||||||
|
and p1.b, p0/z, p1.b, p2.b
|
||||||
|
smax v4.16b, v4.16b, v25.16b
|
||||||
|
and p1.b, p0/z, p1.b, p3.b
|
||||||
|
uxtl v22.8h, v0.8b
|
||||||
|
uxtl2 v23.8h, v0.16b
|
||||||
|
|
||||||
|
uxtl v28.8h, v16.8b
|
||||||
|
uxtl2 v29.8h, v16.16b
|
||||||
|
saddw v28.8h, v28.8h, v4.8b
|
||||||
|
saddw2 v29.8h, v29.8h, v4.16b
|
||||||
|
ssubw v22.8h, v22.8h, v4.8b
|
||||||
|
ssubw2 v23.8h, v23.8h, v4.16b
|
||||||
|
sqxtun v16.8b, v28.8h
|
||||||
|
sqxtun v0.8b, v22.8h
|
||||||
|
sqxtun2 v16.16b, v29.8h
|
||||||
|
sqxtun2 v0.16b, v23.8h
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// deblock_v_chroma_sve
// Filters the two rows on either side of a horizontal edge, 16 bytes wide.
// x0 = pointer to the first row below the edge (q0), x1 = row stride,
// w2 = alpha, w3 = beta, x4 = pointer to the word consumed by
// h264_loop_filter_start.
// Rows are loaded as p1 (v18), p0 (v16), q0 (v0), q1 (v2). Only p0 and q0 are
// written back, and only in the byte lanes selected by predicate p1, which
// h264_loop_filter_chroma_sve builds from the alpha/beta comparisons.
function deblock_v_chroma_sve, export=1
|
||||||
|
// may return early; see the macro for the conditions
h264_loop_filter_start
|
||||||
|
|
||||||
|
sub x0, x0, x1, lsl #1 // x0 -> p1 row (two rows above q0)
|
||||||
|
// No performance improvement if sve load is used. So, continue using
|
||||||
|
// NEON load here
|
||||||
|
ld1 {v18.16b}, [x0], x1 // p1
|
||||||
|
ld1 {v16.16b}, [x0], x1 // p0
|
||||||
|
ld1 {v0.16b}, [x0], x1 // q0
|
||||||
|
ld1 {v2.16b}, [x0] // q1; x0 stays on the q1 row
|
||||||
|
|
||||||
|
// updates v16 (p0) and v0 (q0), sets predicate p1
h264_loop_filter_chroma_sve
|
||||||
|
|
||||||
|
sub x0, x0, x1, lsl #1 // from the q1 row back to the p0 row
|
||||||
|
st1b {z16.b}, p1, [x0] // store filtered p0 in the selected lanes only
|
||||||
|
add x0, x0, x1 // q0 row
|
||||||
|
st1b {z0.b}, p1, [x0] // store filtered q0 in the selected lanes only
|
||||||
|
|
||||||
|
ret
|
||||||
|
endfunc
|
||||||
800
common/aarch64/deblock-a.S
Normal file
800
common/aarch64/deblock-a.S
Normal file
@@ -0,0 +1,800 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* deblock-a.S: aarch64 deblocking
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Mans Rullgard <mans@mansr.com>
|
||||||
|
* Janne Grunau <janne-x264@jannau.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "asm.S"
|
||||||
|
#include "deblock-a-common.S"
|
||||||
|
|
||||||
|
// Normal (tc0-based) H.264 luma edge filter on 16 pixels at once.
// Inputs : v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
//          w2 = alpha, w3 = beta
//          v24 = four tc0 bytes in its low 32 bits (presumably placed there by
//          h264_loop_filter_start, which is not part of this file - confirm)
// Outputs: v17 = p1', v16 = p0', v0 = q0', v19 = q1'
// Clobbers: v4, v20-v24, v28, v30
// The instruction order is hand-interleaved; keep it when editing.
.macro h264_loop_filter_luma
|
||||||
|
dup v22.16b, w2 // alpha
|
||||||
|
uxtl v24.8h, v24.8b // start replicating each tc0 byte over 4 lanes
|
||||||
|
uabd v21.16b, v16.16b, v0.16b // abs(p0 - q0)
|
||||||
|
uxtl v24.4s, v24.4h
|
||||||
|
uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
|
||||||
|
sli v24.8h, v24.8h, #8
|
||||||
|
uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
|
||||||
|
sli v24.4s, v24.4s, #16 // v24 = each tc0 byte repeated 4 times
|
||||||
|
cmhi v21.16b, v22.16b, v21.16b // < alpha
|
||||||
|
dup v22.16b, w3 // beta
|
||||||
|
cmlt v23.16b, v24.16b, #0 // lanes whose tc0 is negative
|
||||||
|
cmhi v28.16b, v22.16b, v28.16b // < beta
|
||||||
|
cmhi v30.16b, v22.16b, v30.16b // < beta
|
||||||
|
bic v21.16b, v21.16b, v23.16b // never filter lanes with tc0 < 0
|
||||||
|
uabd v17.16b, v20.16b, v16.16b // abs(p2 - p0)
|
||||||
|
and v21.16b, v21.16b, v28.16b
|
||||||
|
uabd v19.16b, v4.16b, v0.16b // abs(q2 - q0)
|
||||||
|
cmhi v17.16b, v22.16b, v17.16b // < beta
|
||||||
|
and v21.16b, v21.16b, v30.16b // v21 = per-lane "filter this pixel" mask
|
||||||
|
cmhi v19.16b, v22.16b, v19.16b // < beta
|
||||||
|
and v17.16b, v17.16b, v21.16b // v17 = mask: also modify p1
|
||||||
|
and v19.16b, v19.16b, v21.16b // v19 = mask: also modify q1
|
||||||
|
and v24.16b, v24.16b, v21.16b // tc0, zeroed in unfiltered lanes
|
||||||
|
urhadd v28.16b, v16.16b, v0.16b // (p0 + q0 + 1) >> 1
|
||||||
|
sub v21.16b, v24.16b, v17.16b // masks are 0 or -1, so this is tc0 + 1 where p1 is modified
|
||||||
|
uqadd v23.16b, v18.16b, v24.16b // p1 + tc0
|
||||||
|
uhadd v20.16b, v20.16b, v28.16b // (p2 + ((p0 + q0 + 1) >> 1)) >> 1
|
||||||
|
sub v21.16b, v21.16b, v19.16b // v21 = tc = tc0 + (p1 modified) + (q1 modified)
|
||||||
|
uhadd v28.16b, v4.16b, v28.16b // (q2 + ((p0 + q0 + 1) >> 1)) >> 1
|
||||||
|
umin v23.16b, v23.16b, v20.16b
|
||||||
|
uqsub v22.16b, v18.16b, v24.16b // p1 - tc0
|
||||||
|
uqadd v4.16b, v2.16b, v24.16b // q1 + tc0
|
||||||
|
umax v23.16b, v23.16b, v22.16b // v23 = p1 candidate clamped to p1 +/- tc0
|
||||||
|
uqsub v22.16b, v2.16b, v24.16b // q1 - tc0
|
||||||
|
umin v28.16b, v4.16b, v28.16b
|
||||||
|
uxtl v4.8h, v0.8b // begin delta = ((q0 - p0) * 4 + p1 - q1 + 4) >> 3
|
||||||
|
umax v28.16b, v28.16b, v22.16b // v28 = q1 candidate clamped to q1 +/- tc0
|
||||||
|
uxtl2 v20.8h, v0.16b
|
||||||
|
usubw v4.8h, v4.8h, v16.8b
|
||||||
|
usubw2 v20.8h, v20.8h, v16.16b
|
||||||
|
shl v4.8h, v4.8h, #2
|
||||||
|
shl v20.8h, v20.8h, #2
|
||||||
|
uaddw v4.8h, v4.8h, v18.8b
|
||||||
|
uaddw2 v20.8h, v20.8h, v18.16b
|
||||||
|
usubw v4.8h, v4.8h, v2.8b
|
||||||
|
usubw2 v20.8h, v20.8h, v2.16b
|
||||||
|
rshrn v4.8b, v4.8h, #3
|
||||||
|
rshrn2 v4.16b, v20.8h, #3 // v4 = delta (one signed byte per lane)
|
||||||
|
bsl v17.16b, v23.16b, v18.16b // v17 = p1' (candidate where mask set, else p1)
|
||||||
|
bsl v19.16b, v28.16b, v2.16b // v19 = q1' (candidate where mask set, else q1)
|
||||||
|
neg v23.16b, v21.16b // -tc
|
||||||
|
uxtl v28.8h, v16.8b
|
||||||
|
smin v4.16b, v4.16b, v21.16b
|
||||||
|
uxtl2 v21.8h, v16.16b
|
||||||
|
smax v4.16b, v4.16b, v23.16b // delta clamped to [-tc, tc]; 0 in unfiltered lanes
|
||||||
|
uxtl v22.8h, v0.8b
|
||||||
|
uxtl2 v24.8h, v0.16b
|
||||||
|
saddw v28.8h, v28.8h, v4.8b // p0 + delta
|
||||||
|
saddw2 v21.8h, v21.8h, v4.16b
|
||||||
|
ssubw v22.8h, v22.8h, v4.8b // q0 - delta
|
||||||
|
ssubw2 v24.8h, v24.8h, v4.16b
|
||||||
|
sqxtun v16.8b, v28.8h // saturate back to unsigned bytes
|
||||||
|
sqxtun2 v16.16b, v21.8h
|
||||||
|
sqxtun v0.8b, v22.8h
|
||||||
|
sqxtun2 v0.16b, v24.8h
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// Filter a horizontal luma edge, 16 pixels wide (normal, tc0-based filter).
//   x0 = pix (row q0), x1 = stride, w2 = alpha, w3 = beta
//   (tc0 is presumably handled by h264_loop_filter_start - defined elsewhere.)
// Rows p1, p0, q0 and q1 are rewritten.
function deblock_v_luma_neon, export=1
|
||||||
|
h264_loop_filter_start
|
||||||
|
|
||||||
|
ld1 {v0.16b}, [x0], x1 // q0
|
||||||
|
ld1 {v2.16b}, [x0], x1 // q1
|
||||||
|
ld1 {v4.16b}, [x0], x1 // q2
|
||||||
|
sub x0, x0, x1, lsl #2 // the two subtractions together step back
|
||||||
|
sub x0, x0, x1, lsl #1 // 6 rows: x0 = pix - 3*stride
|
||||||
|
ld1 {v20.16b}, [x0], x1 // p2
|
||||||
|
ld1 {v18.16b}, [x0], x1 // p1
|
||||||
|
ld1 {v16.16b}, [x0], x1 // p0 (x0 is back at pix afterwards)
|
||||||
|
|
||||||
|
h264_loop_filter_luma
|
||||||
|
|
||||||
|
sub x0, x0, x1, lsl #1 // x0 = pix - 2*stride (row p1)
|
||||||
|
st1 {v17.16b}, [x0], x1 // p1'
|
||||||
|
st1 {v16.16b}, [x0], x1 // p0'
|
||||||
|
st1 {v0.16b}, [x0], x1 // q0'
|
||||||
|
st1 {v19.16b}, [x0] // q1'
|
||||||
|
|
||||||
|
ret
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// Filter a vertical luma edge over 16 rows (normal, tc0-based filter).
//   x0 = pix (column q0 of the first row), x1 = stride, w2 = alpha, w3 = beta
// Eight bytes straddling the edge are gathered from each of the 16 rows and
// transposed so that the filter macro sees one pixel column per vector
// (v20/v18/v16 = p2/p1/p0, v0/v2/v4 = q0/q1/q2). The four modified columns are
// transposed back and written as one 32-bit word per row.
function deblock_h_luma_neon, export=1
    h264_loop_filter_start

    sub         x0,  x0,  #4                    // first byte read is 4 left of the edge

    // Rows 0-7 land in the low 64 bits of each register ...
.irp row, v6, v20, v18, v16, v0, v2, v4, v26
    ld1         {\row\().8b},   [x0], x1
.endr
    // ... and rows 8-15 in the high 64 bits, same register order.
.irp row, v6, v20, v18, v16, v0, v2, v4, v26
    ld1         {\row\().d}[1], [x0], x1
.endr

    transpose_8x16.b v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

    h264_loop_filter_luma

    transpose_4x16.b v17, v16, v0, v19, v21, v23, v25, v27

    sub         x0,  x0,  x1, lsl #4            // rewind the 16 rows consumed by the loads
    add         x0,  x0,  #2                    // written span starts 2 bytes left of the edge

    // One word per row; the register rotation v17, v16, v0, v19 repeats for
    // every lane index.
.irp lane, 0, 1, 2, 3
    st1         {v17.s}[\lane], [x0], x1
    st1         {v16.s}[\lane], [x0], x1
    st1         {v0.s}[\lane],  [x0], x1
    st1         {v19.s}[\lane], [x0], x1
.endr

    ret
endfunc
|
||||||
|
|
||||||
|
// Common prologue of the intra (strong) deblocking functions.
//   In : w2 = alpha, w3 = beta
//   Out: v30 = alpha broadcast to 16 bytes, v31 = beta broadcast to 16 bytes
//   Clobbers: w4
// Returns from the *enclosing function* straight away when alpha and beta are
// both zero. cbnz replaces the former cmp/b.ne pair: one instruction fewer and
// the condition flags are left untouched (no user of this macro reads them).
.macro h264_loop_filter_start_intra
    orr         w4,  w2,  w3                    // zero only if alpha == 0 and beta == 0
    cbnz        w4,  1f
    ret                                         // nothing to filter
1:
    dup         v30.16b, w2                     // alpha
    dup         v31.16b, w3                     // beta
.endm
|
||||||
|
|
||||||
|
// Intra (strong) H.264 luma edge filter on 16 pixels at once.
// Inputs : v4..v7 = p3, p2, p1, p0 and v0..v3 = q0, q1, q2, q3
//          v30 = alpha, v31 = beta (as set up by h264_loop_filter_start_intra)
// Outputs: v5, v6, v7 = p2', p1', p0' and v0, v1, v2 = q0', q1', q2'
// Clobbers: x4, v16-v31. v30/v31 no longer hold alpha/beta afterwards, so the
//          macro cannot be expanded twice without reloading them.
// Branches to local label 9 (to be provided after the expansion by the
// enclosing function) when no lane passes the alpha/beta tests.
.macro h264_loop_filter_luma_intra
|
||||||
|
uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0)
|
||||||
|
uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0)
|
||||||
|
uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0)
|
||||||
|
cmhi v19.16b, v30.16b, v16.16b // < alpha
|
||||||
|
cmhi v17.16b, v31.16b, v17.16b // < beta
|
||||||
|
cmhi v18.16b, v31.16b, v18.16b // < beta
|
||||||
|
|
||||||
|
movi v29.16b, #2
|
||||||
|
ushr v30.16b, v30.16b, #2 // alpha >> 2
|
||||||
|
add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2
|
||||||
|
cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2
|
||||||
|
|
||||||
|
and v19.16b, v19.16b, v17.16b
|
||||||
|
and v19.16b, v19.16b, v18.16b // v19 = if_1: lane passes the alpha and both beta tests
|
||||||
|
// Narrow the 128-bit mask to 64 bits (4 bits kept per byte lane) so that one
|
||||||
|
// scalar compare can tell whether any lane needs filtering.
|
||||||
shrn v20.8b, v19.8h, #4
|
||||||
|
mov x4, v20.d[0]
|
||||||
|
cbz x4, 9f // no lane to filter: skip to label 9
|
||||||
|
|
||||||
|
// v20/v21 = 2*p1 + p0 + q1 and v22/v23 = 2*q1 + q0 + p1 (16-bit, low/high half)
|
||||||
ushll v20.8h, v6.8b, #1
|
||||||
|
ushll v22.8h, v1.8b, #1
|
||||||
|
ushll2 v21.8h, v6.16b, #1
|
||||||
|
ushll2 v23.8h, v1.16b, #1
|
||||||
|
uaddw v20.8h, v20.8h, v7.8b
|
||||||
|
uaddw v22.8h, v22.8h, v0.8b
|
||||||
|
uaddw2 v21.8h, v21.8h, v7.16b
|
||||||
|
uaddw2 v23.8h, v23.8h, v0.16b
|
||||||
|
uaddw v20.8h, v20.8h, v1.8b
|
||||||
|
uaddw v22.8h, v22.8h, v6.8b
|
||||||
|
uaddw2 v21.8h, v21.8h, v1.16b
|
||||||
|
uaddw2 v23.8h, v23.8h, v6.16b
|
||||||
|
|
||||||
|
rshrn v24.8b, v20.8h, #2 // p0'_1
|
||||||
|
rshrn v25.8b, v22.8h, #2 // q0'_1
|
||||||
|
rshrn2 v24.16b, v21.8h, #2 // p0'_1
|
||||||
|
rshrn2 v25.16b, v23.8h, #2 // q0'_1
|
||||||
|
|
||||||
|
uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0)
|
||||||
|
uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0)
|
||||||
|
cmhi v17.16b, v31.16b, v17.16b // < beta
|
||||||
|
cmhi v18.16b, v31.16b, v18.16b // < beta
|
||||||
|
|
||||||
|
and v17.16b, v16.16b, v17.16b // if_2 && if_3
|
||||||
|
and v18.16b, v16.16b, v18.16b // if_2 && if_4
|
||||||
|
|
||||||
|
not v30.16b, v17.16b // alpha/beta registers are reused as masks from here on
|
||||||
|
not v31.16b, v18.16b
|
||||||
|
|
||||||
|
and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3)
|
||||||
|
and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4)
|
||||||
|
|
||||||
|
and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3
|
||||||
|
and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4
|
||||||
|
|
||||||
|
//calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4
|
||||||
|
uaddl v26.8h, v5.8b, v7.8b
|
||||||
|
uaddl2 v27.8h, v5.16b, v7.16b
|
||||||
|
uaddw v26.8h, v26.8h, v0.8b // p2 + p0 + q0
|
||||||
|
uaddw2 v27.8h, v27.8h, v0.16b
|
||||||
|
add v20.8h, v20.8h, v26.8h
|
||||||
|
add v21.8h, v21.8h, v27.8h
|
||||||
|
uaddw v20.8h, v20.8h, v0.8b // p2 + 2*p1 + 2*p0 + 2*q0 + q1
|
||||||
|
uaddw2 v21.8h, v21.8h, v0.16b
|
||||||
|
rshrn v20.8b, v20.8h, #3 // p0'_2
|
||||||
|
rshrn2 v20.16b, v21.8h, #3 // p0'_2
|
||||||
|
uaddw v26.8h, v26.8h, v6.8b // p2 + p1 + p0 + q0
|
||||||
|
uaddw2 v27.8h, v27.8h, v6.16b
|
||||||
|
rshrn v21.8b, v26.8h, #2 // p1'_2
|
||||||
|
rshrn2 v21.16b, v27.8h, #2 // p1'_2
|
||||||
|
uaddl v28.8h, v4.8b, v5.8b
|
||||||
|
uaddl2 v29.8h, v4.16b, v5.16b
|
||||||
|
shl v28.8h, v28.8h, #1 // 2 * (p3 + p2)
|
||||||
|
shl v29.8h, v29.8h, #1
|
||||||
|
add v28.8h, v28.8h, v26.8h // 2*p3 + 3*p2 + p1 + p0 + q0
|
||||||
|
add v29.8h, v29.8h, v27.8h
|
||||||
|
rshrn v19.8b, v28.8h, #3 // p2'_2
|
||||||
|
rshrn2 v19.16b, v29.8h, #3 // p2'_2
|
||||||
|
|
||||||
|
//calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3
|
||||||
|
uaddl v26.8h, v2.8b, v0.8b
|
||||||
|
uaddl2 v27.8h, v2.16b, v0.16b
|
||||||
|
uaddw v26.8h, v26.8h, v7.8b // q2 + q0 + p0
|
||||||
|
uaddw2 v27.8h, v27.8h, v7.16b
|
||||||
|
add v22.8h, v22.8h, v26.8h
|
||||||
|
add v23.8h, v23.8h, v27.8h
|
||||||
|
uaddw v22.8h, v22.8h, v7.8b // q2 + 2*q1 + 2*q0 + 2*p0 + p1
|
||||||
|
uaddw2 v23.8h, v23.8h, v7.16b
|
||||||
|
rshrn v22.8b, v22.8h, #3 // q0'_2
|
||||||
|
rshrn2 v22.16b, v23.8h, #3 // q0'_2
|
||||||
|
uaddw v26.8h, v26.8h, v1.8b // q2 + q1 + q0 + p0
|
||||||
|
uaddw2 v27.8h, v27.8h, v1.16b
|
||||||
|
rshrn v23.8b, v26.8h, #2 // q1'_2
|
||||||
|
rshrn2 v23.16b, v27.8h, #2 // q1'_2
|
||||||
|
uaddl v28.8h, v2.8b, v3.8b
|
||||||
|
uaddl2 v29.8h, v2.16b, v3.16b
|
||||||
|
shl v28.8h, v28.8h, #1 // 2 * (q3 + q2)
|
||||||
|
shl v29.8h, v29.8h, #1
|
||||||
|
add v28.8h, v28.8h, v26.8h // 2*q3 + 3*q2 + q1 + q0 + p0
|
||||||
|
add v29.8h, v29.8h, v27.8h
|
||||||
|
rshrn v26.8b, v28.8h, #3 // q2'_2
|
||||||
|
rshrn2 v26.16b, v29.8h, #3 // q2'_2
|
||||||
|
|
||||||
|
// Merge: the weak (_1) results go into lanes selected by v30/v31, the strong
|
||||||
// (_2) results into lanes selected by v17/v18; all other lanes are unchanged.
|
||||||
bit v7.16b, v24.16b, v30.16b // p0'_1
|
||||||
|
bit v0.16b, v25.16b, v31.16b // q0'_1
|
||||||
|
bit v7.16b, v20.16b, v17.16b // p0'_2
|
||||||
|
bit v6.16b, v21.16b, v17.16b // p1'_2
|
||||||
|
bit v5.16b, v19.16b, v17.16b // p2'_2
|
||||||
|
bit v0.16b, v22.16b, v18.16b // q0'_2
|
||||||
|
bit v1.16b, v23.16b, v18.16b // q1'_2
|
||||||
|
bit v2.16b, v26.16b, v18.16b // q2'_2
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// Filter a horizontal luma edge, 16 pixels wide, with the intra (strong) filter.
//   x0 = pix (row q0), x1 = stride, w2 = alpha, w3 = beta
// Rows p2..q2 are rewritten unless the filter macro bails out to label 9.
function deblock_v_luma_intra_neon, export=1
|
||||||
|
h264_loop_filter_start_intra
|
||||||
|
|
||||||
|
ld1 {v0.16b}, [x0], x1 // q0
|
||||||
|
ld1 {v1.16b}, [x0], x1 // q1
|
||||||
|
ld1 {v2.16b}, [x0], x1 // q2
|
||||||
|
ld1 {v3.16b}, [x0], x1 // q3
|
||||||
|
sub x0, x0, x1, lsl #3 // back 8 rows: x0 = pix - 4*stride
|
||||||
|
ld1 {v4.16b}, [x0], x1 // p3
|
||||||
|
ld1 {v5.16b}, [x0], x1 // p2
|
||||||
|
ld1 {v6.16b}, [x0], x1 // p1
|
||||||
|
ld1 {v7.16b}, [x0] // p0
|
||||||
|
|
||||||
|
h264_loop_filter_luma_intra
|
||||||
|
|
||||||
|
sub x0, x0, x1, lsl #1 // from row p0 up to row p2
|
||||||
|
st1 {v5.16b}, [x0], x1 // p2
|
||||||
|
st1 {v6.16b}, [x0], x1 // p1
|
||||||
|
st1 {v7.16b}, [x0], x1 // p0
|
||||||
|
st1 {v0.16b}, [x0], x1 // q0
|
||||||
|
st1 {v1.16b}, [x0], x1 // q1
|
||||||
|
st1 {v2.16b}, [x0] // q2
|
||||||
|
// Early-out target of h264_loop_filter_luma_intra (nothing to filter).
|
||||||
9:
|
||||||
|
ret
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// Filter a vertical luma edge over 16 rows with the intra (strong) filter.
//   x0 = pix (column q0 of the first row), x1 = stride, w2 = alpha, w3 = beta
// Eight bytes (p3..q3) are read from every row, transposed into one column per
// register (v4..v7 = p3..p0, v0..v3 = q0..q3), filtered, transposed back and
// stored over the same 8x16 area.
function deblock_h_luma_intra_neon, export=1
    h264_loop_filter_start_intra

    sub         x0,  x0,  #4                    // column p3

    // Rows 0-7 -> low halves, rows 8-15 -> high halves.
.irp row, v4, v5, v6, v7, v0, v1, v2, v3
    ld1         {\row\().8b},   [x0], x1
.endr
.irp row, v4, v5, v6, v7, v0, v1, v2, v3
    ld1         {\row\().d}[1], [x0], x1
.endr

    transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

    h264_loop_filter_luma_intra

    transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

    sub         x0,  x0,  x1, lsl #4            // rewind to the first row

    // Write-back mirrors the load pattern exactly.
.irp row, v4, v5, v6, v7, v0, v1, v2, v3
    st1         {\row\().8b},   [x0], x1
.endr
.irp row, v4, v5, v6, v7, v0, v1, v2, v3
    st1         {\row\().d}[1], [x0], x1
.endr
9:                                              // early-out target of the filter macro
    ret
endfunc
|
||||||
|
|
||||||
|
// Normal (tc-based) H.264 chroma edge filter on 16 bytes at once.
// Inputs : v18 = p1, v16 = p0, v0 = q0, v2 = q1, w2 = alpha, w3 = beta
//          v24 = four tc bytes in its low 32 bits (presumably placed there by
//          h264_loop_filter_start, defined elsewhere - confirm)
// Outputs: v16 = p0', v0 = q0'
// Clobbers: v4, v5, v22-v26, v28-v30
// The instruction order is hand-interleaved; keep it when editing.
.macro h264_loop_filter_chroma
|
||||||
|
dup v22.16b, w2 // alpha
|
||||||
|
uxtl v24.8h, v24.8b // start replicating each tc byte over 4 lanes
|
||||||
|
uabd v26.16b, v16.16b, v0.16b // abs(p0 - q0)
|
||||||
|
uxtl v4.8h, v0.8b // begin delta = ((q0 - p0) * 4 + p1 - q1 + 4) >> 3
|
||||||
|
uxtl2 v5.8h, v0.16b
|
||||||
|
uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
|
||||||
|
usubw v4.8h, v4.8h, v16.8b
|
||||||
|
usubw2 v5.8h, v5.8h, v16.16b
|
||||||
|
sli v24.8h, v24.8h, #8
|
||||||
|
shl v4.8h, v4.8h, #2
|
||||||
|
shl v5.8h, v5.8h, #2
|
||||||
|
uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
|
||||||
|
uxtl v24.4s, v24.4h
|
||||||
|
uaddw v4.8h, v4.8h, v18.8b
|
||||||
|
uaddw2 v5.8h, v5.8h, v18.16b
|
||||||
|
cmhi v26.16b, v22.16b, v26.16b // < alpha
|
||||||
|
usubw v4.8h, v4.8h, v2.8b
|
||||||
|
usubw2 v5.8h, v5.8h, v2.16b
|
||||||
|
sli v24.4s, v24.4s, #16 // v24 = each tc byte repeated 4 times
|
||||||
|
dup v22.16b, w3 // beta
|
||||||
|
rshrn v4.8b, v4.8h, #3
|
||||||
|
rshrn2 v4.16b, v5.8h, #3 // v4 = delta (one signed byte per lane)
|
||||||
|
cmhi v28.16b, v22.16b, v28.16b // < beta
|
||||||
|
cmhi v30.16b, v22.16b, v30.16b // < beta
|
||||||
|
smin v4.16b, v4.16b, v24.16b
|
||||||
|
neg v25.16b, v24.16b // -tc
|
||||||
|
and v26.16b, v26.16b, v28.16b
|
||||||
|
smax v4.16b, v4.16b, v25.16b // delta clamped to [-tc, tc]
|
||||||
|
and v26.16b, v26.16b, v30.16b // v26 = per-lane "filter this pixel" mask
|
||||||
|
uxtl v22.8h, v0.8b
|
||||||
|
uxtl2 v23.8h, v0.16b
|
||||||
|
and v4.16b, v4.16b, v26.16b // delta forced to 0 in unfiltered lanes
|
||||||
|
uxtl v28.8h, v16.8b
|
||||||
|
uxtl2 v29.8h, v16.16b
|
||||||
|
saddw v28.8h, v28.8h, v4.8b // p0 + delta
|
||||||
|
saddw2 v29.8h, v29.8h, v4.16b
|
||||||
|
ssubw v22.8h, v22.8h, v4.8b // q0 - delta
|
||||||
|
ssubw2 v23.8h, v23.8h, v4.16b
|
||||||
|
sqxtun v16.8b, v28.8h // saturate back to unsigned bytes
|
||||||
|
sqxtun v0.8b, v22.8h
|
||||||
|
sqxtun2 v16.16b, v29.8h
|
||||||
|
sqxtun2 v0.16b, v23.8h
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// Filter a horizontal chroma edge, 16 bytes wide (normal, tc0-based filter).
//   x0 = pix (row q0), x1 = stride, w2 = alpha, w3 = beta
//   (tc0 is presumably handled by h264_loop_filter_start - defined elsewhere.)
// Only rows p0 and q0 are rewritten.
function deblock_v_chroma_neon, export=1
|
||||||
|
h264_loop_filter_start
|
||||||
|
|
||||||
|
sub x0, x0, x1, lsl #1 // x0 = pix - 2*stride (row p1)
|
||||||
|
ld1 {v18.16b}, [x0], x1 // p1
|
||||||
|
ld1 {v16.16b}, [x0], x1 // p0
|
||||||
|
ld1 {v0.16b}, [x0], x1 // q0
|
||||||
|
ld1 {v2.16b}, [x0] // q1 (x0 left pointing at row q1)
|
||||||
|
|
||||||
|
h264_loop_filter_chroma
|
||||||
|
|
||||||
|
sub x0, x0, x1, lsl #1 // back to row p0
|
||||||
|
st1 {v16.16b}, [x0], x1 // p0'
|
||||||
|
st1 {v0.16b}, [x0], x1 // q0'
|
||||||
|
|
||||||
|
ret
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// Filter a vertical chroma edge over 8 rows (normal, tc0-based filter).
//   x0 = pix, x1 = stride, w2 = alpha, w3 = beta
// Eight bytes straddling the edge are read from each row, transposed as
// 16-bit units into v18/v16/v0/v2 (the p1/p0/q0/q1 inputs of the filter
// macro), filtered, transposed back and stored over the same area.
//
// The label deblock_h_chroma is a secondary entry point: deblock_h_chroma_422_neon
// enters here with x0 already moved 4 bytes left of the edge and the filter
// state already set up.
function deblock_h_chroma_neon, export=1
    h264_loop_filter_start

    sub         x0,  x0,  #4                    // 4 bytes left of the edge
deblock_h_chroma:
    // Rows 0-3 -> low halves, rows 4-7 -> high halves.
.irp row, v18, v16, v0, v2
    ld1         {\row\().d}[0], [x0], x1
.endr
.irp row, v18, v16, v0, v2
    ld1         {\row\().d}[1], [x0], x1
.endr

    transpose4x8.h  v18, v16, v0, v2, v28, v29, v30, v31

    h264_loop_filter_chroma

    transpose4x8.h  v18, v16, v0, v2, v28, v29, v30, v31

    sub         x0,  x0,  x1, lsl #3            // rewind the 8 rows just read

    // Write-back mirrors the load pattern exactly.
.irp row, v18, v16, v0, v2
    st1         {\row\().d}[0], [x0], x1
.endr
.irp row, v18, v16, v0, v2
    st1         {\row\().d}[1], [x0], x1
.endr

    ret
endfunc
|
||||||
|
|
||||||
|
// Filter a vertical chroma edge over 16 rows (4:2:2) by running the 8-row body
// of deblock_h_chroma_neon twice with a doubled stride: first on the rows
// starting at pix, then on the rows starting at pix + stride.
//   x0 = pix, x1 = stride, w2 = alpha, w3 = beta
// Clobbers x5 (second start address) and x7 (saved return address).
function deblock_h_chroma_422_neon, export=1
|
||||||
|
add x5, x0, x1 // x5 = pix + stride, start of the second pass
|
||||||
|
sub x0, x0, #4 // deblock_h_chroma expects x0 4 bytes left of the edge
|
||||||
|
add x1, x1, x1 // stride *= 2: each pass touches every other row
|
||||||
|
h264_loop_filter_start
|
||||||
|
mov x7, x30 // bl overwrites x30, keep our return address
|
||||||
|
bl deblock_h_chroma
|
||||||
|
mov x30, x7
|
||||||
|
sub x0, x5, #4
|
||||||
|
// h264_loop_filter_chroma modified v24 during the first pass; reload it from
|
||||||
// w6 (presumably still holding the packed tc0 bytes read by
|
||||||
// h264_loop_filter_start - confirm in deblock-a-common.S).
|
||||||
mov v24.s[0], w6
|
||||||
|
b deblock_h_chroma // tail call: its ret returns to our caller
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// 8-byte variant of the normal (tc-based) chroma edge filter.
// Inputs : v18 = p1, v16 = p0, v17 = q0, v19 = q1 (low 64 bits each)
//          w2 = alpha, w3 = beta
//          v24 = tc bytes in its low bytes (presumably placed there by
//          h264_loop_filter_start, defined elsewhere - confirm)
// Outputs: v16 = p0', v17 = q0'
// Clobbers: v4, v22, v24-v26, v28, v30
.macro h264_loop_filter_chroma8
|
||||||
|
dup v22.8b, w2 // alpha
|
||||||
|
uxtl v24.8h, v24.8b
|
||||||
|
uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0)
|
||||||
|
uxtl v4.8h, v17.8b // begin delta = ((q0 - p0) * 4 + p1 - q1 + 4) >> 3
|
||||||
|
uabd v28.8b, v18.8b, v16.8b // abs(p1 - p0)
|
||||||
|
usubw v4.8h, v4.8h, v16.8b
|
||||||
|
sli v24.8h, v24.8h, #8 // v24 = each tc byte repeated twice
|
||||||
|
shl v4.8h, v4.8h, #2
|
||||||
|
uabd v30.8b, v19.8b, v17.8b // abs(q1 - q0)
|
||||||
|
uaddw v4.8h, v4.8h, v18.8b
|
||||||
|
cmhi v26.8b, v22.8b, v26.8b // < alpha
|
||||||
|
usubw v4.8h, v4.8h, v19.8b
|
||||||
|
dup v22.8b, w3 // beta
|
||||||
|
rshrn v4.8b, v4.8h, #3 // v4 = delta (one signed byte per lane)
|
||||||
|
cmhi v28.8b, v22.8b, v28.8b // < beta
|
||||||
|
cmhi v30.8b, v22.8b, v30.8b // < beta
|
||||||
|
smin v4.8b, v4.8b, v24.8b
|
||||||
|
neg v25.8b, v24.8b // -tc
|
||||||
|
and v26.8b, v26.8b, v28.8b
|
||||||
|
smax v4.8b, v4.8b, v25.8b // delta clamped to [-tc, tc]
|
||||||
|
and v26.8b, v26.8b, v30.8b // v26 = per-lane "filter this pixel" mask
|
||||||
|
uxtl v22.8h, v17.8b
|
||||||
|
and v4.8b, v4.8b, v26.8b // delta forced to 0 in unfiltered lanes
|
||||||
|
uxtl v28.8h, v16.8b
|
||||||
|
saddw v28.8h, v28.8h, v4.8b // p0 + delta
|
||||||
|
ssubw v22.8h, v22.8h, v4.8b // q0 - delta
|
||||||
|
sqxtun v16.8b, v28.8h // saturate back to unsigned bytes
|
||||||
|
sqxtun v17.8b, v22.8h
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// Filter a vertical chroma edge over 4 rows (MBAFF variant, normal filter).
//   x0 = pix, x1 = stride, w2 = alpha, w3 = beta
// x4 is reused as the read cursor once h264_loop_filter_start has run; x0
// becomes the write cursor. Each row contributes 8 bytes around the edge;
// after filtering only the middle 4 bytes (p0 and q0 as 16-bit units) are
// written back, interleaved by st2.
function deblock_h_chroma_mbaff_neon, export=1
    h264_loop_filter_start

    sub         x4,  x0,  #4                    // loads start 4 bytes left of the edge
    sub         x0,  x0,  #2                    // stores start 2 bytes left of the edge

.irp row, v18, v16, v17
    ld1         {\row\().8b}, [x4], x1
.endr
    ld1         {v19.8b}, [x4]                  // last row: no post-increment

    transpose4x4.h  v18, v16, v17, v19, v28, v29, v30, v31

    h264_loop_filter_chroma8

.irp lane, 0, 1, 2
    st2         {v16.h,v17.h}[\lane], [x0], x1
.endr
    st2         {v16.h,v17.h}[3], [x0]          // last row: no post-increment

    ret
endfunc
|
||||||
|
|
||||||
|
// Intra H.264 chroma edge filter.
// Inputs : v18 = p1, v16 = p0, v17 = q0, v19 = q1
//          v30 = alpha, v31 = beta (as set up by h264_loop_filter_start_intra)
// Outputs: v16 = p0', v17 = q0' in lanes passing the alpha/beta tests; other
//          lanes keep their value.
// Clobbers: v4-v7, v20-v28 (v5, v7, v21, v23 only when width is 16)
// width=16 (default) processes all 16 bytes. Any other value skips the
// upper-half arithmetic, so only the low 8 bytes of the result are meaningful.
.macro h264_loop_filter_chroma_intra width=16
|
||||||
|
uabd v26.16b, v16.16b, v17.16b // abs(p0 - q0)
|
||||||
|
uabd v27.16b, v18.16b, v16.16b // abs(p1 - p0)
|
||||||
|
uabd v28.16b, v19.16b, v17.16b // abs(q1 - q0)
|
||||||
|
cmhi v26.16b, v30.16b, v26.16b // < alpha
|
||||||
|
cmhi v27.16b, v31.16b, v27.16b // < beta
|
||||||
|
cmhi v28.16b, v31.16b, v28.16b // < beta
|
||||||
|
and v26.16b, v26.16b, v27.16b
|
||||||
|
and v26.16b, v26.16b, v28.16b // v26 = per-lane "filter this pixel" mask
|
||||||
|
|
||||||
|
ushll v4.8h, v18.8b, #1 // 2 * p1
|
||||||
|
ushll v6.8h, v19.8b, #1 // 2 * q1
|
||||||
|
.ifc \width, 16
|
||||||
|
ushll2 v5.8h, v18.16b, #1
|
||||||
|
ushll2 v7.8h, v19.16b, #1
|
||||||
|
uaddl2 v21.8h, v16.16b, v19.16b
|
||||||
|
uaddl2 v23.8h, v17.16b, v18.16b
|
||||||
|
.endif
|
||||||
|
uaddl v20.8h, v16.8b, v19.8b // p0 + q1
|
||||||
|
uaddl v22.8h, v17.8b, v18.8b // q0 + p1
|
||||||
|
add v20.8h, v20.8h, v4.8h // mlal?
|
||||||
|
add v22.8h, v22.8h, v6.8h
|
||||||
|
.ifc \width, 16
|
||||||
|
add v21.8h, v21.8h, v5.8h
|
||||||
|
add v23.8h, v23.8h, v7.8h
|
||||||
|
.endif
|
||||||
|
uqrshrn v24.8b, v20.8h, #2 // p0' = (2*p1 + p0 + q1 + 2) >> 2
|
||||||
|
uqrshrn v25.8b, v22.8h, #2 // q0' = (2*q1 + q0 + p1 + 2) >> 2
|
||||||
|
.ifc \width, 16
|
||||||
|
uqrshrn2 v24.16b, v21.8h, #2
|
||||||
|
uqrshrn2 v25.16b, v23.8h, #2
|
||||||
|
.endif
|
||||||
|
bit v16.16b, v24.16b, v26.16b // insert results only where the mask is set
|
||||||
|
bit v17.16b, v25.16b, v26.16b
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// Filter a horizontal chroma edge, 16 bytes wide, with the intra filter.
//   x0 = pix (row q0), x1 = stride, w2 = alpha, w3 = beta
// Only rows p0 and q0 are rewritten.
function deblock_v_chroma_intra_neon, export=1
|
||||||
|
h264_loop_filter_start_intra
|
||||||
|
|
||||||
|
sub x0, x0, x1, lsl #1 // x0 = pix - 2*stride (row p1)
|
||||||
|
ld1 {v18.16b}, [x0], x1 // p1
|
||||||
|
ld1 {v16.16b}, [x0], x1 // p0
|
||||||
|
ld1 {v17.16b}, [x0], x1 // q0
|
||||||
|
ld1 {v19.16b}, [x0] // q1 (x0 left pointing at row q1)
|
||||||
|
|
||||||
|
h264_loop_filter_chroma_intra
|
||||||
|
|
||||||
|
sub x0, x0, x1, lsl #1 // back to row p0
|
||||||
|
st1 {v16.16b}, [x0], x1 // p0'
|
||||||
|
st1 {v17.16b}, [x0], x1 // q0'
|
||||||
|
|
||||||
|
ret
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// Filter a vertical chroma edge over 4 rows with the intra filter (MBAFF
// variant).
//   x0 = pix, x1 = stride, w2 = alpha, w3 = beta
// x4 walks the rows for loading (8 bytes each, starting 4 bytes left of the
// edge); x0 walks them for storing the 4 bytes in the middle (p0 and q0 as
// 16-bit units, interleaved by st2).
function deblock_h_chroma_intra_mbaff_neon, export=1
    h264_loop_filter_start_intra

    sub         x4,  x0,  #4                    // read cursor
    sub         x0,  x0,  #2                    // write cursor

.irp row, v18, v16, v17, v19
    ld1         {\row\().8b}, [x4], x1
.endr

    transpose4x4.h  v18, v16, v17, v19, v26, v27, v28, v29

    // Only the low 8 bytes are populated, so skip the upper-half arithmetic.
    h264_loop_filter_chroma_intra width=8

.irp lane, 0, 1, 2, 3
    st2         {v16.h,v17.h}[\lane], [x0], x1
.endr

    ret
endfunc
|
||||||
|
|
||||||
|
// Filter a vertical chroma edge over 8 rows with the intra filter.
//   x0 = pix, x1 = stride, w2 = alpha, w3 = beta
// x4 walks the rows for loading (8 bytes each, starting 4 bytes left of the
// edge); x0 walks them for storing the 4 bytes in the middle (p0 and q0 as
// 16-bit units, interleaved by st2).
function deblock_h_chroma_intra_neon, export=1
    h264_loop_filter_start_intra

    sub         x4,  x0,  #4                    // read cursor
    sub         x0,  x0,  #2                    // write cursor

    // Rows 0-3 -> low halves, rows 4-7 -> high halves.
.irp row, v18, v16, v17, v19
    ld1         {\row\().d}[0], [x4], x1
.endr
.irp row, v18, v16, v17, v19
    ld1         {\row\().d}[1], [x4], x1
.endr

    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29

    h264_loop_filter_chroma_intra

    // One {p0, q0} halfword pair per row.
.irp lane, 0, 1, 2, 3, 4, 5, 6, 7
    st2         {v16.h,v17.h}[\lane], [x0], x1
.endr

    ret
endfunc
|
||||||
|
|
||||||
|
// Filter a vertical chroma edge over 16 rows (4:2:2) with the intra filter.
//   x0 = pix, x1 = stride, w2 = alpha, w3 = beta
// The work is done as two consecutive 8-row passes. Both cursors (x4 for
// loads, x0 for stores) post-increment on every row, so the second pass simply
// continues where the first one stopped; the filter macro leaves alpha/beta in
// v30/v31 intact for it.
function deblock_h_chroma_422_intra_neon, export=1
    h264_loop_filter_start_intra

    sub         x4,  x0,  #4                    // read cursor: 4 bytes left of the edge
    sub         x0,  x0,  #2                    // write cursor: 2 bytes left of the edge

.rept 2
    // Gather 8 rows: rows 0-3 into the low halves, rows 4-7 into the high halves.
    ld1         {v18.d}[0], [x4], x1
    ld1         {v16.d}[0], [x4], x1
    ld1         {v17.d}[0], [x4], x1
    ld1         {v19.d}[0], [x4], x1
    ld1         {v18.d}[1], [x4], x1
    ld1         {v16.d}[1], [x4], x1
    ld1         {v17.d}[1], [x4], x1
    ld1         {v19.d}[1], [x4], x1

    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29

    h264_loop_filter_chroma_intra

    // Scatter one {p0, q0} halfword pair per row.
    st2         {v16.h,v17.h}[0], [x0], x1
    st2         {v16.h,v17.h}[1], [x0], x1
    st2         {v16.h,v17.h}[2], [x0], x1
    st2         {v16.h,v17.h}[3], [x0], x1
    st2         {v16.h,v17.h}[4], [x0], x1
    st2         {v16.h,v17.h}[5], [x0], x1
    st2         {v16.h,v17.h}[6], [x0], x1
    st2         {v16.h,v17.h}[7], [x0], x1
.endr

    ret
endfunc
|
||||||
|
|
||||||
|
// Compute the deblocking boundary strengths of one macroblock.
// Each output byte is 2 if either neighbouring block has non-zero
// coefficients, else 1 if their reference indices differ or their motion
// vectors differ by at least 4 horizontally or mvy_limit vertically, else 0.
// Register use (arguments in prototype order):
//   x0 = nnz, x1 = ref, x2 = mv, x3 = bs, w4 = mvy_limit, w5 = bframe
//   v4 accumulates the result stored to bs[0], v5 the result stored to bs[1].
// The ref/mv pass runs once, or twice when bframe is 1 (x1/x2 simply keep
// advancing into the second list).
// NOTE(review): the exact load offsets rely on the scan8 cache layout, which is
// defined outside this file - confirm against the C reference before changing.
// void deblock_strength( uint8_t nnz[X264_SCAN8_SIZE],
|
||||||
|
// int8_t ref[2][X264_SCAN8_LUMA_SIZE],
|
||||||
|
// int16_t mv[2][X264_SCAN8_LUMA_SIZE][2],
|
||||||
|
// uint8_t bs[2][8][4], int mvy_limit,
|
||||||
|
// int bframe )
|
||||||
|
function deblock_strength_neon, export=1
|
||||||
|
movi v4.16b, #0
|
||||||
|
lsl w4, w4, #8
|
||||||
|
add x3, x3, #32 // x3 = &bs[1] (bs[0] is 8*4 = 32 bytes)
|
||||||
|
sub w4, w4, #(1<<8)-3 // w4 = ((mvy_limit - 1) << 8) | 3
|
||||||
|
movi v5.16b, #0
|
||||||
|
dup v6.8h, w4 // per-byte thresholds: 3 for x, mvy_limit-1 for y
|
||||||
|
mov x6, #-32 // post-index step from bs[1] back to bs[0]
|
||||||
|
|
||||||
|
bframe:
|
||||||
|
// load bytes ref
|
||||||
|
add x2, x2, #16
|
||||||
|
ld1 {v31.d}[1], [x1], #8
|
||||||
|
ld1 {v1.16b}, [x1], #16
|
||||||
|
movi v0.16b, #0
|
||||||
|
ld1 {v2.16b}, [x1], #16
|
||||||
|
ext v3.16b, v0.16b, v1.16b, #15 // same bytes shifted up by one position
|
||||||
|
ext v0.16b, v0.16b, v2.16b, #15
|
||||||
|
unzip v21.4s, v22.4s, v1.4s, v2.4s
|
||||||
|
unzip v23.4s, v20.4s, v3.4s, v0.4s
|
||||||
|
ext v21.16b, v31.16b, v22.16b, #12
|
||||||
|
|
||||||
|
eor v0.16b, v20.16b, v22.16b // non-zero where the two refs differ
|
||||||
|
eor v1.16b, v21.16b, v22.16b
|
||||||
|
orr v4.16b, v4.16b, v0.16b
|
||||||
|
orr v5.16b, v5.16b, v1.16b
|
||||||
|
|
||||||
|
ld1 {v21.8h}, [x2], #16 // mv + 0x10
|
||||||
|
ld1 {v19.8h}, [x2], #16 // mv + 0x20
|
||||||
|
ld1 {v22.8h}, [x2], #16 // mv + 0x30
|
||||||
|
ld1 {v18.8h}, [x2], #16 // mv + 0x40
|
||||||
|
ld1 {v23.8h}, [x2], #16 // mv + 0x50
|
||||||
|
ext v19.16b, v19.16b, v22.16b, #12 // row shifted by one mv (4 bytes)
|
||||||
|
ext v18.16b, v18.16b, v23.16b, #12
|
||||||
|
sabd v0.8h, v22.8h, v19.8h // |mv - mv one position earlier|, x and y
|
||||||
|
ld1 {v19.8h}, [x2], #16 // mv + 0x60
|
||||||
|
sabd v1.8h, v23.8h, v18.8h
|
||||||
|
ld1 {v24.8h}, [x2], #16 // mv + 0x70
|
||||||
|
uqxtn v0.8b, v0.8h
|
||||||
|
ld1 {v18.8h}, [x2], #16 // mv + 0x80
|
||||||
|
ld1 {v25.8h}, [x2], #16 // mv + 0x90
|
||||||
|
uqxtn2 v0.16b, v1.8h
|
||||||
|
ext v19.16b, v19.16b, v24.16b, #12
|
||||||
|
ext v18.16b, v18.16b, v25.16b, #12
|
||||||
|
sabd v1.8h, v24.8h, v19.8h
|
||||||
|
sabd v2.8h, v25.8h, v18.8h
|
||||||
|
uqxtn v1.8b, v1.8h
|
||||||
|
uqxtn2 v1.16b, v2.8h
|
||||||
|
|
||||||
|
uqsub v0.16b, v0.16b, v6.16b // stays non-zero only above the threshold
|
||||||
|
uqsub v1.16b, v1.16b, v6.16b
|
||||||
|
uqxtn v0.8b, v0.8h // fold each (x, y) byte pair into one byte
|
||||||
|
uqxtn2 v0.16b, v1.8h
|
||||||
|
|
||||||
|
sabd v1.8h, v22.8h, v23.8h
|
||||||
|
orr v4.16b, v4.16b, v0.16b
|
||||||
|
|
||||||
|
sabd v0.8h, v21.8h, v22.8h // |mv row - next mv row|, x and y
|
||||||
|
sabd v2.8h, v23.8h, v24.8h
|
||||||
|
sabd v3.8h, v24.8h, v25.8h
|
||||||
|
uqxtn v0.8b, v0.8h
|
||||||
|
uqxtn2 v0.16b, v1.8h
|
||||||
|
uqxtn v1.8b, v2.8h
|
||||||
|
uqxtn2 v1.16b, v3.8h
|
||||||
|
|
||||||
|
uqsub v0.16b, v0.16b, v6.16b
|
||||||
|
uqsub v1.16b, v1.16b, v6.16b
|
||||||
|
uqxtn v0.8b, v0.8h
|
||||||
|
uqxtn2 v0.16b, v1.8h
|
||||||
|
subs w5, w5, #1 // becomes zero only if bframe was 1
|
||||||
|
orr v5.16b, v5.16b, v0.16b
|
||||||
|
b.eq bframe // one more pass over the second list
|
||||||
|
|
||||||
|
movi v6.16b, #1
|
||||||
|
// load bytes nnz
|
||||||
|
ld1 {v31.d}[1], [x0], #8
|
||||||
|
ld1 {v1.16b}, [x0], #16
|
||||||
|
movi v0.16b, #0
|
||||||
|
ld1 {v2.16b}, [x0], #16
|
||||||
|
ext v3.16b, v0.16b, v1.16b, #15
|
||||||
|
ext v0.16b, v0.16b, v2.16b, #15
|
||||||
|
unzip v21.4s, v22.4s, v1.4s, v2.4s
|
||||||
|
unzip v23.4s, v20.4s, v3.4s, v0.4s
|
||||||
|
ext v21.16b, v31.16b, v22.16b, #12
|
||||||
|
|
||||||
|
movrel x7, transpose_table
|
||||||
|
ld1 {v7.16b}, [x7]
|
||||||
|
orr v0.16b, v20.16b, v22.16b // non-zero if either side has coefficients
|
||||||
|
orr v1.16b, v21.16b, v22.16b
|
||||||
|
umin v0.16b, v0.16b, v6.16b
|
||||||
|
umin v1.16b, v1.16b, v6.16b
|
||||||
|
umin v4.16b, v4.16b, v6.16b // mv ? 1 : 0
|
||||||
|
umin v5.16b, v5.16b, v6.16b
|
||||||
|
add v0.16b, v0.16b, v0.16b // nnz ? 2 : 0
|
||||||
|
add v1.16b, v1.16b, v1.16b
|
||||||
|
umax v4.16b, v4.16b, v0.16b
|
||||||
|
umax v5.16b, v5.16b, v1.16b
|
||||||
|
tbl v6.16b, {v4.16b}, v7.16b // byte-transpose the 4x4 block kept in v4
|
||||||
|
st1 {v5.16b}, [x3], x6 // bs[1]
|
||||||
|
st1 {v6.16b}, [x3] // bs[0]
|
||||||
|
ret
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// tbl index vector used by deblock_strength_neon: output byte 4*i + j takes
// input byte 4*j + i, i.e. a 4x4 byte-matrix transpose.
const transpose_table
|
||||||
|
.byte 0, 4, 8, 12
|
||||||
|
.byte 1, 5, 9, 13
|
||||||
|
.byte 2, 6, 10, 14
|
||||||
|
.byte 3, 7, 11, 15
|
||||||
|
endconst
|
||||||
61
common/aarch64/deblock.h
Normal file
61
common/aarch64/deblock.h
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* deblock.h: aarch64 deblocking
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2017-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/* C prototypes for the AArch64 assembly deblocking routines.
 * Each function name is first mapped through x264_template() (defined
 * elsewhere in the project) and then declared. */
#ifndef X264_AARCH64_DEBLOCK_H
|
||||||
|
#define X264_AARCH64_DEBLOCK_H
|
||||||
|
|
||||||
|
/* Normal (inter) edge filters: take a tc0 array in addition to alpha/beta. */
#define x264_deblock_v_luma_neon x264_template(deblock_v_luma_neon)
|
||||||
|
void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
#define x264_deblock_h_luma_neon x264_template(deblock_h_luma_neon)
|
||||||
|
void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
#define x264_deblock_v_chroma_neon x264_template(deblock_v_chroma_neon)
|
||||||
|
void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
#define x264_deblock_h_chroma_neon x264_template(deblock_h_chroma_neon)
|
||||||
|
void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
/* Boundary-strength computation for one macroblock. */
#define x264_deblock_strength_neon x264_template(deblock_strength_neon)
|
||||||
|
void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
|
||||||
|
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
|
||||||
|
int mvy_limit, int bframe );
|
||||||
|
#define x264_deblock_h_chroma_422_neon x264_template(deblock_h_chroma_422_neon)
|
||||||
|
void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
#define x264_deblock_h_chroma_mbaff_neon x264_template(deblock_h_chroma_mbaff_neon)
|
||||||
|
void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
/* Intra edge filters: alpha/beta only, no tc0. */
#define x264_deblock_h_chroma_intra_mbaff_neon x264_template(deblock_h_chroma_intra_mbaff_neon)
|
||||||
|
void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
#define x264_deblock_h_chroma_intra_neon x264_template(deblock_h_chroma_intra_neon)
|
||||||
|
void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
#define x264_deblock_h_chroma_422_intra_neon x264_template(deblock_h_chroma_422_intra_neon)
|
||||||
|
void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
#define x264_deblock_v_chroma_intra_neon x264_template(deblock_v_chroma_intra_neon)
|
||||||
|
void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
#define x264_deblock_h_luma_intra_neon x264_template(deblock_h_luma_intra_neon)
|
||||||
|
void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
#define x264_deblock_v_luma_intra_neon x264_template(deblock_v_luma_intra_neon)
|
||||||
|
void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
|
||||||
|
/* SVE variant of the normal vertical chroma filter. */
#define x264_deblock_v_chroma_sve x264_template(deblock_v_chroma_sve)
|
||||||
|
void x264_deblock_v_chroma_sve( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
|
||||||
|
#endif
|
||||||
66
common/aarch64/mc-a-common.S
Normal file
66
common/aarch64/mc-a-common.S
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
/****************************************************************************
|
||||||
|
* mc-a-common.S: aarch64 motion compensation
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
* Janne Grunau <janne-x264@jannau.net>
|
||||||
|
* Mans Rullgard <mans@mansr.com>
|
||||||
|
* Stefan Groenroos <stefan.gronroos@gmail.com>
|
||||||
|
* David Chen <david.chen@myais.com.cn>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
// This file contains the NEON macros and functions that are intended to be used by
|
||||||
|
// the SVE/SVE2 functions as well
|
||||||
|
|
||||||
|
#if BIT_DEPTH == 8
|
||||||
|
|
||||||
|
// load_weights_add_add: weight preparation for the "0 < weight < 64" case.
// Nothing has to be adjusted here; the self-move leaves the weight in w6
// unchanged (a W-register write also clears the upper half of x6).
// Presumably present only so that every load_weights_* variant has a body.
.macro load_weights_add_add
    mov         w6,  w6
.endm
|
||||||
|
|
||||||
|
// load_weights_add_sub: weight preparation for the "weight > 64" case.
// Flips the sign of w7 so that the value can be applied with a
// multiply-subtract instead of a multiply-add.
// NOTE(review): w7 is expected to hold 64 - weight (negative here) on entry;
// confirm against the dispatcher that sets it up.
.macro load_weights_add_sub
    neg         w7,  w7
.endm
|
||||||
|
|
||||||
|
// load_weights_sub_add: weight preparation for the "weight < 0" case.
// Flips the sign of the weight in w6 so that the value can be applied with a
// multiply-subtract instead of a multiply-add.
.macro load_weights_sub_add
    neg         w6,  w6
.endm
|
||||||
|
|
||||||
|
// pixel_avg_w4_neon: unweighted rounding average of two 4-pixel-wide blocks.
// Reached by a branch from a pixel_avg_4xH entry point, so `ret` returns to
// that entry point's caller.
//   x0 / x1 = dst  and its stride
//   x2 / x3 = src1 and its stride
//   x4 / x5 = src2 and its stride
//   w9      = rows still to do; two rows are consumed per pass
// Clobbers v0-v3 and the condition flags.
function pixel_avg_w4_neon
2:  subs        w9,  w9,  #2                // flags are consumed by b.gt below
    ld1         {v0.s}[0], [x2], x3         // first row: 4 pixels of src1
    ld1         {v2.s}[0], [x4], x5         // first row: 4 pixels of src2
    urhadd      v0.8b,  v0.8b,  v2.8b       // (a + b + 1) >> 1 per byte
    ld1         {v1.s}[0], [x2], x3         // second row: src1
    ld1         {v3.s}[0], [x4], x5         // second row: src2
    urhadd      v1.8b,  v1.8b,  v3.8b
    st1         {v0.s}[0], [x0], x1         // only the low 4 bytes are stored
    st1         {v1.s}[0], [x0], x1
    b.gt        2b                          // loop while rows remain
    ret
endfunc
|
||||||
|
|
||||||
|
#else // BIT_DEPTH == 10
|
||||||
|
|
||||||
|
#endif
|
||||||
108
common/aarch64/mc-a-sve.S
Normal file
108
common/aarch64/mc-a-sve.S
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* mc-a-sve.S: aarch64 motion compensation
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Chen <david.chen@myais.com.cn>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "asm.S"
|
||||||
|
#include "mc-a-common.S"
|
||||||
|
|
||||||
|
ENABLE_SVE
|
||||||
|
|
||||||
|
#if BIT_DEPTH == 8
|
||||||
|
|
||||||
|
// void pixel_avg_WxH_sve( uint8_t *dst,  intptr_t dst_stride,
//                         uint8_t *src1, intptr_t src1_stride,
//                         uint8_t *src2, intptr_t src2_stride, int weight );
//
// Entry-point generator. Each instance puts the row count H into w9 and then
// tail-branches to the worker that matches the weight passed in w6:
//   weight == 32        -> pixel_avg_wW_neon                (plain average)
//   weight  > 64        -> pixel_avg_weight_wW_add_sub_sve
//   0 <= weight <= 64   -> pixel_avg_weight_wW_add_add_sve
//   weight  < 0         -> pixel_avg_weight_wW_sub_add_sve
// On the three weighted paths w7 holds 64 - weight. The workers return
// straight to the caller of the entry point. Clobbers w7, w9, w10 and flags.
.macro AVGH_SVE w h
function pixel_avg_\w\()x\h\()_sve, export=1
    mov         w9,  #\h                                // rows to process
    mov         w10, #64
    cmp         w6,  #32
    b.eq        pixel_avg_w\w\()_neon                   // equal weights
    subs        w7,  w10,  w6                           // w7 = 64 - weight
    b.lt        pixel_avg_weight_w\w\()_add_sub_sve     // weight > 64
    cmp         w6,  #0
    b.ge        pixel_avg_weight_w\w\()_add_add_sve
    b           pixel_avg_weight_w\w\()_sub_add_sve     // weight < 0
endfunc
.endm

AVGH_SVE 4, 2
AVGH_SVE 4, 4
AVGH_SVE 4, 8
AVGH_SVE 4, 16
|
||||||
|
|
||||||
|
// weight_add_add_sve (0 < weight < 64):
//   \dst = \s1 * v30 + \s2 * v31          (16-bit lanes)
// v30 / v31 must already contain the two weights broadcast as .8h.
// The optional `h` argument is accepted but not referenced in the body.
.macro weight_add_add_sve dst, s1, s2, h=
    mul         \dst, \s1, v30.8h
    mla         \dst, \s2, v31.8h
.endm
|
||||||
|
|
||||||
|
// weight_add_sub_sve (weight > 64):
//   \dst = \s1 * v30 - \s2 * v31          (16-bit lanes)
// v30 / v31 must already contain the weight magnitudes broadcast as .8h.
// The optional `h` argument is accepted but not referenced in the body.
.macro weight_add_sub_sve dst, s1, s2, h=
    mul         \dst, \s1, v30.8h
    mls         \dst, \s2, v31.8h
.endm
|
||||||
|
|
||||||
|
// weight_sub_add_sve (weight < 0):
//   \dst = \s2 * v31 - \s1 * v30          (16-bit lanes)
// v30 / v31 must already contain the weight magnitudes broadcast as .8h.
// The optional `h` argument is accepted but not referenced in the body.
.macro weight_sub_add_sve dst, s1, s2, h=
    mul         \dst, \s2, v31.8h
    mls         \dst, \s1, v30.8h
.endm
|
||||||
|
|
||||||
|
// Generates pixel_avg_weight_w4_<ext>_sve, the 4-pixel-wide weighted-average
// worker for one sign combination of the weights (<ext> is add_add, add_sub
// or sub_add). It is reached by a branch from a pixel_avg_4xH_sve entry point:
//   x0 / x1 = dst  and its stride
//   x2 / x3 = src1 and its stride
//   x4 / x5 = src2 and its stride
//   w6 = weight, w7 = 64 - weight, w9 = rows still to do
// load_weights_<ext> and weight_<ext>_sve together evaluate
//   src1 * weight + src2 * (64 - weight)
// in 16-bit lanes; sqrshrun then adds the rounding term, shifts right by 6 and
// saturates to unsigned 8 bit. Two rows are produced per loop pass.
// Clobbers v0-v5 (z0-z3), v30, v31, p0 and the condition flags.
.macro AVG_WEIGHT_SVE ext
function pixel_avg_weight_w4_\ext\()_sve
    load_weights_\ext
    ptrue       p0.b, vl8                   // seen by the .h loads as 4 active elements
    dup         v30.8h, w6
    dup         v31.8h, w7
2:  // height loop
    subs        w9,  w9,  #2                // flags are consumed by b.gt below
    ld1b        {z0.h}, p0/z, [x2]          // src1 row, bytes widened to 16 bit
    add         x2, x2, x3
    ld1b        {z1.h}, p0/z, [x4]          // src2 row
    add         x4, x4, x5
    weight_\ext\()_sve v4.8h,  v0.8h,  v1.8h
    ld1b        {z2.h}, p0/z, [x2]          // next src1 row
    add         x2, x2, x3
    ld1b        {z3.h}, p0/z, [x4]          // next src2 row
    add         x4, x4, x5

    sqrshrun    v0.8b,  v4.8h,  #6          // round, >> 6, clamp to 0..255
    weight_\ext\()_sve v5.8h,  v2.8h,  v3.8h
    st1         {v0.s}[0], [x0], x1         // store 4 result pixels
    sqrshrun    v1.8b,  v5.8h,  #6
    st1         {v1.s}[0], [x0], x1
    b.gt        2b
    ret
endfunc
.endm

AVG_WEIGHT_SVE add_add
AVG_WEIGHT_SVE add_sub
AVG_WEIGHT_SVE sub_add
|
||||||
|
|
||||||
|
#else // BIT_DEPTH == 10
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
||||||
3935
common/aarch64/mc-a.S
Normal file
3935
common/aarch64/mc-a.S
Normal file
File diff suppressed because it is too large
Load Diff
371
common/aarch64/mc-c.c
Normal file
371
common/aarch64/mc-c.c
Normal file
@@ -0,0 +1,371 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* mc-c.c: aarch64 motion compensation
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
* Janne Grunau <janne-x264@jannau.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common/common.h"
|
||||||
|
#include "mc.h"
|
||||||
|
|
||||||
|
#define x264_prefetch_ref_aarch64 x264_template(prefetch_ref_aarch64)
|
||||||
|
void x264_prefetch_ref_aarch64( pixel *, intptr_t, int );
|
||||||
|
#define x264_prefetch_fenc_420_aarch64 x264_template(prefetch_fenc_420_aarch64)
|
||||||
|
void x264_prefetch_fenc_420_aarch64( pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
#define x264_prefetch_fenc_422_aarch64 x264_template(prefetch_fenc_422_aarch64)
|
||||||
|
void x264_prefetch_fenc_422_aarch64( pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
|
||||||
|
#define x264_memcpy_aligned_neon x264_template(memcpy_aligned_neon)
|
||||||
|
void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
|
||||||
|
#define x264_memzero_aligned_neon x264_template(memzero_aligned_neon)
|
||||||
|
void x264_memzero_aligned_neon( void *dst, size_t n );
|
||||||
|
|
||||||
|
#define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
|
||||||
|
void x264_pixel_avg_16x16_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
|
||||||
|
void x264_pixel_avg_16x8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
|
||||||
|
void x264_pixel_avg_8x16_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
|
||||||
|
void x264_pixel_avg_8x8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
|
||||||
|
void x264_pixel_avg_8x4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
|
||||||
|
void x264_pixel_avg_4x16_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
|
||||||
|
void x264_pixel_avg_4x8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
|
||||||
|
void x264_pixel_avg_4x4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
|
||||||
|
void x264_pixel_avg_4x2_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
|
||||||
|
#define x264_pixel_avg_4x16_sve x264_template(pixel_avg_4x16_sve)
|
||||||
|
void x264_pixel_avg_4x16_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_4x8_sve x264_template(pixel_avg_4x8_sve)
|
||||||
|
void x264_pixel_avg_4x8_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_4x4_sve x264_template(pixel_avg_4x4_sve)
|
||||||
|
void x264_pixel_avg_4x4_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_4x2_sve x264_template(pixel_avg_4x2_sve)
|
||||||
|
void x264_pixel_avg_4x2_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
|
||||||
|
#define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
|
||||||
|
void x264_pixel_avg2_w4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
|
||||||
|
#define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
|
||||||
|
void x264_pixel_avg2_w8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
|
||||||
|
#define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon)
|
||||||
|
void x264_pixel_avg2_w16_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
|
||||||
|
#define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
|
||||||
|
void x264_pixel_avg2_w20_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
|
||||||
|
|
||||||
|
#define x264_plane_copy_core_neon x264_template(plane_copy_core_neon)
|
||||||
|
void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
|
||||||
|
pixel *src, intptr_t i_src, int w, int h );
|
||||||
|
#define x264_plane_copy_swap_core_neon x264_template(plane_copy_swap_core_neon)
|
||||||
|
void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
|
||||||
|
pixel *src, intptr_t i_src, int w, int h );
|
||||||
|
#define x264_plane_copy_deinterleave_neon x264_template(plane_copy_deinterleave_neon)
|
||||||
|
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
|
||||||
|
pixel *dstv, intptr_t i_dstv,
|
||||||
|
pixel *src, intptr_t i_src, int w, int h );
|
||||||
|
#define x264_plane_copy_deinterleave_rgb_neon x264_template(plane_copy_deinterleave_rgb_neon)
|
||||||
|
void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
|
||||||
|
pixel *dstb, intptr_t i_dstb,
|
||||||
|
pixel *dstc, intptr_t i_dstc,
|
||||||
|
pixel *src, intptr_t i_src, int pw, int w, int h );
|
||||||
|
#define x264_plane_copy_interleave_core_neon x264_template(plane_copy_interleave_core_neon)
|
||||||
|
void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst,
|
||||||
|
pixel *srcu, intptr_t i_srcu,
|
||||||
|
pixel *srcv, intptr_t i_srcv, int w, int h );
|
||||||
|
|
||||||
|
#define x264_store_interleave_chroma_neon x264_template(store_interleave_chroma_neon)
|
||||||
|
void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
|
||||||
|
#define x264_load_deinterleave_chroma_fdec_neon x264_template(load_deinterleave_chroma_fdec_neon)
|
||||||
|
void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
|
||||||
|
#define x264_load_deinterleave_chroma_fenc_neon x264_template(load_deinterleave_chroma_fenc_neon)
|
||||||
|
void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
|
||||||
|
|
||||||
|
#define x264_mc_weight_w16_neon x264_template(mc_weight_w16_neon)
|
||||||
|
#define x264_mc_weight_w16_nodenom_neon x264_template(mc_weight_w16_nodenom_neon)
|
||||||
|
#define x264_mc_weight_w16_offsetadd_neon x264_template(mc_weight_w16_offsetadd_neon)
|
||||||
|
#define x264_mc_weight_w16_offsetsub_neon x264_template(mc_weight_w16_offsetsub_neon)
|
||||||
|
#define x264_mc_weight_w20_neon x264_template(mc_weight_w20_neon)
|
||||||
|
#define x264_mc_weight_w20_nodenom_neon x264_template(mc_weight_w20_nodenom_neon)
|
||||||
|
#define x264_mc_weight_w20_offsetadd_neon x264_template(mc_weight_w20_offsetadd_neon)
|
||||||
|
#define x264_mc_weight_w20_offsetsub_neon x264_template(mc_weight_w20_offsetsub_neon)
|
||||||
|
#define x264_mc_weight_w4_neon x264_template(mc_weight_w4_neon)
|
||||||
|
#define x264_mc_weight_w4_nodenom_neon x264_template(mc_weight_w4_nodenom_neon)
|
||||||
|
#define x264_mc_weight_w4_offsetadd_neon x264_template(mc_weight_w4_offsetadd_neon)
|
||||||
|
#define x264_mc_weight_w4_offsetsub_neon x264_template(mc_weight_w4_offsetsub_neon)
|
||||||
|
#define x264_mc_weight_w8_neon x264_template(mc_weight_w8_neon)
|
||||||
|
#define x264_mc_weight_w8_nodenom_neon x264_template(mc_weight_w8_nodenom_neon)
|
||||||
|
#define x264_mc_weight_w8_offsetadd_neon x264_template(mc_weight_w8_offsetadd_neon)
|
||||||
|
#define x264_mc_weight_w8_offsetsub_neon x264_template(mc_weight_w8_offsetsub_neon)
|
||||||
|
#define MC_WEIGHT(func)\
|
||||||
|
void x264_mc_weight_w20##func##_neon( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
|
||||||
|
void x264_mc_weight_w16##func##_neon( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
|
||||||
|
void x264_mc_weight_w8##func##_neon ( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
|
||||||
|
void x264_mc_weight_w4##func##_neon ( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
|
||||||
|
\
|
||||||
|
static void (* mc##func##_wtab_neon[6])( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ) =\
|
||||||
|
{\
|
||||||
|
x264_mc_weight_w4##func##_neon,\
|
||||||
|
x264_mc_weight_w4##func##_neon,\
|
||||||
|
x264_mc_weight_w8##func##_neon,\
|
||||||
|
x264_mc_weight_w16##func##_neon,\
|
||||||
|
x264_mc_weight_w16##func##_neon,\
|
||||||
|
x264_mc_weight_w20##func##_neon,\
|
||||||
|
};
|
||||||
|
|
||||||
|
MC_WEIGHT()
|
||||||
|
MC_WEIGHT(_nodenom)
|
||||||
|
MC_WEIGHT(_offsetadd)
|
||||||
|
MC_WEIGHT(_offsetsub)
|
||||||
|
|
||||||
|
#define x264_mc_copy_w4_neon x264_template(mc_copy_w4_neon)
|
||||||
|
void x264_mc_copy_w4_neon ( pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
#define x264_mc_copy_w8_neon x264_template(mc_copy_w8_neon)
|
||||||
|
void x264_mc_copy_w8_neon ( pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
#define x264_mc_copy_w16_neon x264_template(mc_copy_w16_neon)
|
||||||
|
void x264_mc_copy_w16_neon( pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
|
||||||
|
#define x264_mc_chroma_neon x264_template(mc_chroma_neon)
|
||||||
|
void x264_mc_chroma_neon( pixel *, pixel *, intptr_t, pixel *, intptr_t, int, int, int, int );
|
||||||
|
#define x264_integral_init4h_neon x264_template(integral_init4h_neon)
|
||||||
|
void x264_integral_init4h_neon( uint16_t *, pixel *, intptr_t );
|
||||||
|
#define x264_integral_init4v_neon x264_template(integral_init4v_neon)
|
||||||
|
void x264_integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
|
||||||
|
#define x264_integral_init8h_neon x264_template(integral_init8h_neon)
|
||||||
|
void x264_integral_init8h_neon( uint16_t *, pixel *, intptr_t );
|
||||||
|
#define x264_integral_init8v_neon x264_template(integral_init8v_neon)
|
||||||
|
void x264_integral_init8v_neon( uint16_t *, intptr_t );
|
||||||
|
#define x264_frame_init_lowres_core_neon x264_template(frame_init_lowres_core_neon)
|
||||||
|
void x264_frame_init_lowres_core_neon( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, intptr_t, int, int );
|
||||||
|
|
||||||
|
#define x264_mbtree_propagate_cost_neon x264_template(mbtree_propagate_cost_neon)
|
||||||
|
void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
|
||||||
|
|
||||||
|
#define x264_mbtree_fix8_pack_neon x264_template(mbtree_fix8_pack_neon)
|
||||||
|
void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
|
||||||
|
#define x264_mbtree_fix8_unpack_neon x264_template(mbtree_fix8_unpack_neon)
|
||||||
|
void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
|
||||||
|
|
||||||
|
/* Two-source average kernels, looked up with i_width>>2.
 * Slot 0 is left unset (NULL): there is no kernel for widths below 4. */
static void (* const pixel_avg_wtab_neon[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, int ) =
{
    [1] = x264_pixel_avg2_w4_neon,
    [2] = x264_pixel_avg2_w8_neon,
    [3] = x264_pixel_avg2_w16_neon,  // no slower than w12, so no point in a separate function
    [4] = x264_pixel_avg2_w16_neon,
    [5] = x264_pixel_avg2_w20_neon,
};
|
||||||
|
|
||||||
|
/* Block copy kernels, looked up with i_width>>2.
 * Only widths 4, 8 and 16 have a kernel; slots 0 and 3 stay NULL. */
static void (* const mc_copy_wtab_neon[5])( pixel *, intptr_t, pixel *, intptr_t, int ) =
{
    [1] = x264_mc_copy_w4_neon,
    [2] = x264_mc_copy_w8_neon,
    [4] = x264_mc_copy_w16_neon,
};
|
||||||
|
|
||||||
|
/* Select the weighted-prediction kernel table for `w` and cache what the
 * chosen kernels need.
 *
 *  - scale == 1<<denom : offset-only tables; cachea[0] receives |i_offset|
 *                        and the sign picks the add or the sub table.
 *  - denom == 0        : the "nodenom" table.
 *  - anything else     : the general table.
 *
 * `h` is not used by this implementation. */
static void weight_cache_neon( x264_t *h, x264_weight_t *w )
{
    if( w->i_scale != 1<<w->i_denom )
    {
        w->weightfn = w->i_denom ? mc_wtab_neon : mc_nodenom_wtab_neon;
        return;
    }

    int b_negative = w->i_offset < 0;
    w->weightfn  = b_negative ? mc_offsetsub_wtab_neon : mc_offsetadd_wtab_neon;
    w->cachea[0] = b_negative ? -w->i_offset : w->i_offset;
}
|
||||||
|
|
||||||
|
/* Luma motion compensation into `dst`.
 *
 * mvx/mvy carry two fractional bits each; the integer parts select the
 * position inside the planes in src[], the fractional parts select which
 * plane(s) to read through x264_hpel_ref0 / x264_hpel_ref1.
 * When either fractional part is odd, two planes are averaged into dst and an
 * active weight function is then applied in place. Otherwise the single source
 * plane is either weighted or copied into dst.
 * All kernel tables are indexed by i_width>>2. */
static void mc_luma_neon( pixel *dst,    intptr_t i_dst_stride,
                          pixel *src[4], intptr_t i_src_stride,
                          int mvx, int mvy,
                          int i_width, int i_height, const x264_weight_t *weight )
{
    const int frac_x    = mvx & 3;
    const int frac_y    = mvy & 3;
    const int qpel_idx  = (frac_y << 2) + frac_x;
    const int width_idx = i_width >> 2;
    const intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);

    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( frac_y == 3 ) // explicit if() to force conditional add
        src1 += i_src_stride;

    if( !(qpel_idx & 5) )
    {
        /* both fractional parts are even: one plane is sufficient */
        if( weight->weightfn )
            weight->weightfn[width_idx]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
        else
            mc_copy_wtab_neon[width_idx]( dst, i_dst_stride, src1, i_src_stride, i_height );
        return;
    }

    /* qpel interpolation needed */
    pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
    pixel_avg_wtab_neon[width_idx]( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
    if( weight->weightfn )
        weight->weightfn[width_idx]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
}
|
||||||
|
|
||||||
|
/* Like mc_luma_neon, but avoids the copy when no processing is required.
 *
 * Returns a pointer to the predicted block:
 *  - if neither interpolation nor weighting is needed, the block is used in
 *    place: the return value points into the source plane and *i_dst_stride
 *    is overwritten with i_src_stride;
 *  - otherwise the block is written to `dst` (stride *i_dst_stride, left
 *    untouched) and `dst` is returned.
 * All kernel tables are indexed by i_width>>2. */
static pixel *get_ref_neon( pixel *dst,    intptr_t *i_dst_stride,
                            pixel *src[4], intptr_t i_src_stride,
                            int mvx, int mvy,
                            int i_width, int i_height, const x264_weight_t *weight )
{
    const int frac_x     = mvx & 3;
    const int frac_y     = mvy & 3;
    const int qpel_idx   = (frac_y << 2) + frac_x;
    const int width_idx  = i_width >> 2;
    const int b_need_avg = qpel_idx & 5; /* qpel interpolation needed */
    const intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);

    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( frac_y == 3 ) // explicit if() to force conditional add
        src1 += i_src_stride;

    if( !b_need_avg && !weight->weightfn )
    {
        /* nothing to compute: hand back the source plane itself */
        *i_dst_stride = i_src_stride;
        return src1;
    }

    if( b_need_avg )
    {
        pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
        pixel_avg_wtab_neon[width_idx]( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
        if( weight->weightfn )
            weight->weightfn[width_idx]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
    }
    else
        weight->weightfn[width_idx]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );

    return dst;
}
|
||||||
|
|
||||||
|
#define x264_hpel_filter_neon x264_template(hpel_filter_neon)
|
||||||
|
void x264_hpel_filter_neon( pixel *dsth, pixel *dstv, pixel *dstc,
|
||||||
|
pixel *src, intptr_t stride, int width,
|
||||||
|
int height, int16_t *buf );
|
||||||
|
|
||||||
|
|
||||||
|
#if !HIGH_BIT_DEPTH && HAVE_I8MM
|
||||||
|
#define x264_hpel_filter_neon_i8mm x264_template(hpel_filter_neon_i8mm)
|
||||||
|
void x264_hpel_filter_neon_i8mm( pixel *dsth, pixel *dstv, pixel *dstc,
|
||||||
|
pixel *src, intptr_t stride, int width,
|
||||||
|
int height, int16_t *buf );
|
||||||
|
#endif // !HIGH_BIT_DEPTH && HAVE_I8MM
|
||||||
|
|
||||||
|
PLANE_COPY(16, neon)
|
||||||
|
PLANE_COPY_SWAP(16, neon)
|
||||||
|
PLANE_INTERLEAVE(neon)
|
||||||
|
PROPAGATE_LIST(neon)
|
||||||
|
|
||||||
|
/* Install the AArch64 implementations into the motion-compensation function
 * table `pf`, according to the capability bits in `cpu`.
 * Each capability is tested independently; later sections overwrite entries
 * set by earlier ones (SVE replaces the 4xN averages, I8MM replaces
 * hpel_filter). The SVE and I8MM sections are compiled only for 8-bit builds
 * and only when the matching HAVE_* macro is set. */
void x264_mc_init_aarch64( uint32_t cpu, x264_mc_functions_t *pf )
{
    if( cpu&X264_CPU_ARMV8 )
    {
        pf->prefetch_ref      = x264_prefetch_ref_aarch64;
        pf->prefetch_fenc_420 = x264_prefetch_fenc_420_aarch64;
        pf->prefetch_fenc_422 = x264_prefetch_fenc_422_aarch64;
    }

    if( cpu&X264_CPU_NEON )
    {
        /* prediction: luma, chroma, reference fetch, half-pel planes */
        pf->mc_luma     = mc_luma_neon;
        pf->mc_chroma   = x264_mc_chroma_neon;
        pf->get_ref     = get_ref_neon;
        pf->hpel_filter = x264_hpel_filter_neon;

        /* bi-prediction averages */
        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
        pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
        pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
        pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
        pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
        pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
        pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
        pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
        pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;

        /* weighted prediction */
        pf->weight       = mc_wtab_neon;
        pf->offsetadd    = mc_offsetadd_wtab_neon;
        pf->offsetsub    = mc_offsetsub_wtab_neon;
        pf->weight_cache = weight_cache_neon;

        /* block copies */
        pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
        pf->copy[PIXEL_16x16]    = x264_mc_copy_w16_neon;
        pf->copy[PIXEL_8x8]      = x264_mc_copy_w8_neon;
        pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_neon;

        /* chroma (de)interleaving */
        pf->store_interleave_chroma       = x264_store_interleave_chroma_neon;
        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;

        /* whole-plane copies */
        pf->plane_copy                  = plane_copy_neon;
        pf->plane_copy_swap             = plane_copy_swap_neon;
        pf->plane_copy_interleave       = plane_copy_interleave_neon;
        pf->plane_copy_deinterleave     = x264_plane_copy_deinterleave_neon;
        pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;

        /* memory helpers */
        pf->memcpy_aligned  = x264_memcpy_aligned_neon;
        pf->memzero_aligned = x264_memzero_aligned_neon;

        /* integral images and lowres frame */
        pf->integral_init4h = x264_integral_init4h_neon;
        pf->integral_init4v = x264_integral_init4v_neon;
        pf->integral_init8h = x264_integral_init8h_neon;
        pf->integral_init8v = x264_integral_init8v_neon;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;

        /* mbtree */
        pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
        pf->mbtree_propagate_list = mbtree_propagate_list_neon;
        pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_neon;
        pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_neon;
    }

#if !HIGH_BIT_DEPTH
#if HAVE_SVE
    if( cpu&X264_CPU_SVE )
    {
        /* SVE versions exist for the 4-pixel-wide averages only */
        pf->avg[PIXEL_4x2]  = x264_pixel_avg_4x2_sve;
        pf->avg[PIXEL_4x4]  = x264_pixel_avg_4x4_sve;
        pf->avg[PIXEL_4x8]  = x264_pixel_avg_4x8_sve;
        pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_sve;
    }
#endif

#if HAVE_I8MM
    if( cpu&X264_CPU_I8MM )
    {
        pf->hpel_filter = x264_hpel_filter_neon_i8mm;
    }
#endif // HAVE_I8MM
#endif // !HIGH_BIT_DEPTH
}
|
||||||
32
common/aarch64/mc.h
Normal file
32
common/aarch64/mc.h
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* mc.h: aarch64 motion compensation
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2014-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Janne Grunau <janne-x264@jannau.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_AARCH64_MC_H
|
||||||
|
#define X264_AARCH64_MC_H
|
||||||
|
|
||||||
|
#define x264_mc_init_aarch64 x264_template(mc_init_aarch64)
|
||||||
|
void x264_mc_init_aarch64( uint32_t cpu, x264_mc_functions_t *pf );
|
||||||
|
|
||||||
|
#endif
|
||||||
44
common/aarch64/pixel-a-common.S
Normal file
44
common/aarch64/pixel-a-common.S
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
/****************************************************************************
|
||||||
|
* pixel-a-common.S: aarch64 pixel metrics
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
* Janne Grunau <janne-x264@jannau.net>
|
||||||
|
* David Chen <david.chen@myais.com.cn>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
// This file contains the NEON macros and constants that are intended to be used by
|
||||||
|
// the SVE/SVE2 functions as well
|
||||||
|
|
||||||
|
// Two 8-halfword lane masks stored back to back (32 bytes in total).
// hadamard_ac_8x8_sve loads them as a pair into v30 (mask_ac4) and v31 (mask_ac8).
const mask_ac_4_8
    // first mask: halfword lanes 0 and 4 are cleared, all others kept
    .short      0, -1, -1, -1
    .short      0, -1, -1, -1
    // second mask: only halfword lane 0 is cleared
    .short      0, -1, -1, -1
    .short     -1, -1, -1, -1
endconst
|
||||||
|
|
||||||
|
// Two SUMSUB_AB steps in a row:
//   (s1, d1) from the pair (a, b), then (s2, d2) from the pair (c, d).
// SUMSUB_AB itself is defined outside this file; presumably it yields
// sum = x + y and diff = x - y (TODO confirm against asm.S).
.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
    SUMSUB_AB   \s1, \d1, \a, \b
    SUMSUB_AB   \s2, \d2, \c, \d
.endm
|
||||||
|
|
||||||
|
// Two butterfly stages across the four registers r1..r4.
// t1..t4 receive the first-stage results and are clobbered; the
// second-stage results are written back to r1..r4.
// Written out as the four SUMSUB_AB steps that the two SUMSUB_ABCD
// invocations expand to.
.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
    // stage 1: pairs (r1, r2) and (r3, r4)
    SUMSUB_AB   \t1, \t2, \r1, \r2
    SUMSUB_AB   \t3, \t4, \r3, \r4
    // stage 2: pairs (t1, t3) and (t2, t4)
    SUMSUB_AB   \r1, \r3, \t1, \t3
    SUMSUB_AB   \r2, \r4, \t2, \t4
.endm
|
||||||
523
common/aarch64/pixel-a-sve.S
Normal file
523
common/aarch64/pixel-a-sve.S
Normal file
@@ -0,0 +1,523 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* pixel-a-sve.S: aarch64 pixel metrics
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Chen <david.chen@myais.com.cn>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "asm.S"
|
||||||
|
#include "pixel-a-common.S"
|
||||||
|
|
||||||
|
ENABLE_SVE
|
||||||
|
|
||||||
|
#if BIT_DEPTH == 8
|
||||||
|
|
||||||
|
// 8-bit SSD, 4 pixels wide: first step.
// In:  x0, x2 = current row of each block; x1, x3 = their byte strides.
// Sets p0 to the first four halfword lanes, starts the accumulator v0.4s
// with the squared differences of row 0 and leaves row 1 loaded in z16/z17.
.macro SSD_START_SVE_4
    ptrue       p0.h, vl4
    ld1b        {z16.h}, p0/z, [x0]         // bytes are zero-extended to halfwords
    add         x0, x0, x1
    ld1b        {z17.h}, p0/z, [x2]
    add         x2, x2, x3
    sub         v2.4h, v16.4h, v17.4h       // row 0 difference
    ld1b        {z16.h}, p0/z, [x0]         // row 1 is consumed by the next step
    add         x0, x0, x1
    ld1b        {z17.h}, p0/z, [x2]
    add         x2, x2, x3
    smull       v0.4s, v2.4h, v2.4h         // v0 = diff * diff
.endm
|
||||||
|
|
||||||
|
// 8-bit SSD, 4 pixels wide: middle step.
// Accumulates the row already held in z16/z17 into v0.4s and loads the
// following row. Relies on p0 as set by SSD_START_SVE_4.
.macro SSD_SVE_4
    sub         v2.4h, v16.4h, v17.4h       // difference of the pending row
    ld1b        {z16.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z17.h}, p0/z, [x2]
    add         x2, x2, x3
    smlal       v0.4s, v2.4h, v2.4h         // v0 += diff * diff
.endm
|
||||||
|
|
||||||
|
// 8-bit SSD, 4 pixels wide: last step.
// Accumulates the final row held in z16/z17; performs no further loads.
.macro SSD_END_SVE_4
    sub         v2.4h, v16.4h, v17.4h
    smlal       v0.4s, v2.4h, v2.4h
.endm
|
||||||
|
|
||||||
|
// 8-bit SSD, 8 pixels wide: first step.
// In:  x0, x2 = current row of each block; x1, x3 = their byte strides.
// Sets p0 to the first eight halfword lanes, starts the accumulator v0.4s
// with the squared differences of row 0 and leaves row 1 loaded in z16/z17.
.macro SSD_START_SVE_8
    ptrue       p0.h, vl8
    ld1b        {z16.h}, p0/z, [x0]         // bytes are zero-extended to halfwords
    add         x0, x0, x1
    ld1b        {z17.h}, p0/z, [x2]
    add         x2, x2, x3
    sub         v2.8h, v16.8h, v17.8h       // row 0 difference
    ld1b        {z16.h}, p0/z, [x0]         // row 1 is consumed by the next step
    add         x0, x0, x1
    smull       v0.4s, v2.4h, v2.4h         // low four lanes start the accumulator
    ld1b        {z17.h}, p0/z, [x2]
    add         x2, x2, x3
    smlal2      v0.4s, v2.8h, v2.8h         // high four lanes are added on top
.endm
|
||||||
|
|
||||||
|
// 8-bit SSD, 8 pixels wide: middle step.
// Accumulates the row already held in z16/z17 into v0.4s and loads the
// following row. Relies on p0 as set by SSD_START_SVE_8.
.macro SSD_SVE_8
    sub         v2.8h, v16.8h, v17.8h       // difference of the pending row
    ld1b        {z16.h}, p0/z, [x0]
    add         x0, x0, x1
    smlal       v0.4s, v2.4h, v2.4h         // low four lanes
    ld1b        {z17.h}, p0/z, [x2]
    add         x2, x2, x3
    smlal2      v0.4s, v2.8h, v2.8h         // high four lanes
.endm
|
||||||
|
|
||||||
|
// 8-bit SSD, 8 pixels wide: last step.
// Accumulates the final row held in z16/z17; performs no further loads.
.macro SSD_END_SVE_8
    sub         v2.8h, v16.8h, v17.8h
    smlal       v0.4s, v2.4h, v2.4h
    smlal2      v0.4s, v2.8h, v2.8h
.endm
|
||||||
|
|
||||||
|
// Emits the exported function pixel_ssd_<w>x<h>_sve (8-bit build).
// In:  x0 = first block,  x1 = its byte stride
//      x2 = second block, x3 = its byte stride
// Out: w0 = sum over the block of squared pixel differences
// One START step, h-2 middle steps and one END step cover all h rows.
.macro SSD_FUNC_SVE w h
function pixel_ssd_\w\()x\h\()_sve, export=1
    SSD_START_SVE_\w
.rept \h-2
    SSD_SVE_\w
.endr
    SSD_END_SVE_\w

    addv        s0, v0.4s                   // fold the four 32-bit partial sums
    fmov        w0, s0
    ret
endfunc
.endm
|
||||||
|
|
||||||
|
// Loads eight rows of eight 8-bit pixels from each of two blocks and forms
// their differences as 16-bit lanes.
// In:  x0, x2 = blocks; x1, x3 = byte strides; p0 = predicate set by the caller
// Out: v0..v3   = SUMSUB_AB results of the row 0-3 differences
//      v20..v23 = differences of rows 4-7
// Clobbers v4..v7 and v16..v19; x0 and x2 end eight rows further on.
.macro load_diff_fly_sve_8x8
    // rows 0 and 1
    ld1b        {z0.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z1.h}, p0/z, [x2]
    add         x2, x2, x3
    ld1b        {z2.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z3.h}, p0/z, [x2]
    add         x2, x2, x3
    sub         v16.8h, v0.8h, v1.8h
    sub         v17.8h, v2.8h, v3.8h

    // rows 2 and 3
    ld1b        {z4.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z5.h}, p0/z, [x2]
    add         x2, x2, x3
    ld1b        {z6.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z7.h}, p0/z, [x2]
    add         x2, x2, x3
    sub         v18.8h, v4.8h, v5.8h
    sub         v19.8h, v6.8h, v7.8h

    // rows 4 and 5
    ld1b        {z0.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z1.h}, p0/z, [x2]
    add         x2, x2, x3
    ld1b        {z2.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z3.h}, p0/z, [x2]
    add         x2, x2, x3
    sub         v20.8h, v0.8h, v1.8h
    sub         v21.8h, v2.8h, v3.8h

    // rows 6 and 7 (their differences are formed after the butterflies below)
    ld1b        {z4.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z5.h}, p0/z, [x2]
    add         x2, x2, x3
    ld1b        {z6.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z7.h}, p0/z, [x2]
    add         x2, x2, x3

    SUMSUB_AB   v0.8h, v1.8h, v16.8h, v17.8h
    SUMSUB_AB   v2.8h, v3.8h, v18.8h, v19.8h

    sub         v22.8h, v4.8h, v5.8h
    sub         v23.8h, v6.8h, v7.8h
.endm
|
||||||
|
|
||||||
|
// Emits the exported function pixel_var_8x<h>_sve (8-bit build).
// In:  x0 = block, x1 = byte stride
// Accumulates per-column pixel sums in v0.8h and sums of squares in
// v1.4s / v2.4s, then tail-branches to var_end, which packs the result.
// Rows 0-3 are loaded up front; every loop pass consumes two pending rows,
// loads and consumes two more, and preloads the next two, so the two rows
// left pending when the loop exits are handled after it.
.macro pixel_var_sve_8 h
function pixel_var_8x\h\()_sve, export=1
    ptrue       p0.h, vl8
    ld1b        {z16.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z17.h}, p0/z, [x0]
    add         x0, x0, x1
    mov         x2, \h - 4                  // rows left for the loop to load
    mul         v1.8h, v16.8h, v16.8h
    mul         v2.8h, v17.8h, v17.8h
    add         v0.8h, v16.8h, v17.8h
    ld1b        {z18.h}, p0/z, [x0]
    add         x0, x0, x1
    uaddlp      v1.4s, v1.8h                // widen the squares to 32 bits
    uaddlp      v2.4s, v2.8h
    ld1b        {z19.h}, p0/z, [x0]
    add         x0, x0, x1

1:  subs        x2, x2, #4
    add         v0.8h, v0.8h, v18.8h
    mul         v24.8h, v18.8h, v18.8h
    ld1b        {z20.h}, p0/z, [x0]
    add         x0, x0, x1
    add         v0.8h, v0.8h, v19.8h
    mul         v25.8h, v19.8h, v19.8h
    uadalp      v1.4s, v24.8h
    ld1b        {z21.h}, p0/z, [x0]
    add         x0, x0, x1
    add         v0.8h, v0.8h, v20.8h
    mul         v26.8h, v20.8h, v20.8h
    uadalp      v2.4s, v25.8h
    ld1b        {z18.h}, p0/z, [x0]
    add         x0, x0, x1
    add         v0.8h, v0.8h, v21.8h
    mul         v27.8h, v21.8h, v21.8h
    uadalp      v1.4s, v26.8h
    ld1b        {z19.h}, p0/z, [x0]
    add         x0, x0, x1
    uadalp      v2.4s, v27.8h
    b.gt        1b

    // the two rows still pending in z18/z19
    add         v0.8h, v0.8h, v18.8h
    mul         v28.8h, v18.8h, v18.8h
    add         v0.8h, v0.8h, v19.8h
    mul         v29.8h, v19.8h, v19.8h
    uadalp      v1.4s, v28.8h
    uadalp      v2.4s, v29.8h

    b           var_end
endfunc
.endm
|
||||||
|
|
||||||
|
// Shared tail of the pixel_var_8x<h>_sve functions (reached by a plain branch).
// In:  v0.8h = per-column pixel sums, v1.4s and v2.4s = partial sums of squares
// Out: x0 = pixel sum in bits 0-31, sum of squares shifted up by 32 bits
function var_end
    uaddlv      s0, v0.8h                   // total of all pixels
    add         v1.4s, v1.4s, v2.4s
    uaddlv      d1, v1.4s                   // total of all squares
    fmov        w0, s0
    fmov        x1, d1
    orr         x0, x0, x1, lsl #32
    ret
endfunc
|
||||||
|
|
||||||
|
// sum = a + b, then sub = a - b.
// The add is issued first, so sum must not name the same register as a or b.
.macro SUMSUBL_AB_SVE sum, sub, a, b
    add         \sum, \a, \b
    sub         \sub, \a, \b
.endm
|
||||||
|
|
||||||
|
// Exported 8x8 SA8D entry point (8-bit build).
// In:  x0 = first block,  x1 = its byte stride
//      x2 = second block, x3 = its byte stride
// Out: w0 = (sum of the helper partial sums + 1) >> 1
// NOTE(review): the bl target carries the same base name as this function.
// It presumably resolves to the unexported helper emitted by the
// sa8d_satd_sve_8x8 macro, with the exported symbol getting a prefix from
// the function macro (defined outside this file) - confirm in asm.S.
function pixel_sa8d_8x8_sve, export=1
    mov         x4, x30                     // keep the return address across the bl
    ptrue       p0.h, vl8                   // predicate used by the helper loads
    bl          pixel_sa8d_8x8_sve
    add         v0.8h, v0.8h, v1.8h         // merge the two partial sums from the helper
    uaddlv      s0, v0.8h
    fmov        w0, s0
    add         w0, w0, #1                  // round ...
    lsr         w0, w0, #1                  // ... and halve
    ret         x4
endfunc
|
||||||
|
|
||||||
|
// Emits the unexported helper pixel_sa8d_<satd>8x8_sve.
// In:  x0, x2 = blocks; x1, x3 = byte strides; p0 = predicate set by the caller
// Out: v0.8h and v1.8h = partial sums of absolute transformed differences,
//      still to be added together and reduced by the caller.
//      When the macro argument is satd_, v26.8h and v27.8h additionally hold
//      the partial sums built inside the conditional section.
// The transpose and SUMSUB_AB macros are defined outside this file.
.macro sa8d_satd_sve_8x8 satd=
function pixel_sa8d_\satd\()8x8_sve
    load_diff_fly_sve_8x8

    SUMSUB_AB   v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB   v17.8h, v19.8h, v1.8h, v3.8h

    HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
.ifc \satd, satd_
    // extra section assembled only for the satd_ variant
    transpose   v0.8h, v1.8h, v16.8h, v17.8h
    transpose   v2.8h, v3.8h, v18.8h, v19.8h
    transpose   v4.8h, v5.8h, v20.8h, v21.8h
    transpose   v6.8h, v7.8h, v22.8h, v23.8h

    SUMSUB_AB   v24.8h, v25.8h, v0.8h, v1.8h
    SUMSUB_AB   v26.8h, v27.8h, v2.8h, v3.8h
    SUMSUB_AB   v0.8h, v1.8h, v4.8h, v5.8h
    SUMSUB_AB   v2.8h, v3.8h, v6.8h, v7.8h

    transpose   v4.4s, v6.4s, v24.4s, v26.4s
    transpose   v5.4s, v7.4s, v25.4s, v27.4s
    transpose   v24.4s, v26.4s, v0.4s, v2.4s
    transpose   v25.4s, v27.4s, v1.4s, v3.4s

    // order matters here: v4..v7 are read before they are overwritten
    abs         v0.8h, v4.8h
    abs         v1.8h, v5.8h
    abs         v2.8h, v6.8h
    abs         v3.8h, v7.8h
    abs         v4.8h, v24.8h
    abs         v5.8h, v25.8h
    abs         v6.8h, v26.8h
    abs         v7.8h, v27.8h

    umax        v0.8h, v0.8h, v2.8h
    umax        v1.8h, v1.8h, v3.8h
    umax        v2.8h, v4.8h, v6.8h
    umax        v3.8h, v5.8h, v7.8h

    add         v26.8h, v0.8h, v1.8h
    add         v27.8h, v2.8h, v3.8h
.endif

    SUMSUB_AB   v0.8h, v16.8h, v16.8h, v20.8h
    SUMSUB_AB   v1.8h, v17.8h, v17.8h, v21.8h
    SUMSUB_AB   v2.8h, v18.8h, v18.8h, v22.8h
    SUMSUB_AB   v3.8h, v19.8h, v19.8h, v23.8h

    transpose   v20.8h, v21.8h, v16.8h, v17.8h
    transpose   v4.8h, v5.8h, v0.8h, v1.8h
    transpose   v22.8h, v23.8h, v18.8h, v19.8h
    transpose   v6.8h, v7.8h, v2.8h, v3.8h

    SUMSUB_AB   v2.8h, v3.8h, v20.8h, v21.8h
    SUMSUB_AB   v24.8h, v25.8h, v4.8h, v5.8h
    SUMSUB_AB   v0.8h, v1.8h, v22.8h, v23.8h
    SUMSUB_AB   v4.8h, v5.8h, v6.8h, v7.8h

    transpose   v20.4s, v22.4s, v2.4s, v0.4s
    transpose   v21.4s, v23.4s, v3.4s, v1.4s
    transpose   v16.4s, v18.4s, v24.4s, v4.4s
    transpose   v17.4s, v19.4s, v25.4s, v5.4s

    SUMSUB_AB   v0.8h, v2.8h, v20.8h, v22.8h
    SUMSUB_AB   v1.8h, v3.8h, v21.8h, v23.8h
    SUMSUB_AB   v4.8h, v6.8h, v16.8h, v18.8h
    SUMSUB_AB   v5.8h, v7.8h, v17.8h, v19.8h

    transpose   v16.2d, v20.2d, v0.2d, v4.2d
    transpose   v17.2d, v21.2d, v1.2d, v5.2d
    transpose   v18.2d, v22.2d, v2.2d, v6.2d
    transpose   v19.2d, v23.2d, v3.2d, v7.2d

    // in-place absolute values on eight distinct registers
    abs         v16.8h, v16.8h
    abs         v17.8h, v17.8h
    abs         v18.8h, v18.8h
    abs         v19.8h, v19.8h
    abs         v20.8h, v20.8h
    abs         v21.8h, v21.8h
    abs         v22.8h, v22.8h
    abs         v23.8h, v23.8h

    umax        v16.8h, v16.8h, v20.8h
    umax        v17.8h, v17.8h, v21.8h
    umax        v18.8h, v18.8h, v22.8h
    umax        v19.8h, v19.8h, v23.8h

    add         v0.8h, v16.8h, v17.8h
    add         v1.8h, v18.8h, v19.8h

    ret
endfunc
.endm
|
||||||
|
|
||||||
|
// Emits the exported function pixel_hadamard_ac_<w>x<h>_sve (8-bit build).
// In:  x0 = block, x1 = byte stride
// Out: x0 = (sum of v28 lanes >> 1) in bits 0-31, with
//           (sum of v29 lanes >> 2) shifted up by 32 bits
// Calls hadamard_ac_8x8_sve once per 8x8 tile. Each call advances x0 by
// eight rows, so the pointer is stepped back before moving to the right
// column (w > 8) and before the remaining top-right tile (16x16 only).
.macro HADAMARD_AC_SVE w h
function pixel_hadamard_ac_\w\()x\h\()_sve, export=1
    mov         x4, x30                     // keep the return address across the bl calls
    ptrue       p0.h, vl8
    movrel      x5, mask_ac_4_8
    ld1         {v30.8h,v31.8h}, [x5]       // v30 = mask_ac4, v31 = mask_ac8
    movi        v28.16b, #0                 // clear both accumulators
    movi        v29.16b, #0

    bl          hadamard_ac_8x8_sve
.if \h > 8
    bl          hadamard_ac_8x8_sve
.endif
.if \w > 8
    sub         x0, x0, x1, lsl #3          // back up eight rows
    add         x0, x0, #8                  // move eight pixels to the right
    bl          hadamard_ac_8x8_sve
.endif
.if \w * \h == 256
    sub         x0, x0, x1, lsl #4          // back up sixteen rows
    bl          hadamard_ac_8x8_sve
.endif

    addv        s1, v29.4s
    addv        s0, v28.4s
    fmov        w1, s1
    fmov        w0, s0
    lsr         w1, w1, #2
    lsr         w0, w0, #1
    orr         x0, x0, x1, lsl #32
    ret         x4
endfunc
.endm
|
||||||
|
|
||||||
|
// v28: satd v29: sa8d v30: mask_ac4 v31: mask_ac8
|
||||||
|
// Processes one 8x8 tile for the pixel_hadamard_ac_*_sve functions.
// In:  x0 = tile, x1 = byte stride, p0 = predicate set by the caller
//      v30 = mask_ac4, v31 = mask_ac8
// Accumulates into v28.4s (satd) and v29.4s (sa8d); x0 ends eight rows on.
// Clobbers v0..v7 and v16..v23.
// Apart from the first group of abs instructions, which write eight
// distinct registers from untouched sources, the instruction order is kept
// as is because later steps reuse registers that earlier steps still read.
function hadamard_ac_8x8_sve
    ld1b        {z16.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z17.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z18.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z19.h}, p0/z, [x0]
    add         x0, x0, x1
    SUMSUBL_AB_SVE v0.8h, v1.8h, v16.8h, v17.8h
    ld1b        {z20.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z21.h}, p0/z, [x0]
    add         x0, x0, x1
    SUMSUBL_AB_SVE v2.8h, v3.8h, v18.8h, v19.8h
    ld1b        {z22.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z23.h}, p0/z, [x0]
    add         x0, x0, x1
    SUMSUBL_AB_SVE v4.8h, v5.8h, v20.8h, v21.8h
    SUMSUBL_AB_SVE v6.8h, v7.8h, v22.8h, v23.8h

    SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h

    transpose   v0.8h, v1.8h, v16.8h, v17.8h
    transpose   v2.8h, v3.8h, v18.8h, v19.8h
    transpose   v4.8h, v5.8h, v20.8h, v21.8h
    transpose   v6.8h, v7.8h, v22.8h, v23.8h

    SUMSUB_AB   v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h, v3.8h
    SUMSUB_AB   v20.8h, v21.8h, v4.8h, v5.8h
    SUMSUB_AB   v22.8h, v23.8h, v6.8h, v7.8h

    transpose   v0.4s, v2.4s, v16.4s, v18.4s
    transpose   v1.4s, v3.4s, v17.4s, v19.4s
    transpose   v4.4s, v6.4s, v20.4s, v22.4s
    transpose   v5.4s, v7.4s, v21.4s, v23.4s

    SUMSUB_AB   v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB   v17.8h, v19.8h, v1.8h, v3.8h
    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h

    // absolute values into v0..v7; v16..v23 stay intact for the second part
    abs         v0.8h, v16.8h
    abs         v1.8h, v17.8h
    abs         v2.8h, v18.8h
    abs         v3.8h, v19.8h
    abs         v4.8h, v20.8h
    abs         v5.8h, v21.8h
    abs         v6.8h, v22.8h
    abs         v7.8h, v23.8h

    add         v0.8h, v0.8h, v4.8h
    add         v1.8h, v1.8h, v5.8h
    and         v0.16b, v0.16b, v30.16b     // apply mask_ac4
    add         v2.8h, v2.8h, v6.8h
    add         v3.8h, v3.8h, v7.8h
    add         v0.8h, v0.8h, v2.8h
    add         v1.8h, v1.8h, v3.8h
    uadalp      v28.4s, v0.8h
    uadalp      v28.4s, v1.8h

    SUMSUB_AB   v6.8h, v7.8h, v23.8h, v19.8h
    SUMSUB_AB   v4.8h, v5.8h, v22.8h, v18.8h
    SUMSUB_AB   v2.8h, v3.8h, v21.8h, v17.8h
    SUMSUB_AB   v1.8h, v0.8h, v16.8h, v20.8h

    transpose   v16.2d, v17.2d, v6.2d, v7.2d
    transpose   v18.2d, v19.2d, v4.2d, v5.2d
    transpose   v20.2d, v21.2d, v2.2d, v3.2d

    abs         v16.8h, v16.8h
    abs         v17.8h, v17.8h
    abs         v18.8h, v18.8h
    abs         v19.8h, v19.8h
    abs         v20.8h, v20.8h
    abs         v21.8h, v21.8h

    transpose   v7.2d, v6.2d, v1.2d, v0.2d  // must read v1 before the umax below overwrites it

    umax        v3.8h, v16.8h, v17.8h
    umax        v2.8h, v18.8h, v19.8h
    umax        v1.8h, v20.8h, v21.8h

    SUMSUB_AB   v4.8h, v5.8h, v7.8h, v6.8h

    add         v2.8h, v2.8h, v3.8h
    add         v2.8h, v2.8h, v1.8h
    and         v4.16b, v4.16b, v31.16b     // apply mask_ac8
    add         v2.8h, v2.8h, v2.8h         // double the running sum
    abs         v5.8h, v5.8h
    abs         v4.8h, v4.8h
    add         v2.8h, v2.8h, v5.8h
    add         v2.8h, v2.8h, v4.8h
    uadalp      v29.4s, v2.8h
    ret
endfunc
|
||||||
|
|
||||||
|
// 8-bit build: instantiate the macro-generated functions.

// pixel_ssd_<w>x<h>_sve
SSD_FUNC_SVE    4, 4
SSD_FUNC_SVE    4, 8
SSD_FUNC_SVE    4, 16
SSD_FUNC_SVE    8, 4
SSD_FUNC_SVE    8, 8

// pixel_var_8x<h>_sve
pixel_var_sve_8 8
pixel_var_sve_8 16

// unexported helper pixel_sa8d_8x8_sve (macro argument left empty)
sa8d_satd_sve_8x8

// pixel_hadamard_ac_<w>x<h>_sve
HADAMARD_AC_SVE 8, 8
HADAMARD_AC_SVE 8, 16
HADAMARD_AC_SVE 16, 8
HADAMARD_AC_SVE 16, 16
|
||||||
|
|
||||||
|
#else /* BIT_DEPTH == 10 */
|
||||||
|
|
||||||
|
// High bit depth SSD, 4 pixels wide: first step.
// In:  x0, x2 = current row of each block; x1, x3 = strides counted in
//      16-bit pixels (they are scaled by two when added to the pointers).
// Sets p0 to the first four word lanes, starts the accumulator v0.4s with the
// squared differences of row 0 and leaves row 1 loaded in z16/z17.
.macro SSD_START_SVE_4
    ptrue       p0.s, vl4
    ld1h        {z16.s}, p0/z, [x0]         // halfwords are zero-extended to words
    add         x0, x0, x1, lsl #1
    ld1h        {z17.s}, p0/z, [x2]
    add         x2, x2, x3, lsl #1
    sub         v2.4s, v16.4s, v17.4s       // row 0 difference
    ld1h        {z16.s}, p0/z, [x0]         // row 1 is consumed by the next step
    add         x0, x0, x1, lsl #1
    ld1h        {z17.s}, p0/z, [x2]
    add         x2, x2, x3, lsl #1
    mul         v0.4s, v2.4s, v2.4s         // v0 = diff * diff
.endm
|
||||||
|
|
||||||
|
// High bit depth SSD, 4 pixels wide: middle step.
// Accumulates the row already held in z16/z17 into v0.4s and loads the
// following row. Relies on p0 as set by SSD_START_SVE_4.
.macro SSD_SVE_4
    sub         v2.4s, v16.4s, v17.4s       // difference of the pending row
    ld1h        {z16.s}, p0/z, [x0]
    add         x0, x0, x1, lsl #1
    ld1h        {z17.s}, p0/z, [x2]
    add         x2, x2, x3, lsl #1
    mla         v0.4s, v2.4s, v2.4s         // v0 += diff * diff
.endm
|
||||||
|
|
||||||
|
// High bit depth SSD, 4 pixels wide: last step.
// Accumulates the final row held in z16/z17; performs no further loads.
.macro SSD_END_SVE_4
    sub         v2.4s, v16.4s, v17.4s
    mla         v0.4s, v2.4s, v2.4s
.endm
|
||||||
|
|
||||||
|
// Emits the exported function pixel_ssd_<w>x<h>_sve (high bit depth build).
// In:  x0 = first block,  x1 = its stride in pixels
//      x2 = second block, x3 = its stride in pixels
// Out: w0 = sum over the block of squared pixel differences
// One START step, h-2 middle steps and one END step cover all h rows.
.macro SSD_FUNC_SVE w h
function pixel_ssd_\w\()x\h\()_sve, export=1
    SSD_START_SVE_\w
.rept \h-2
    SSD_SVE_\w
.endr
    SSD_END_SVE_\w

    addv        s0, v0.4s                   // fold the four 32-bit partial sums
    fmov        w0, s0
    ret
endfunc
.endm
|
||||||
|
|
||||||
|
// High bit depth build: only the 4-pixel-wide SSD functions are generated.
SSD_FUNC_SVE    4, 4
SSD_FUNC_SVE    4, 8
SSD_FUNC_SVE    4, 16
|
||||||
|
|
||||||
|
#endif /* BIT_DEPTH == 8 */
|
||||||
3040
common/aarch64/pixel-a.S
Normal file
3040
common/aarch64/pixel-a.S
Normal file
File diff suppressed because it is too large
Load Diff
191
common/aarch64/pixel.h
Normal file
191
common/aarch64/pixel.h
Normal file
@@ -0,0 +1,191 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* pixel.h: aarch64 pixel metrics
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
* Janne Grunau <janne-x264@jannau.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_AARCH64_PIXEL_H
|
||||||
|
#define X264_AARCH64_PIXEL_H
|
||||||
|
|
||||||
|
/* Symbol aliases: every x264_pixel_* name used below is routed through
 * x264_template(). Entries are listed from the largest block size down. */

/* SAD, NEON */
#define x264_pixel_sad_16x16_neon x264_template(pixel_sad_16x16_neon)
#define x264_pixel_sad_16x8_neon x264_template(pixel_sad_16x8_neon)
#define x264_pixel_sad_8x16_neon x264_template(pixel_sad_8x16_neon)
#define x264_pixel_sad_8x8_neon x264_template(pixel_sad_8x8_neon)
#define x264_pixel_sad_8x4_neon x264_template(pixel_sad_8x4_neon)
#define x264_pixel_sad_4x16_neon x264_template(pixel_sad_4x16_neon)
#define x264_pixel_sad_4x8_neon x264_template(pixel_sad_4x8_neon)
#define x264_pixel_sad_4x4_neon x264_template(pixel_sad_4x4_neon)

/* SAD x3, NEON (no 4x16 alias exists) */
#define x264_pixel_sad_x3_16x16_neon x264_template(pixel_sad_x3_16x16_neon)
#define x264_pixel_sad_x3_16x8_neon x264_template(pixel_sad_x3_16x8_neon)
#define x264_pixel_sad_x3_8x16_neon x264_template(pixel_sad_x3_8x16_neon)
#define x264_pixel_sad_x3_8x8_neon x264_template(pixel_sad_x3_8x8_neon)
#define x264_pixel_sad_x3_8x4_neon x264_template(pixel_sad_x3_8x4_neon)
#define x264_pixel_sad_x3_4x8_neon x264_template(pixel_sad_x3_4x8_neon)
#define x264_pixel_sad_x3_4x4_neon x264_template(pixel_sad_x3_4x4_neon)

/* SAD x4, NEON (no 4x16 alias exists) */
#define x264_pixel_sad_x4_16x16_neon x264_template(pixel_sad_x4_16x16_neon)
#define x264_pixel_sad_x4_16x8_neon x264_template(pixel_sad_x4_16x8_neon)
#define x264_pixel_sad_x4_8x16_neon x264_template(pixel_sad_x4_8x16_neon)
#define x264_pixel_sad_x4_8x8_neon x264_template(pixel_sad_x4_8x8_neon)
#define x264_pixel_sad_x4_8x4_neon x264_template(pixel_sad_x4_8x4_neon)
#define x264_pixel_sad_x4_4x8_neon x264_template(pixel_sad_x4_4x8_neon)
#define x264_pixel_sad_x4_4x4_neon x264_template(pixel_sad_x4_4x4_neon)

/* SATD, NEON */
#define x264_pixel_satd_16x16_neon x264_template(pixel_satd_16x16_neon)
#define x264_pixel_satd_16x8_neon x264_template(pixel_satd_16x8_neon)
#define x264_pixel_satd_8x16_neon x264_template(pixel_satd_8x16_neon)
#define x264_pixel_satd_8x8_neon x264_template(pixel_satd_8x8_neon)
#define x264_pixel_satd_8x4_neon x264_template(pixel_satd_8x4_neon)
#define x264_pixel_satd_4x16_neon x264_template(pixel_satd_4x16_neon)
#define x264_pixel_satd_4x8_neon x264_template(pixel_satd_4x8_neon)
#define x264_pixel_satd_4x4_neon x264_template(pixel_satd_4x4_neon)

/* SSD, NEON */
#define x264_pixel_ssd_16x16_neon x264_template(pixel_ssd_16x16_neon)
#define x264_pixel_ssd_16x8_neon x264_template(pixel_ssd_16x8_neon)
#define x264_pixel_ssd_8x16_neon x264_template(pixel_ssd_8x16_neon)
#define x264_pixel_ssd_8x8_neon x264_template(pixel_ssd_8x8_neon)
#define x264_pixel_ssd_8x4_neon x264_template(pixel_ssd_8x4_neon)
#define x264_pixel_ssd_4x16_neon x264_template(pixel_ssd_4x16_neon)
#define x264_pixel_ssd_4x8_neon x264_template(pixel_ssd_4x8_neon)
#define x264_pixel_ssd_4x4_neon x264_template(pixel_ssd_4x4_neon)

#if HAVE_DOTPROD
/* SAD, NEON with the dot-product extension */
#define x264_pixel_sad_16x16_neon_dotprod x264_template(pixel_sad_16x16_neon_dotprod)
#define x264_pixel_sad_16x8_neon_dotprod x264_template(pixel_sad_16x8_neon_dotprod)
#define x264_pixel_sad_x3_16x16_neon_dotprod x264_template(pixel_sad_x3_16x16_neon_dotprod)
#define x264_pixel_sad_x3_16x8_neon_dotprod x264_template(pixel_sad_x3_16x8_neon_dotprod)
#define x264_pixel_sad_x4_16x16_neon_dotprod x264_template(pixel_sad_x4_16x16_neon_dotprod)
#define x264_pixel_sad_x4_16x8_neon_dotprod x264_template(pixel_sad_x4_16x8_neon_dotprod)

/* SSD, NEON with the dot-product extension */
#define x264_pixel_ssd_16x16_neon_dotprod x264_template(pixel_ssd_16x16_neon_dotprod)
#define x264_pixel_ssd_16x8_neon_dotprod x264_template(pixel_ssd_16x8_neon_dotprod)
#define x264_pixel_ssd_8x16_neon_dotprod x264_template(pixel_ssd_8x16_neon_dotprod)
#define x264_pixel_ssd_8x8_neon_dotprod x264_template(pixel_ssd_8x8_neon_dotprod)
#define x264_pixel_ssd_8x4_neon_dotprod x264_template(pixel_ssd_8x4_neon_dotprod)
#endif // HAVE_DOTPROD

/* SSD, SVE */
#define x264_pixel_ssd_8x8_sve x264_template(pixel_ssd_8x8_sve)
#define x264_pixel_ssd_8x4_sve x264_template(pixel_ssd_8x4_sve)
#define x264_pixel_ssd_4x16_sve x264_template(pixel_ssd_4x16_sve)
#define x264_pixel_ssd_4x8_sve x264_template(pixel_ssd_4x8_sve)
#define x264_pixel_ssd_4x4_sve x264_template(pixel_ssd_4x4_sve)
|
||||||
|
/* Declares the eight block-size variants, 16x16 down to 4x4, of
 * x264_pixel_<metric>_<size>_<isa>. Each prototype gets the return type
 * `rtype` and the parenthesised parameter list `params`. */
#define DECL_PIXELS( rtype, metric, isa, params ) \
    rtype x264_pixel_##metric##_16x16_##isa params;\
    rtype x264_pixel_##metric##_16x8_##isa params;\
    rtype x264_pixel_##metric##_8x16_##isa params;\
    rtype x264_pixel_##metric##_8x8_##isa params;\
    rtype x264_pixel_##metric##_8x4_##isa params;\
    rtype x264_pixel_##metric##_4x16_##isa params;\
    rtype x264_pixel_##metric##_4x8_##isa params;\
    rtype x264_pixel_##metric##_4x4_##isa params;
|
||||||
|
/* Declares the five SVE SSD variants (8x8, 8x4, 4x16, 4x8, 4x4), each with
 * return type `rtype` and the parenthesised parameter list `params`. */
#define DECL_PIXELS_SSD_SVE( rtype, params ) \
    rtype x264_pixel_ssd_8x8_sve params;\
    rtype x264_pixel_ssd_8x4_sve params;\
    rtype x264_pixel_ssd_4x16_sve params;\
    rtype x264_pixel_ssd_4x8_sve params;\
    rtype x264_pixel_ssd_4x4_sve params;
|
||||||
|
|
||||||
|
/* All eight block sizes of a two-block metric:
 * int f( pixel *, intptr_t, pixel *, intptr_t ). */
#define DECL_X1( metric, isa ) \
    DECL_PIXELS( int, metric, isa, ( pixel *, intptr_t, pixel *, intptr_t ) )
|
||||||
|
/* The SVE SSD set, using the same two-block signature as DECL_X1. */
#define DECL_X1_SSD_SVE( ) \
    DECL_PIXELS_SSD_SVE( int, ( pixel *, intptr_t, pixel *, intptr_t ) )
|
||||||
|
|
||||||
|
/* Declares both multi-candidate families of a metric for all eight block
 * sizes: <metric>_x3 takes four pixel pointers, <metric>_x4 takes five; both
 * take an intptr_t and an int pointer after them and return void. */
#define DECL_X4( metric, isa ) \
    DECL_PIXELS( void, metric##_x3, isa, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\
    DECL_PIXELS( void, metric##_x4, isa, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )
|
||||||
|
|
||||||
|
DECL_X1( sad, neon )
|
||||||
|
DECL_X4( sad, neon )
|
||||||
|
DECL_X1( satd, neon )
|
||||||
|
DECL_X1( ssd, neon )
|
||||||
|
DECL_X1_SSD_SVE( )
|
||||||
|
|
||||||
|
#if HAVE_DOTPROD
|
||||||
|
DECL_X1( sad, neon_dotprod )
|
||||||
|
DECL_X4( sad, neon_dotprod )
|
||||||
|
DECL_X1( ssd, neon_dotprod )
|
||||||
|
#endif // HAVE_DOTPROD
|
||||||
|
|
||||||
|
#define x264_pixel_ssd_nv12_core_neon x264_template(pixel_ssd_nv12_core_neon)
|
||||||
|
void x264_pixel_ssd_nv12_core_neon( pixel *, intptr_t, pixel *, intptr_t, int, int, uint64_t *, uint64_t * );
|
||||||
|
|
||||||
|
#define x264_pixel_vsad_neon x264_template(pixel_vsad_neon)
|
||||||
|
int x264_pixel_vsad_neon( pixel *, intptr_t, int );
|
||||||
|
|
||||||
|
#if HAVE_DOTPROD
|
||||||
|
#define x264_pixel_vsad_neon_dotprod x264_template(pixel_vsad_neon_dotprod)
|
||||||
|
int x264_pixel_vsad_neon_dotprod( pixel *, intptr_t, int );
|
||||||
|
#endif // HAVE_DOTPROD
|
||||||
|
|
||||||
|
#define x264_pixel_sa8d_8x8_neon x264_template(pixel_sa8d_8x8_neon)
|
||||||
|
int x264_pixel_sa8d_8x8_neon ( pixel *, intptr_t, pixel *, intptr_t );
|
||||||
|
#define x264_pixel_sa8d_16x16_neon x264_template(pixel_sa8d_16x16_neon)
|
||||||
|
int x264_pixel_sa8d_16x16_neon( pixel *, intptr_t, pixel *, intptr_t );
|
||||||
|
#define x264_pixel_sa8d_satd_16x16_neon x264_template(pixel_sa8d_satd_16x16_neon)
|
||||||
|
uint64_t x264_pixel_sa8d_satd_16x16_neon( pixel *, intptr_t, pixel *, intptr_t );
|
||||||
|
#define x264_pixel_sa8d_8x8_sve x264_template(pixel_sa8d_8x8_sve)
|
||||||
|
int x264_pixel_sa8d_8x8_sve ( pixel *, intptr_t, pixel *, intptr_t );
|
||||||
|
|
||||||
|
#define x264_pixel_var_8x8_neon x264_template(pixel_var_8x8_neon)
|
||||||
|
uint64_t x264_pixel_var_8x8_neon ( pixel *, intptr_t );
|
||||||
|
#define x264_pixel_var_8x16_neon x264_template(pixel_var_8x16_neon)
|
||||||
|
uint64_t x264_pixel_var_8x16_neon ( pixel *, intptr_t );
|
||||||
|
#define x264_pixel_var_16x16_neon x264_template(pixel_var_16x16_neon)
|
||||||
|
uint64_t x264_pixel_var_16x16_neon( pixel *, intptr_t );
|
||||||
|
#define x264_pixel_var2_8x8_neon x264_template(pixel_var2_8x8_neon)
|
||||||
|
int x264_pixel_var2_8x8_neon ( pixel *, pixel *, int * );
|
||||||
|
#define x264_pixel_var2_8x16_neon x264_template(pixel_var2_8x16_neon)
|
||||||
|
int x264_pixel_var2_8x16_neon( pixel *, pixel *, int * );
|
||||||
|
#define x264_pixel_var_8x8_sve x264_template(pixel_var_8x8_sve)
|
||||||
|
uint64_t x264_pixel_var_8x8_sve ( pixel *, intptr_t );
|
||||||
|
#define x264_pixel_var_8x16_sve x264_template(pixel_var_8x16_sve)
|
||||||
|
uint64_t x264_pixel_var_8x16_sve ( pixel *, intptr_t );
|
||||||
|
|
||||||
|
|
||||||
|
#define x264_pixel_hadamard_ac_8x8_neon x264_template(pixel_hadamard_ac_8x8_neon)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_8x8_neon ( pixel *, intptr_t );
|
||||||
|
#define x264_pixel_hadamard_ac_8x16_neon x264_template(pixel_hadamard_ac_8x16_neon)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_8x16_neon ( pixel *, intptr_t );
|
||||||
|
#define x264_pixel_hadamard_ac_16x8_neon x264_template(pixel_hadamard_ac_16x8_neon)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_16x8_neon ( pixel *, intptr_t );
|
||||||
|
#define x264_pixel_hadamard_ac_16x16_neon x264_template(pixel_hadamard_ac_16x16_neon)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_16x16_neon( pixel *, intptr_t );
|
||||||
|
#define x264_pixel_hadamard_ac_8x8_sve x264_template(pixel_hadamard_ac_8x8_sve)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_8x8_sve ( pixel *, intptr_t );
|
||||||
|
#define x264_pixel_hadamard_ac_8x16_sve x264_template(pixel_hadamard_ac_8x16_sve)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_8x16_sve ( pixel *, intptr_t );
|
||||||
|
#define x264_pixel_hadamard_ac_16x8_sve x264_template(pixel_hadamard_ac_16x8_sve)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_16x8_sve ( pixel *, intptr_t );
|
||||||
|
#define x264_pixel_hadamard_ac_16x16_sve x264_template(pixel_hadamard_ac_16x16_sve)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_16x16_sve( pixel *, intptr_t );
|
||||||
|
|
||||||
|
|
||||||
|
#define x264_pixel_ssim_4x4x2_core_neon x264_template(pixel_ssim_4x4x2_core_neon)
|
||||||
|
void x264_pixel_ssim_4x4x2_core_neon( const pixel *, intptr_t,
|
||||||
|
const pixel *, intptr_t,
|
||||||
|
int sums[2][4] );
|
||||||
|
#define x264_pixel_ssim_end4_neon x264_template(pixel_ssim_end4_neon)
|
||||||
|
float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
|
||||||
|
|
||||||
|
#define x264_pixel_asd8_neon x264_template(pixel_asd8_neon)
|
||||||
|
int x264_pixel_asd8_neon( pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
|
||||||
|
#endif
|
||||||
908
common/aarch64/predict-a.S
Normal file
908
common/aarch64/predict-a.S
Normal file
@@ -0,0 +1,908 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
 * predict-a.S: aarch64 intra prediction
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
* Mans Rullgard <mans@mansr.com>
|
||||||
|
* Janne Grunau <janne-x264@jannau.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "asm.S"
|
||||||
|
|
||||||
|
// Eight 16-bit weights: the sequence 1..4 stored twice.
const p8weight, align=4
    .short      1, 2, 3, 4
    .short      1, 2, 3, 4
endconst
|
||||||
|
const p16weight, align=4
|
||||||
|
.short 1, 2, 3, 4, 5, 6, 7, 8
|
||||||
|
endconst
|
||||||
|
|
||||||
|
// ldcol.8 vd, xn, xm, n=8, hi=0
//
// Gathers single bytes into consecutive byte lanes of \vd. Every load reads
// one byte from [\xn] and then post-increments \xn by \xm, so \xn ends up
// advanced by (number of loads) * \xm.
//   n == 8          : lanes 0-7 are loaded (8 loads)
//   n != 8, hi == 0 : only lanes 0-3 are loaded
//   n != 8, hi == 1 : only lanes 4-7 are loaded
// Lanes that are not loaded keep whatever \vd held before.
.macro ldcol.8  vd,  xn,  xm,  n=8,  hi=0
.if \n == 8 || \hi == 0
    ld1         {\vd\().b}[0],  [\xn],  \xm
    ld1         {\vd\().b}[1],  [\xn],  \xm
    ld1         {\vd\().b}[2],  [\xn],  \xm
    ld1         {\vd\().b}[3],  [\xn],  \xm
.endif
.if \n == 8 || \hi == 1
    ld1         {\vd\().b}[4],  [\xn],  \xm
    ld1         {\vd\().b}[5],  [\xn],  \xm
    ld1         {\vd\().b}[6],  [\xn],  \xm
    ld1         {\vd\().b}[7],  [\xn],  \xm
.endif
.endm
|
||||||
|
|
||||||
|
// ldcol.16 vd, xn, xm
//
// 16-lane variant of ldcol.8: lanes 0-7 are filled through ldcol.8, lanes
// 8-15 by eight further single-byte loads. \xn is post-incremented by \xm
// after each of the 16 loads.
.macro ldcol.16 vd,  xn,  xm
    ldcol.8     \vd,  \xn,  \xm
    ld1         {\vd\().b}[ 8], [\xn],  \xm
    ld1         {\vd\().b}[ 9], [\xn],  \xm
    ld1         {\vd\().b}[10], [\xn],  \xm
    ld1         {\vd\().b}[11], [\xn],  \xm
    ld1         {\vd\().b}[12], [\xn],  \xm
    ld1         {\vd\().b}[13], [\xn],  \xm
    ld1         {\vd\().b}[14], [\xn],  \xm
    ld1         {\vd\().b}[15], [\xn],  \xm
.endm
|
||||||
|
|
||||||
|
|
||||||
|
// predict_4x4_h_aarch64
//   x0 = pointer to the 4x4 destination block (rows FDEC_STRIDE bytes apart)
//
// Horizontal prediction using general-purpose registers only: each row is
// filled with the byte located just left of it ([x0 + row*FDEC_STRIDE - 1]).
// Multiplying the zero-extended byte by 0x01010101 copies it into all four
// bytes of a word, which is then written with one 32-bit store.
// Scratch: w1-w5.
function predict_4x4_h_aarch64, export=1
    ldurb       w1,  [x0, #0*FDEC_STRIDE-1]     // left neighbour, row 0
    mov         w5,  #0x01010101                // byte-splat multiplier
    ldrb        w2,  [x0, #1*FDEC_STRIDE-1]     // row 1
    ldrb        w3,  [x0, #2*FDEC_STRIDE-1]     // row 2
    mul         w1,  w1,  w5
    ldrb        w4,  [x0, #3*FDEC_STRIDE-1]     // row 3
    mul         w2,  w2,  w5
    str         w1,  [x0, #0*FDEC_STRIDE]
    mul         w3,  w3,  w5
    str         w2,  [x0, #1*FDEC_STRIDE]
    mul         w4,  w4,  w5
    str         w3,  [x0, #2*FDEC_STRIDE]
    str         w4,  [x0, #3*FDEC_STRIDE]
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_4x4_v_aarch64
//   x0 = pointer to the 4x4 destination block
//
// Vertical prediction: the four bytes directly above the block (row -1) are
// read as one 32-bit word and written unchanged to rows 0-3.
// Scratch: w1.
function predict_4x4_v_aarch64, export=1
    ldur        w1,  [x0, #-FDEC_STRIDE]        // row above the block
.irp row, 0, 1, 2, 3
    str         w1,  [x0, #\row * FDEC_STRIDE]
.endr
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_4x4_dc_neon
//   x0 = pointer to the 4x4 destination block
//
// DC prediction from both edges: every pixel becomes
//   (sum of 4 bytes above + sum of 4 bytes to the left + 4) >> 3.
// The left column is summed in general-purpose registers while the top row
// is summed with NEON.
// Scratch: x1, w4-w7, v0, v1.
function predict_4x4_dc_neon, export=1
    sub         x1,  x0,  #FDEC_STRIDE          // x1 -> row above the block
    ldurb       w4,  [x0, #-1 + 0 * FDEC_STRIDE]
    ldrb        w5,  [x0, #-1 + 1 * FDEC_STRIDE]
    ldrb        w6,  [x0, #-1 + 2 * FDEC_STRIDE]
    ldrb        w7,  [x0, #-1 + 3 * FDEC_STRIDE]
    add         w4,  w4,  w5
    ldr         s0,  [x1]                       // 4 top bytes; remaining lanes of v0 are zero
    add         w6,  w6,  w7
    uaddlv      h0,  v0.8b                      // sum of the 4 top bytes (other 4 lanes add 0)
    add         w4,  w4,  w6                    // w4 = sum of the 4 left bytes
    dup         v0.4h,  v0.h[0]
    dup         v1.4h,  w4
    add         v0.4h,  v0.4h,  v1.4h           // top sum + left sum
    rshrn       v0.8b,  v0.8h,  #3              // (sum + 4) >> 3, narrowed to bytes
.irp row, 0, 1, 2, 3
    str         s0,  [x0, #\row * FDEC_STRIDE]
.endr
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_4x4_dc_top_neon
//   x0 = pointer to the 4x4 destination block
//
// DC prediction from the top edge only: every pixel becomes
//   (sum of the 4 bytes above the block + 2) >> 2.
// Scratch: x1, v0.
function predict_4x4_dc_top_neon, export=1
    sub         x1,  x0,  #FDEC_STRIDE          // x1 -> row above the block
    ldr         s0,  [x1]                       // 4 top bytes; remaining lanes of v0 are zero
    uaddlv      h0,  v0.8b                      // sum of the 4 top bytes (other 4 lanes add 0)
    dup         v0.4h,  v0.h[0]
    rshrn       v0.8b,  v0.8h,  #2              // (sum + 2) >> 2, narrowed to bytes
    str         s0,  [x0]
    str         s0,  [x0, #1 * FDEC_STRIDE]
    str         s0,  [x0, #2 * FDEC_STRIDE]
    str         s0,  [x0, #3 * FDEC_STRIDE]
    ret                                         // (a duplicated, unreachable second ret was dropped)
endfunc
|
||||||
|
|
||||||
|
// predict_4x4_ddr_neon
//   x0 = pointer to the 4x4 destination block
//
// Diagonal-down-right prediction. The left column, the top-left corner and
// the top row are packed into one vector; three copies shifted by one byte
// each (a, b, c) are combined as (a + 2*b + c + 2) >> 2. Row 3 takes result
// bytes 0-3, and each row above it takes the window one byte further along.
// x0 is advanced by 3*FDEC_STRIDE by the post-indexed stores.
// Scratch: x1, x7, v0-v4.
function predict_4x4_ddr_neon, export=1
    sub         x1,  x0,  #FDEC_STRIDE+1
    mov         x7,  #FDEC_STRIDE
    ld1         {v0.8b}, [x1], x7               // # -FDEC_STRIDE-1 (corner + top row)
    ld1r        {v1.8b}, [x1], x7               // #0*FDEC_STRIDE-1 (left, row 0)
    ld1r        {v2.8b}, [x1], x7               // #1*FDEC_STRIDE-1 (left, row 1)
    ext         v0.8b,  v1.8b,  v0.8b,  #7      // prepend left[0]
    ld1r        {v3.8b}, [x1], x7               // #2*FDEC_STRIDE-1 (left, row 2)
    ext         v0.8b,  v2.8b,  v0.8b,  #7      // a: prepend left[1]
    ld1r        {v4.8b}, [x1], x7               // #3*FDEC_STRIDE-1 (left, row 3)
    ext         v1.8b,  v3.8b,  v0.8b,  #7      // b: prepend left[2]
    ext         v2.8b,  v4.8b,  v1.8b,  #7      // c: prepend left[3]
    uaddl       v0.8h,  v0.8b,  v1.8b           // a + b
    uaddl       v1.8h,  v1.8b,  v2.8b           // b + c
    add         v0.8h,  v0.8h,  v1.8h           // a + 2*b + c
    rshrn       v0.8b,  v0.8h,  #2              // rounded >> 2

    ext         v3.8b,  v0.8b,  v0.8b,  #3      // row 0 = bytes 3-6
    ext         v2.8b,  v0.8b,  v0.8b,  #2      // row 1 = bytes 2-5
    ext         v1.8b,  v0.8b,  v0.8b,  #1      // row 2 = bytes 1-4

    str         s3,  [x0], #FDEC_STRIDE
    str         s2,  [x0], #FDEC_STRIDE
    str         s1,  [x0], #FDEC_STRIDE
    str         s0,  [x0]                       // row 3 = bytes 0-3
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_4x4_ddl_neon
//   x0 = pointer to the 4x4 destination block
//
// Diagonal-down-left prediction from the 8 bytes t[0..7] found in the row
// above the block. Each output byte is
//   urhadd( uhadd(t[i], t[i+2]), t[i+1] )
// with t[7] repeated past the end. Row r receives filtered bytes r..r+3.
// x0 is stepped back one row for the load and restored by its
// post-increment; the stores then advance it by 3*FDEC_STRIDE.
// Scratch: x7, v0-v3.
function predict_4x4_ddl_neon, export=1
    sub         x0,  x0,  #FDEC_STRIDE
    mov         x7,  #FDEC_STRIDE
    ld1         {v0.8b}, [x0], x7               // t[0..7]; x0 is back at row 0
    dup         v3.8b,  v0.b[7]                 // t[7] in every lane (padding)
    ext         v1.8b,  v0.8b,  v0.8b,  #1      // t[i+1]
    ext         v2.8b,  v0.8b,  v3.8b,  #2      // t[i+2], padded with t[7]
    uhadd       v0.8b,  v0.8b,  v2.8b           // (t[i] + t[i+2]) >> 1
    urhadd      v0.8b,  v0.8b,  v1.8b           // rounding average with t[i+1]
    str         s0,  [x0], #FDEC_STRIDE         // row 0
    ext         v1.8b,  v0.8b,  v0.8b,  #1
    ext         v2.8b,  v0.8b,  v0.8b,  #2
    str         s1,  [x0], #FDEC_STRIDE         // row 1
    ext         v3.8b,  v0.8b,  v0.8b,  #3
    str         s2,  [x0], #FDEC_STRIDE         // row 2
    str         s3,  [x0]                       // row 3
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_8x8_dc_neon
//   x0 = pointer to the 8x8 destination block
//   x1 = pointer to a neighbour ("edge") byte buffer; bytes 7-14 and 16-23
//        are read here (presumably the left column and the top row -
//        confirm against the code that fills the buffer)
//
// Every pixel becomes (sum of edge[7..14] + sum of edge[16..23] + 8) >> 4.
// x0 is advanced by 8*FDEC_STRIDE, x1 by 16. Scratch: x7, v0, v1.
function predict_8x8_dc_neon, export=1
    mov         x7,  #FDEC_STRIDE
    ld1         {v0.16b}, [x1], #16             // edge[0..15]
    ld1         {v1.8b},  [x1]                  // edge[16..23]
    ext         v0.16b, v0.16b, v0.16b, #7      // rotate edge[7..14] into lanes 0-7
    uaddlv      h1,  v1.8b                      // sum of edge[16..23]
    uaddlv      h0,  v0.8b                      // sum of edge[7..14]
    add         v0.8h,  v0.8h,  v1.8h
    dup         v0.8h,  v0.h[0]
    rshrn       v0.8b,  v0.8h,  #4              // (sum + 8) >> 4
.rept 8
    st1         {v0.8b}, [x0], x7
.endr
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_8x8_h_neon
//   x0 = pointer to the 8x8 destination block
//   x1 = pointer to the neighbour ("edge") byte buffer
//
// Horizontal prediction: row r (0..7) is filled with byte edge[14 - r],
// i.e. bytes 14 down to 7 of the buffer are broadcast to rows 0 to 7.
// x0 is advanced by 8*FDEC_STRIDE. Scratch: x7, v0-v7, v16.
function predict_8x8_h_neon, export=1
    mov         x7,  #FDEC_STRIDE
    ld1         {v16.16b}, [x1]                 // edge[0..15]
    dup         v0.8b,  v16.b[14]
    dup         v1.8b,  v16.b[13]
    st1         {v0.8b}, [x0], x7               // row 0
    dup         v2.8b,  v16.b[12]
    st1         {v1.8b}, [x0], x7               // row 1
    dup         v3.8b,  v16.b[11]
    st1         {v2.8b}, [x0], x7               // row 2
    dup         v4.8b,  v16.b[10]
    st1         {v3.8b}, [x0], x7               // row 3
    dup         v5.8b,  v16.b[9]
    st1         {v4.8b}, [x0], x7               // row 4
    dup         v6.8b,  v16.b[8]
    st1         {v5.8b}, [x0], x7               // row 5
    dup         v7.8b,  v16.b[7]
    st1         {v6.8b}, [x0], x7               // row 6
    st1         {v7.8b}, [x0], x7               // row 7
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_8x8_v_neon
//   x0 = pointer to the 8x8 destination block
//   x1 = pointer to the neighbour ("edge") byte buffer
//
// Vertical prediction: the 8 bytes edge[16..23] are copied to each of the
// 8 rows. x0 is advanced by 8*FDEC_STRIDE, x1 by 16. Scratch: x7, v0.
function predict_8x8_v_neon, export=1
    add         x1,  x1,  #16                   // x1 -> edge[16]
    mov         x7,  #FDEC_STRIDE
    ld1         {v0.8b}, [x1]
.rept 8
    st1         {v0.8b}, [x0], x7
.endr
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_8x8_ddl_neon
//   x0 = pointer to the 8x8 destination block
//   x1 = pointer to the neighbour ("edge") byte buffer
//
// Diagonal-down-left prediction from the 16 bytes t[0..15] = edge[16..31].
// Filtered value f[i] = urhadd( t[i], uhadd(t[i-1], t[i+1]) ), where the
// successor of t[15] is t[15] itself and the predecessor of t[0] is 0
// (f[0] is never stored). Row r receives f[r+1 .. r+8].
// x0 is advanced by 8*FDEC_STRIDE, x1 by 16. Scratch: x7, v0-v7.
function predict_8x8_ddl_neon, export=1
    add         x1,  x1,  #16
    mov         x7,  #FDEC_STRIDE
    ld1         {v0.16b}, [x1]                  // t[0..15]
    movi        v3.16b, #0
    dup         v2.16b, v0.b[15]                // t[15] padding
    ext         v4.16b, v3.16b, v0.16b, #15     // t[i-1] (0 in lane 0)
    ext         v2.16b, v0.16b, v2.16b, #1      // t[i+1] (t[15] in lane 15)
    uhadd       v4.16b, v4.16b, v2.16b          // (t[i-1] + t[i+1]) >> 1
    urhadd      v0.16b, v0.16b, v4.16b          // f[i]
    ext         v1.16b, v0.16b, v0.16b, #1
    ext         v2.16b, v0.16b, v0.16b, #2
    st1         {v1.8b}, [x0], x7               // row 0 = f[1..8]
    ext         v3.16b, v0.16b, v0.16b, #3
    st1         {v2.8b}, [x0], x7               // row 1
    ext         v4.16b, v0.16b, v0.16b, #4
    st1         {v3.8b}, [x0], x7               // row 2
    ext         v5.16b, v0.16b, v0.16b, #5
    st1         {v4.8b}, [x0], x7               // row 3
    ext         v6.16b, v0.16b, v0.16b, #6
    st1         {v5.8b}, [x0], x7               // row 4
    ext         v7.16b, v0.16b, v0.16b, #7
    st1         {v6.8b}, [x0], x7               // row 5
    ext         v0.16b, v0.16b, v0.16b, #8
    st1         {v7.8b}, [x0], x7               // row 6
    st1         {v0.8b}, [x0], x7               // row 7 = f[8..15]
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_8x8_ddr_neon
//   x0 = pointer to the 8x8 destination block
//   x1 = pointer to the neighbour ("edge") byte buffer (32 bytes are read)
//
// Diagonal-down-right prediction. With e[] = edge[], lane i of the filtered
// vector is urhadd( e[8+i], uhadd(e[7+i], e[9+i]) ). Rows are written from
// the bottom (row 7 = lanes 0-7) upwards, each row one lane further along,
// so x7 holds a negative stride. x0 ends one row above the block.
// Scratch: x7, v0-v7.
function predict_8x8_ddr_neon, export=1
    ld1         {v0.16b,v1.16b}, [x1]           // edge[0..31]
    ext         v2.16b, v0.16b, v1.16b, #7      // e[7+i]
    ext         v4.16b, v0.16b, v1.16b, #9      // e[9+i]
    ext         v3.16b, v0.16b, v1.16b, #8      // e[8+i]

    uhadd       v2.16b, v2.16b, v4.16b
    urhadd      v7.16b, v3.16b, v2.16b          // filtered diagonal values

    add         x0,  x0,  #7*FDEC_STRIDE        // start at the last row
    mov         x7,  #-1*FDEC_STRIDE            // and walk upwards

    ext         v6.16b, v7.16b, v7.16b, #1
    st1         {v7.8b}, [x0], x7               // row 7
    ext         v5.16b, v7.16b, v7.16b, #2
    st1         {v6.8b}, [x0], x7               // row 6
    ext         v4.16b, v7.16b, v7.16b, #3
    st1         {v5.8b}, [x0], x7               // row 5
    ext         v3.16b, v7.16b, v7.16b, #4
    st1         {v4.8b}, [x0], x7               // row 4
    ext         v2.16b, v7.16b, v7.16b, #5
    st1         {v3.8b}, [x0], x7               // row 3
    ext         v1.16b, v7.16b, v7.16b, #6
    st1         {v2.8b}, [x0], x7               // row 2
    ext         v0.16b, v7.16b, v7.16b, #7
    st1         {v1.8b}, [x0], x7               // row 1
    st1         {v0.8b}, [x0], x7               // row 0
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_8x8_vl_neon
//   x0 = pointer to the 8x8 destination block
//   x1 = pointer to the neighbour ("edge") byte buffer
//
// Vertical-left prediction from t[0..15] = edge[16..31]. Two vectors are
// built:
//   v3[i] = urhadd(t[i], t[i+1])                       (2-tap, even rows)
//   v0[i] = urhadd(t[i], uhadd(t[i-1], t[i+1]))        (3-tap, odd rows)
// Even row 2k stores v3[k..k+7]; odd row 2k+1 stores v0[k+1..k+8].
//
// NOTE: v1 and v2 are read by the two ext instructions before this function
// writes them. The stale bytes only reach lane 0 of v0 and lane 15 of v0/v3,
// and none of those lanes is ever stored.
// x0 is advanced by 8*FDEC_STRIDE, x1 by 16. Scratch: x7, v0-v7.
function predict_8x8_vl_neon, export=1
    add         x1,  x1,  #16
    mov         x7,  #FDEC_STRIDE

    ld1         {v0.16b}, [x1]                  // t[0..15]
    ext         v1.16b, v1.16b, v0.16b, #15     // t[i-1] (lane 0 undefined, unused)
    ext         v2.16b, v0.16b, v2.16b, #1      // t[i+1] (lane 15 undefined, unused)

    uhadd       v1.16b, v1.16b, v2.16b
    urhadd      v3.16b, v0.16b, v2.16b          // 2-tap average

    urhadd      v0.16b, v0.16b, v1.16b          // 3-tap filter

    ext         v4.16b, v0.16b, v0.16b, #1
    st1         {v3.8b}, [x0], x7               // row 0
    ext         v5.16b, v3.16b, v3.16b, #1
    st1         {v4.8b}, [x0], x7               // row 1
    ext         v6.16b, v0.16b, v0.16b, #2
    st1         {v5.8b}, [x0], x7               // row 2
    ext         v7.16b, v3.16b, v3.16b, #2
    st1         {v6.8b}, [x0], x7               // row 3
    ext         v4.16b, v0.16b, v0.16b, #3
    st1         {v7.8b}, [x0], x7               // row 4
    ext         v5.16b, v3.16b, v3.16b, #3
    st1         {v4.8b}, [x0], x7               // row 5
    ext         v6.16b, v0.16b, v0.16b, #4
    st1         {v5.8b}, [x0], x7               // row 6
    st1         {v6.8b}, [x0], x7               // row 7
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_8x8_vr_neon
//   x0 = pointer to the 8x8 destination block
//   x1 = pointer to the neighbour ("edge") byte buffer
//
// Vertical-right prediction from the 16 bytes e[0..15] = edge[8..23].
// A 2-tap rounding average and a 3-tap (uhadd + urhadd) filter are taken
// over byte-rotated copies of e[]. Rows 0 and 1 come from the upper halves
// of the two filtered vectors; each later pair of rows shifts in one more
// byte taken from the de-interleaved (uzp1/uzp2) lower half of the 3-tap
// vector.
// x0 is advanced by 8*FDEC_STRIDE, x1 by 8. Scratch: x7, v0-v7.
function predict_8x8_vr_neon, export=1
    add         x1,  x1,  #8
    mov         x7,  #FDEC_STRIDE
    ld1         {v2.16b}, [x1]                  // e[0..15]

    ext         v1.16b, v2.16b, v2.16b, #14     // e rotated by 14 bytes
    ext         v0.16b, v2.16b, v2.16b, #15     // e rotated by 15 bytes

    uhadd       v3.16b, v2.16b, v1.16b
    urhadd      v2.16b, v2.16b, v0.16b          // 2-tap average
    urhadd      v0.16b, v0.16b, v3.16b          // 3-tap filter

    ext         v1.16b, v2.16b, v2.16b, #8      // upper half of the 2-tap vector
    uzp1        v2.8b,  v0.8b,  v0.8b           // even bytes of the 3-tap lower half
    uzp2        v3.8b,  v0.8b,  v0.8b           // odd bytes of the 3-tap lower half
    ext         v0.16b, v0.16b, v0.16b, #8      // upper half of the 3-tap vector

    st1         {v1.8b}, [x0], x7               // row 0
    st1         {v0.8b}, [x0], x7               // row 1
    ext         v4.8b,  v3.8b,  v1.8b,  #7
    ext         v5.8b,  v2.8b,  v0.8b,  #7
    st1         {v4.8b}, [x0], x7               // row 2
    st1         {v5.8b}, [x0], x7               // row 3
    ext         v6.8b,  v3.8b,  v1.8b,  #6
    ext         v7.8b,  v2.8b,  v0.8b,  #6
    st1         {v6.8b}, [x0], x7               // row 4
    st1         {v7.8b}, [x0], x7               // row 5
    ext         v1.8b,  v3.8b,  v1.8b,  #5
    ext         v0.8b,  v2.8b,  v0.8b,  #5
    st1         {v1.8b}, [x0], x7               // row 6
    st1         {v0.8b}, [x0], x7               // row 7
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_8x8_hd_neon
//   x0 = pointer to the 8x8 destination block
//   x1 = pointer to the neighbour ("edge") byte buffer
//
// Horizontal-down prediction from the 16 bytes e[0..15] = edge[7..22].
//   v4[i] = urhadd(e[i], e[i+1])                       (2-tap)
//   v0[i] = urhadd(uhadd(e[i], e[i+2]), e[i+1])        (3-tap)
// The lower halves of v4 and v0 are interleaved byte-wise into v16/v17.
// Rows are 8-byte windows that slide two bytes per row over the sequence
// v16 : v17 : (upper half of v0); row 7 is v16 itself, row 0 the last window.
// x0 is advanced by 8*FDEC_STRIDE, x1 by 7. Scratch: x7, v0-v5, v7, v16, v17.
function predict_8x8_hd_neon, export=1
    add         x1,  x1,  #7
    mov         x7,  #FDEC_STRIDE

    ld1         {v1.16b}, [x1]                  // e[0..15]
    ext         v3.16b, v1.16b, v1.16b, #1      // e[i+1]
    ext         v2.16b, v1.16b, v1.16b, #2      // e[i+2]

    urhadd      v4.16b, v1.16b, v3.16b          // 2-tap average

    uhadd       v1.16b, v1.16b, v2.16b
    urhadd      v0.16b, v1.16b, v3.16b          // 3-tap filter

    zip1        v16.8b, v4.8b,  v0.8b           // interleave 2-tap / 3-tap, low part
    zip2        v17.8b, v4.8b,  v0.8b           // interleave 2-tap / 3-tap, high part
    ext         v7.16b, v0.16b, v0.16b, #8      // upper half of the 3-tap vector

    ext         v0.8b,  v17.8b, v7.8b,  #6
    ext         v1.8b,  v17.8b, v7.8b,  #4
    st1         {v0.8b},  [x0], x7              // row 0
    ext         v2.8b,  v17.8b, v7.8b,  #2
    st1         {v1.8b},  [x0], x7              // row 1
    st1         {v2.8b},  [x0], x7              // row 2
    ext         v3.8b,  v16.8b, v17.8b, #6
    st1         {v17.8b}, [x0], x7              // row 3
    ext         v4.8b,  v16.8b, v17.8b, #4
    st1         {v3.8b},  [x0], x7              // row 4
    ext         v5.8b,  v16.8b, v17.8b, #2
    st1         {v4.8b},  [x0], x7              // row 5
    st1         {v5.8b},  [x0], x7              // row 6
    st1         {v16.8b}, [x0], x7              // row 7

    ret
endfunc
|
||||||
|
|
||||||
|
// predict_8x8_hu_neon
//   x0 = pointer to the 8x8 destination block
//   x1 = pointer to the neighbour ("edge") byte buffer
//
// Horizontal-up prediction from the 8 bytes edge[7..14]. They are reversed
// into l[0..7] (l[0] = edge[14]) and padded past the end with edge[7].
//   v0[i] = urhadd(l[i], l[i+1])                       (2-tap)
//   v1[i] = urhadd(uhadd(l[i], l[i+2]), l[i+1])        (3-tap)
// The two are interleaved byte-wise into v16/v17. Row r stores the 8-byte
// window starting 2*r bytes into v16 : v17 : (last byte pair of v17 repeated).
// x0 is advanced by 7*FDEC_STRIDE, x1 by 7.
// Scratch: x7, v0-v2, v4-v7, v16-v18.
function predict_8x8_hu_neon, export=1
    add         x1,  x1,  #7
    mov         x7,  #FDEC_STRIDE
    ld1         {v7.8b}, [x1]                   // edge[7..14]
    dup         v6.8b,  v7.b[0]                 // edge[7] in every lane (padding)
    rev64       v7.8b,  v7.8b                   // l[0..7]

    ext         v4.8b,  v7.8b,  v6.8b,  #2      // l[i+2], padded
    ext         v2.8b,  v7.8b,  v6.8b,  #1      // l[i+1], padded

    uhadd       v5.8b,  v7.8b,  v4.8b
    urhadd      v0.8b,  v2.8b,  v7.8b           // 2-tap average
    urhadd      v1.8b,  v5.8b,  v2.8b           // 3-tap filter

    zip1        v16.8b, v0.8b,  v1.8b
    zip2        v17.8b, v0.8b,  v1.8b

    dup         v18.4h, v17.h[3]                // last byte pair, used as tail padding

    ext         v0.8b,  v16.8b, v17.8b, #2
    ext         v1.8b,  v16.8b, v17.8b, #4
    ext         v2.8b,  v16.8b, v17.8b, #6
    st1         {v16.8b}, [x0], x7              // row 0
    st1         {v0.8b},  [x0], x7              // row 1
    st1         {v1.8b},  [x0], x7              // row 2
    st1         {v2.8b},  [x0], x7              // row 3

    ext         v4.8b,  v17.8b, v18.8b, #2
    ext         v5.8b,  v17.8b, v18.8b, #4
    ext         v6.8b,  v17.8b, v18.8b, #6
    st1         {v17.8b}, [x0], x7              // row 4
    st1         {v4.8b},  [x0], x7              // row 5
    st1         {v5.8b},  [x0], x7              // row 6
    st1         {v6.8b},  [x0]                  // row 7
    ret
endfunc
|
||||||
|
|
||||||
|
|
||||||
|
// predict_8x8c_dc_top_neon
//   x0 = pointer to the 8x8 chroma destination block
//
// DC prediction from the top edge only. The 8 bytes above the block are
// summed as two groups of four; each group gives (sum + 2) >> 2. The two
// DC bytes are broadcast into v2/v3 and combined into v0/v1 by the
// `transpose` macro (defined outside this file; presumably v0 = v1 =
// {left DC x4, right DC x4} - confirm in asm.S). Control then jumps to the
// shared store tail pred8x8c_dc_end inside predict_8x8c_dc_neon, which
// expects v0 (rows 0-3), v1 (rows 4-7) and x1 = FDEC_STRIDE.
// Scratch: x1, x2, v0-v3 (plus whatever the tail uses).
function predict_8x8c_dc_top_neon, export=1
    sub         x2,  x0,  #FDEC_STRIDE          // x2 -> row above the block
    mov         x1,  #FDEC_STRIDE
    ld1         {v0.8b},  [x2]
    uaddlp      v0.4h,  v0.8b                   // pairwise sums
    addp        v0.4h,  v0.4h,  v0.4h           // sums of 4: left half, right half
    rshrn       v0.8b,  v0.8h,  #2              // (sum + 2) >> 2
    dup         v3.8b,  v0.b[1]                 // right-half DC
    dup         v2.8b,  v0.b[0]                 // left-half DC
    transpose   v0.2s,  v1.2s,  v2.2s,  v3.2s
    b           pred8x8c_dc_end
endfunc
|
||||||
|
|
||||||
|
// predict_8x8c_dc_left_neon
//   x0 = pointer to the 8x8 chroma destination block
//
// DC prediction from the left edge only. Rows 0-3 are filled with
// (sum of the left neighbours of rows 0-3 + 2) >> 2 and rows 4-7 with the
// same expression over rows 4-7. The stores are done by the shared tail
// pred8x8c_dc_end inside predict_8x8c_dc_neon (v0 = rows 0-3, v1 = rows 4-7,
// x1 = FDEC_STRIDE).
// Scratch: x1, w2-w9, v0, v1 (plus whatever the tail uses).
function predict_8x8c_dc_left_neon, export=1
    ldurb       w2,  [x0, #0 * FDEC_STRIDE - 1]
    ldrb        w3,  [x0, #1 * FDEC_STRIDE - 1]
    ldrb        w4,  [x0, #2 * FDEC_STRIDE - 1]
    ldrb        w5,  [x0, #3 * FDEC_STRIDE - 1]
    mov         x1,  #FDEC_STRIDE
    add         w2,  w2,  w3
    add         w3,  w4,  w5
    ldrb        w6,  [x0, #4 * FDEC_STRIDE - 1]
    ldrb        w7,  [x0, #5 * FDEC_STRIDE - 1]
    ldrb        w8,  [x0, #6 * FDEC_STRIDE - 1]
    ldrb        w9,  [x0, #7 * FDEC_STRIDE - 1]
    add         w6,  w6,  w7
    add         w7,  w8,  w9
    add         w2,  w2,  w3                    // sum of left rows 0-3
    add         w6,  w6,  w7                    // sum of left rows 4-7
    dup         v0.8h,  w2
    dup         v1.8h,  w6
    rshrn       v0.8b,  v0.8h,  #2              // DC for rows 0-3
    rshrn       v1.8b,  v1.8h,  #2              // DC for rows 4-7
    b           pred8x8c_dc_end
endfunc
|
||||||
|
|
||||||
|
// predict_8x8c_dc_neon
//   x0 = pointer to the 8x8 chroma destination block
//
// DC prediction using both edges, computed per 4x4 quadrant. With
//   s0 / s1 = sum of top bytes 0-3 / 4-7
//   s2 / s3 = sum of left bytes of rows 0-3 / rows 4-7
// the quadrant values are
//   top-left     (s0 + s2 + 4) >> 3      top-right    (s1 + 2) >> 2
//   bottom-left  (s3 + 2) >> 2           bottom-right (s1 + s3 + 4) >> 3
// The left-column pair sums are packed as four 16-bit fields in x10 so they
// can be moved into a vector register with a single instruction.
//
// pred8x8c_dc_end is a shared tail also entered from
// predict_8x8c_dc_top_neon and predict_8x8c_dc_left_neon. On entry it needs
// x0 = destination, x1 = FDEC_STRIDE, v0.8b = pattern for rows 0-3 and
// v1.8b = pattern for rows 4-7.
// Scratch: x1, x2, x4-x7, x10-x13, v0-v3.
function predict_8x8c_dc_neon, export=1
    mov         x1,  #FDEC_STRIDE
    sub         x2,  x0,  #FDEC_STRIDE          // x2 -> row above the block
    ldurb       w10, [x0, #0 * FDEC_STRIDE - 1]
    ldrb        w11, [x0, #1 * FDEC_STRIDE - 1]
    ldrb        w12, [x0, #2 * FDEC_STRIDE - 1]
    ldrb        w13, [x0, #3 * FDEC_STRIDE - 1]
    add         w10, w10, w11
    ldrb        w4,  [x0, #4 * FDEC_STRIDE - 1]
    ldrb        w5,  [x0, #5 * FDEC_STRIDE - 1]
    add         w12, w12, w13
    ldrb        w6,  [x0, #6 * FDEC_STRIDE - 1]
    ldrb        w7,  [x0, #7 * FDEC_STRIDE - 1]
    add         w4,  w4,  w5
    add         w6,  w6,  w7
    add         w10, w10, w12, lsl #16          // pack pair sums of rows 0-1, 2-3
    add         w4,  w4,  w6,  lsl #16          // pack pair sums of rows 4-5, 6-7
    ld1         {v0.8b},  [x2]                  // 8 top bytes
    add         x10, x10, x4,  lsl #32          // four 16-bit left pair sums
    uaddlp      v0.4h,  v0.8b                   // s0, s1 (as pair sums)
    mov         v1.d[0],  x10                   // s2, s3 (as pair sums)
    add         v3.4h,  v0.4h,  v1.4h
    addp        v0.4h,  v0.4h,  v1.4h           // s0, s1, s2, s3
    addp        v1.4h,  v3.4h,  v3.4h           // s0+s2, s1+s3, s0+s2, s1+s3
    uzp2        v0.4h,  v0.4h,  v0.4h           // s1, s3, s1, s3
    uzp1        v1.2d,  v1.2d,  v1.2d
    uzp1        v0.2d,  v0.2d,  v0.2d
    rshrn       v3.8b,  v1.8h,  #3              // two-edge quadrants: (sum + 4) >> 3
    rshrn       v2.8b,  v0.8h,  #2              // one-edge quadrants: (sum + 2) >> 2
    uzp1        v0.8b,  v3.8b,  v2.8b           // pattern for rows 0-3
    uzp2        v1.8b,  v2.8b,  v3.8b           // pattern for rows 4-7
pred8x8c_dc_end:
    add         x2,  x0,  #2 * FDEC_STRIDE
    add         x4,  x0,  #4 * FDEC_STRIDE
    add         x5,  x0,  #6 * FDEC_STRIDE
    st1         {v0.8b}, [x0], x1               // row 0
    st1         {v0.8b}, [x2], x1               // row 2
    st1         {v0.8b}, [x0]                   // row 1
    st1         {v0.8b}, [x2]                   // row 3
    st1         {v1.8b}, [x4], x1               // row 4
    st1         {v1.8b}, [x5], x1               // row 6
    st1         {v1.8b}, [x4]                   // row 5
    st1         {v1.8b}, [x5]                   // row 7
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_8x8c_h_neon
//   x0 = pointer to the 8x8 chroma destination block
//
// Horizontal prediction: each of the 8 rows is filled with the byte just
// left of it. Two rows are handled per unrolled step (load-replicate twice,
// store twice). x0 is advanced by 8*FDEC_STRIDE. Scratch: x1, x7, v0, v1.
function predict_8x8c_h_neon, export=1
    sub         x1,  x0,  #1                    // x1 -> left neighbour of row 0
    mov         x7,  #FDEC_STRIDE
.rept 4
    ld1r        {v0.8b}, [x1], x7
    ld1r        {v1.8b}, [x1], x7
    st1         {v0.8b}, [x0], x7
    st1         {v1.8b}, [x0], x7
.endr
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_8x8c_v_aarch64
//   x0 = pointer to the 8x8 chroma destination block
//
// Vertical prediction with general-purpose registers only: the 8 bytes
// above the block are read as one 64-bit value and stored to rows 0-7.
// Scratch: x1.
function predict_8x8c_v_aarch64, export=1
    ldur        x1,  [x0, #-FDEC_STRIDE]        // row above the block
.irp row, 0, 1, 2, 3, 4, 5, 6, 7
    str         x1,  [x0, #\row * FDEC_STRIDE]
.endr
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_8x8c_p_neon
//   x0 = pointer to the 8x8 chroma destination block
//
// Plane prediction. With t[] the row above the block, l[] the column left
// of it and tl the top-left corner byte:
//   H = 1*(t4-t2) + 2*(t5-t1) + 3*(t6-t0) + 4*(t7-tl)
//   V = the same expression over l[]
//   b = (17*H + 16) >> 5,   c = (17*V + 16) >> 5
//   a = 16 * (t7 + l7)
//   pixel(x, y) = saturate_u8( (a + 16 - 3*(b + c) + b*x + c*y) >> 5 )
// Each row vector is produced from the previous one by adding c.
// x0 is advanced by 8*FDEC_STRIDE.
// Scratch: x1-x5, v0-v5, v7, flags.
function predict_8x8c_p_neon, export=1
    sub         x3,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    add         x2,  x3,  #4                    // x2 -> t4
    sub         x3,  x3,  #1                    // x3 -> tl
    ld1         {v0.s}[0], [x3]                 // tl, t0, t1, t2
    ld1         {v2.s}[0], [x2], x1             // t4..t7
    ldcol.8     v0,  x3,  x1,  4,  hi=1         // lanes 4-7: tl, l0, l1, l2
    add         x3,  x3,  x1                    // skip l3
    ldcol.8     v3,  x3,  x1,  4                // lanes 0-3: l4..l7
    movrel      x4,  p8weight
    movrel      x5,  p16weight
    uaddl       v4.8h,  v2.8b,  v3.8b           // t[4+i] + l[4+i]
    rev32       v0.8b,  v0.8b                   // t2,t1,t0,tl | l2,l1,l0,tl
    trn1        v2.2s,  v2.2s,  v3.2s           // t4..t7 | l4..l7
    ld1         {v7.8h}, [x4]                   // weights 1,2,3,4,1,2,3,4
    usubl       v2.8h,  v2.8b,  v0.8b           // mirrored differences
    mul         v2.8h,  v2.8h,  v7.8h
    ld1         {v0.8h}, [x5]                   // 1..8
    saddlp      v2.4s,  v2.8h
    addp        v2.4s,  v2.4s,  v2.4s           // H, V, H, V
    shl         v3.2s,  v2.2s,  #4
    add         v2.2s,  v2.2s,  v3.2s           // 17 * {H, V}
    rshrn       v5.4h,  v2.4s,  #5              // b, c, x, x
    addp        v2.4h,  v5.4h,  v5.4h           // b + c
    shl         v3.4h,  v2.4h,  #2
    sub         v3.4h,  v3.4h,  v2.4h           // 3 * (b + c)
    rev64       v4.4h,  v4.4h                   // lane 0 = t7 + l7
    add         v4.4h,  v4.4h,  v0.4h           // lane 0 = t7 + l7 + 1
    shl         v2.4h,  v4.4h,  #4              // a (+16 from the +1 above)
    sub         v2.4h,  v2.4h,  v3.4h           // a - 3 * (b + c) + 16
    ext         v0.16b, v0.16b, v0.16b, #14
    // The former "sub v6.4h, v5.4h, v3.4h" wrote v6, which nothing read
    // afterwards; the dead instruction has been removed.
    mov         v0.h[0], wzr                    // v0 = 0,1,2,3,4,5,6,7
    mul         v0.8h,  v0.8h,  v5.h[0]         // 0,1,2,3,4,5,6,7 * b
    dup         v1.8h,  v2.h[0]                 // pix
    dup         v2.8h,  v5.h[1]                 // c
    add         v1.8h,  v1.8h,  v0.8h           // pix + x*b
    mov         x3,  #8                         // row counter
1:
    subs        x3,  x3,  #1
    sqshrun     v0.8b,  v1.8h,  #5              // >> 5 with unsigned saturation
    add         v1.8h,  v1.8h,  v2.8h           // next row: += c
    st1         {v0.8b}, [x0], x1
    b.ne        1b
    ret
endfunc
|
||||||
|
|
||||||
|
|
||||||
|
// loadsum4 wd, t1, t2, t3, x, idx
//
// Sums four bytes taken from the column just left of address \x, at rows
// \idx .. \idx+3 (offsets (\idx+k)*FDEC_STRIDE - 1), and leaves the total
// in \wd. \t1-\t3 are clobbered. For \idx == 0 the first offset is -1,
// which needs the unscaled-offset form ldurb; the other loads use ldrb.
.macro loadsum4 wd,  t1,  t2,  t3,  x,  idx
.if \idx == 0
    ldurb       \wd,  [\x, #(\idx + 0) * FDEC_STRIDE - 1]
.else
    ldrb        \wd,  [\x, #(\idx + 0) * FDEC_STRIDE - 1]
.endif
    ldrb        \t1,  [\x, #(\idx + 1) * FDEC_STRIDE - 1]
    ldrb        \t2,  [\x, #(\idx + 2) * FDEC_STRIDE - 1]
    ldrb        \t3,  [\x, #(\idx + 3) * FDEC_STRIDE - 1]
    add         \wd,  \wd,  \t1
    add         \t1,  \t2,  \t3
    add         \wd,  \wd,  \t1
.endm
|
||||||
|
|
||||||
|
// predict_8x16c_h_neon
//   x0 = pointer to the 8x16 chroma destination block
//
// Horizontal prediction: each of the 16 rows is filled with the byte just
// left of it. Even and odd rows are walked with separate pointer pairs
// (x2/x0 for even rows, x3/x1 for odd rows), all stepping by 2*FDEC_STRIDE.
// x0 is advanced by 16*FDEC_STRIDE. Scratch: x1-x3, x7, v0-v3.
function predict_8x16c_h_neon, export=1
    sub         x2,  x0,  #1                    // left neighbour, even rows
    add         x3,  x0,  #FDEC_STRIDE - 1      // left neighbour, odd rows
    mov         x7,  #2 * FDEC_STRIDE
    add         x1,  x0,  #FDEC_STRIDE          // destination, odd rows
.rept 4
    ld1r        {v0.8b}, [x2], x7
    ld1r        {v1.8b}, [x3], x7
    ld1r        {v2.8b}, [x2], x7
    ld1r        {v3.8b}, [x3], x7
    st1         {v0.8b}, [x0], x7
    st1         {v1.8b}, [x1], x7
    st1         {v2.8b}, [x0], x7
    st1         {v3.8b}, [x1], x7
.endr
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_8x16c_v_neon
//   x0 = pointer to the 8x16 chroma destination block
//
// Vertical prediction: the 8 bytes above the block are copied to all 16
// rows. x1 starts on the row above; the post-increment of the load by
// 2*FDEC_STRIDE leaves it on row 1, so x0 then walks the even rows and x1
// the odd rows. x0 is advanced by 16*FDEC_STRIDE. Scratch: x1, x2, v0.
function predict_8x16c_v_neon, export=1
    sub         x1,  x0,  #FDEC_STRIDE
    mov         x2,  #2 * FDEC_STRIDE
    ld1         {v0.8b}, [x1], x2               // top row; x1 -> row 1
.rept 8
    st1         {v0.8b}, [x0], x2               // even row
    st1         {v0.8b}, [x1], x2               // odd row
.endr
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_8x16c_p_neon
//   x0 = pointer to the 8x16 chroma destination block
//
// Plane prediction for the 8-wide, 16-tall chroma block. With t[] the row
// above, l[] the left column and tl the top-left corner byte:
//   H = sum_{k=1..4} k * (t[3+k] - t[3-k])        (t[-1] = tl)
//   V = sum_{k=1..8} k * (l[7+k] - l[7-k])        (l[-1] = tl)
//   b = (17*H + 16) >> 5,   c = (5*V + 32) >> 6
//   a = 16 * (t7 + l15)
//   pixel(x, y) = saturate_u8( (a - 3*b - 7*c + b*x + c*y + 16) >> 5 )
// The loop emits two rows per iteration; rounding comes from sqrshrun.
// x0 is advanced by 16*FDEC_STRIDE.
// Scratch: x1-x4, v0-v7, v17, flags.
function predict_8x16c_p_neon, export=1
    movrel      x4,  p16weight
    ld1         {v17.8h}, [x4]                  // weights 1..8
    sub         x3,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    add         x2,  x3,  #4                    // x2 -> t4
    sub         x3,  x3,  #1                    // x3 -> tl

    ld1         {v0.8b}, [x3]                   // tl, t0..t6
    ld1         {v2.8b}, [x2], x1               // t4..t11 (only t4..t7 matter)
    ldcol.8     v1,  x3,  x1                    // tl, l0..l6
    add         x3,  x3,  x1                    // skip l7
    ldcol.8     v3,  x3,  x1                    // l8..l15
    ext         v4.8b,  v2.8b,  v2.8b,  #3      // lane 0 = t7
    ext         v5.8b,  v3.8b,  v3.8b,  #7      // lane 0 = l15
    rev32       v0.8b,  v0.8b                   // t2,t1,t0,tl | ...
    rev64       v1.8b,  v1.8b                   // l6..l0, tl

    uaddl       v4.8h,  v5.8b,  v4.8b           // a * 1/16

    usubl       v2.8h,  v2.8b,  v0.8b
    mul         v2.8h,  v2.8h,  v17.8h
    saddlp      v2.4s,  v2.8h
    addp        v2.4s,  v2.4s,  v2.4s           // H

    usubl       v3.8h,  v3.8b,  v1.8b
    mul         v3.8h,  v3.8h,  v17.8h
    saddlp      v3.4s,  v3.8h
    addp        v3.4s,  v3.4s,  v3.4s
    addp        v3.4s,  v3.4s,  v3.4s           // V

    ext         v17.16b, v17.16b, v17.16b, #14

    shl         v4.4h,  v4.4h,  #4              // a
    shl         v6.2s,  v2.2s,  #4              // 16 * H
    shl         v7.2s,  v3.2s,  #2              // 4 * V
    add         v2.2s,  v2.2s,  v6.2s           // 17 * H
    add         v3.2s,  v3.2s,  v7.2s           // 5 * V
    rshrn       v2.4h,  v2.4s,  #5              // b
    rshrn       v3.4h,  v3.4s,  #6              // c

    mov         v17.h[0], wzr                   // v17 = 0,1,2,3,4,5,6,7

    sub         v4.4h,  v4.4h,  v2.4h           // a - b
    shl         v6.4h,  v2.4h,  #1              // 2 * b
    add         v4.4h,  v4.4h,  v3.4h           // a - b + c
    shl         v7.4h,  v3.4h,  #3              // 8 * c
    sub         v4.4h,  v4.4h,  v6.4h           // a - 3b + c
    sub         v4.4h,  v4.4h,  v7.4h           // a - 3b - 7c

    mul         v0.8h,  v17.8h, v2.h[0]         // 0,1,2,3,4,5,6,7 * b
    dup         v1.8h,  v4.h[0]                 // i00
    dup         v2.8h,  v3.h[0]                 // c
    add         v1.8h,  v1.8h,  v0.8h           // pix + {0..7}*b
    mov         x3,  #16                        // rows remaining
1:
    subs        x3,  x3,  #2
    sqrshrun    v4.8b,  v1.8h,  #5
    add         v1.8h,  v1.8h,  v2.8h
    sqrshrun    v5.8b,  v1.8h,  #5
    st1         {v4.8b}, [x0], x1
    add         v1.8h,  v1.8h,  v2.8h
    st1         {v5.8b}, [x0], x1
    b.ne        1b
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_8x16c_dc_neon
//   x0 = pointer to the 8x16 chroma destination block
//
// DC prediction using both edges, computed per 4x4 sub-block. With
//   s0 / s1  = sum of top bytes 0-3 / 4-7
//   s2 .. s5 = sum of the left bytes of rows 0-3, 4-7, 8-11, 12-15
// each 4-row band is filled with a left-half and a right-half value:
//   rows  0-3 : (s0 + s2 + 4) >> 3 | (s1 + s1 + 4) >> 3
//   rows  4-7 : (s3 + s3 + 4) >> 3 | (s1 + s3 + 4) >> 3
//   rows  8-11: (s4 + s4 + 4) >> 3 | (s1 + s4 + 4) >> 3
//   rows 12-15: (s5 + s5 + 4) >> 3 | (s1 + s5 + 4) >> 3
// (x + x + 4) >> 3 gives the same result as (x + 2) >> 2.
// x0 is advanced by 4*FDEC_STRIDE.
// Scratch: x1-x13, v0-v3, v6, v16, v17, v20-v25.
function predict_8x16c_dc_neon, export=1
    mov         x1,  #FDEC_STRIDE
    sub         x10, x0,  #FDEC_STRIDE          // x10 -> row above the block
    loadsum4    w2,  w3,  w4,  w5,  x0,  0
    ld1         {v6.8b},  [x10]
    loadsum4    w6,  w7,  w8,  w9,  x0,  4
    uaddlp      v6.4h,  v6.8b
    dup         v22.8h, w2                      // s2
    dup         v23.8h, w6                      // s3
    loadsum4    w2,  w3,  w4,  w5,  x0,  8
    addp        v6.4h,  v6.4h,  v6.4h           // s0, s1
    loadsum4    w6,  w7,  w8,  w9,  x0,  12
    dup         v20.8h, v6.h[0]                 // s0
    dup         v21.8h, v6.h[1]                 // s1
    dup         v24.8h, w2                      // s4
    dup         v25.8h, w6                      // s5

    ext         v16.16b, v20.16b, v21.16b, #8   // s0 x4 | s1 x4
    ext         v17.16b, v22.16b, v21.16b, #8   // s2 x4 | s1 x4
    ext         v1.16b,  v23.16b, v21.16b, #8   // s3 x4 | s1 x4
    ext         v2.16b,  v24.16b, v21.16b, #8   // s4 x4 | s1 x4
    ext         v3.16b,  v25.16b, v21.16b, #8   // s5 x4 | s1 x4

    add         v0.8h,  v16.8h, v17.8h
    add         v1.8h,  v1.8h,  v23.8h
    add         v2.8h,  v2.8h,  v24.8h
    add         v3.8h,  v3.8h,  v25.8h

    rshrn       v0.8b,  v0.8h,  #3              // rows  0-3
    rshrn       v1.8b,  v1.8h,  #3              // rows  4-7
    rshrn       v2.8b,  v2.8h,  #3              // rows  8-11
    rshrn       v3.8b,  v3.8h,  #3              // rows 12-15

    add         x11, x0,  #4  * FDEC_STRIDE
    add         x12, x0,  #8  * FDEC_STRIDE
    add         x13, x0,  #12 * FDEC_STRIDE
.rept 4
    st1         {v0.8b}, [x0],  x1
    st1         {v1.8b}, [x11], x1
    st1         {v2.8b}, [x12], x1
    st1         {v3.8b}, [x13], x1
.endr
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_8x16c_dc_left_neon
//   x0 = pointer to the 8x16 chroma destination block
//
// DC prediction from the left edge only. The block is split into four
// 4-row bands; band k is filled with
//   (sum of the left neighbours of rows 4k..4k+3 + 2) >> 2.
// All 16 left-neighbour loads are issued before the first store.
// x0 is advanced by 16*FDEC_STRIDE. Scratch: x1-x13, v0-v3.
function predict_8x16c_dc_left_neon, export=1
    mov         x1,  #FDEC_STRIDE
    ldurb       w2,  [x0, # 0 * FDEC_STRIDE - 1]
    ldrb        w3,  [x0, # 1 * FDEC_STRIDE - 1]
    ldrb        w4,  [x0, # 2 * FDEC_STRIDE - 1]
    ldrb        w5,  [x0, # 3 * FDEC_STRIDE - 1]
    add         w2,  w2,  w3

    ldrb        w6,  [x0, # 4 * FDEC_STRIDE - 1]
    add         w4,  w4,  w5
    ldrb        w7,  [x0, # 5 * FDEC_STRIDE - 1]
    add         w2,  w2,  w4                    // sum of rows 0-3
    ldrb        w8,  [x0, # 6 * FDEC_STRIDE - 1]
    ldrb        w9,  [x0, # 7 * FDEC_STRIDE - 1]
    dup         v0.8h,  w2
    add         w6,  w6,  w7
    rshrn       v0.8b,  v0.8h,  #2              // DC, band 0
    add         w8,  w8,  w9

    ldrb        w10, [x0, # 8 * FDEC_STRIDE - 1]
    ldrb        w11, [x0, # 9 * FDEC_STRIDE - 1]
    add         w6,  w6,  w8                    // sum of rows 4-7
    ldrb        w12, [x0, #10 * FDEC_STRIDE - 1]
    ldrb        w13, [x0, #11 * FDEC_STRIDE - 1]
    dup         v1.8h,  w6
    add         w10, w10, w11
    rshrn       v1.8b,  v1.8h,  #2              // DC, band 1
    add         w12, w12, w13

    ldrb        w2,  [x0, #12 * FDEC_STRIDE - 1]
    ldrb        w3,  [x0, #13 * FDEC_STRIDE - 1]
    add         w10, w10, w12                   // sum of rows 8-11
    ldrb        w4,  [x0, #14 * FDEC_STRIDE - 1]
    ldrb        w5,  [x0, #15 * FDEC_STRIDE - 1]
    dup         v2.8h,  w10
    add         w2,  w2,  w3
    rshrn       v2.8b,  v2.8h,  #2              // DC, band 2
    add         w4,  w4,  w5
    st1         {v0.8b}, [x0], x1               // row 0
    st1         {v0.8b}, [x0], x1               // row 1
    add         w2,  w2,  w4                    // sum of rows 12-15
    st1         {v0.8b}, [x0], x1               // row 2
    dup         v3.8h,  w2
    st1         {v0.8b}, [x0], x1               // row 3
    rshrn       v3.8b,  v3.8h,  #2              // DC, band 3

.irp idx, 1, 2, 3
  .rept 4
    st1         {v\idx\().8b}, [x0], x1         // 4 rows of band \idx
  .endr
.endr
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_8x16c_dc_top_neon
//   x0 = pointer to the 8x16 chroma destination block
//
// DC prediction from the top edge only. The 8 bytes above the block are
// summed as two groups of four; the left four columns get
// (sum of top 0-3 + 2) >> 2 and the right four (sum of top 4-7 + 2) >> 2.
// The same 8-byte pattern is stored to all 16 rows.
// x0 is advanced by 16*FDEC_STRIDE. Scratch: x1, x2, v0, v1, v4.
function predict_8x16c_dc_top_neon, export=1
    sub         x2,  x0,  #FDEC_STRIDE          // x2 -> row above the block
    mov         x1,  #FDEC_STRIDE
    ld1         {v0.8b}, [x2]
    uaddlp      v0.4h,  v0.8b                   // pairwise sums
    addp        v0.4h,  v0.4h,  v0.4h           // sums of 4: left, right
    rshrn       v4.8b,  v0.8h,  #2              // (sum + 2) >> 2
    dup         v0.8b,  v4.b[0]                 // left DC
    dup         v1.8b,  v4.b[1]                 // right DC
    ext         v0.8b,  v0.8b,  v1.8b,  #4      // left x4 | right x4
.rept 16
    st1         {v0.8b}, [x0], x1
.endr
    ret
endfunc
|
||||||
|
|
||||||
|
|
||||||
|
// predict_16x16_dc_top_neon
//   x0 = pointer to the 16x16 destination block
//
// DC prediction from the top edge only: every pixel becomes
//   (sum of the 16 bytes above the block + 8) >> 4.
// The stores are done by the shared tail pred16x16_dc_end inside
// predict_16x16_dc_neon (expects v0.16b = fill value, x1 = FDEC_STRIDE).
// Scratch: x1, x2, v0.
function predict_16x16_dc_top_neon, export=1
    sub         x2,  x0,  #FDEC_STRIDE          // x2 -> row above the block
    mov         x1,  #FDEC_STRIDE
    ld1         {v0.16b}, [x2]
    uaddlv      h0,  v0.16b                     // sum of the 16 top bytes
    rshrn       v0.8b,  v0.8h,  #4              // (sum + 8) >> 4
    dup         v0.16b, v0.b[0]
    b           pred16x16_dc_end
endfunc
|
||||||
|
|
||||||
|
// predict_16x16_dc_left_neon
//   x0 = pointer to the 16x16 destination block
//
// DC prediction from the left edge only: every pixel becomes
//   (sum of the 16 bytes left of the block + 8) >> 4.
// The column is gathered with ldcol.16; the stores are done by the shared
// tail pred16x16_dc_end inside predict_16x16_dc_neon (expects v0.16b = fill
// value, x1 = FDEC_STRIDE).
// Scratch: x1, x2, v0.
function predict_16x16_dc_left_neon, export=1
    sub         x2,  x0,  #1                    // x2 -> left neighbour of row 0
    mov         x1,  #FDEC_STRIDE
    ldcol.16    v0,  x2,  x1
    uaddlv      h0,  v0.16b                     // sum of the 16 left bytes
    rshrn       v0.8b,  v0.8h,  #4              // (sum + 8) >> 4
    dup         v0.16b, v0.b[0]
    b           pred16x16_dc_end
endfunc
|
||||||
|
|
||||||
|
// predict_16x16_dc_neon
//   x0 = pointer to the 16x16 destination block
//
// DC prediction from both edges: every pixel becomes
//   (sum of 16 top bytes + sum of 16 left bytes + 16) >> 5.
//
// pred16x16_dc_end is a shared tail also entered from
// predict_16x16_dc_top_neon and predict_16x16_dc_left_neon. On entry it
// needs x0 = destination, x1 = FDEC_STRIDE and v0.16b = fill value; it
// writes 16 rows and returns. x0 is advanced by 16*FDEC_STRIDE.
// Scratch: x1-x3, v0, v1.
function predict_16x16_dc_neon, export=1
    sub         x3,  x0,  #FDEC_STRIDE          // x3 -> row above the block
    sub         x2,  x0,  #1                    // x2 -> left neighbour of row 0
    mov         x1,  #FDEC_STRIDE
    ld1         {v0.16b}, [x3]
    ldcol.16    v1,  x2,  x1
    uaddlv      h0,  v0.16b                     // top sum
    uaddlv      h1,  v1.16b                     // left sum
    add         v0.4h,  v0.4h,  v1.4h
    rshrn       v0.8b,  v0.8h,  #5              // (sum + 16) >> 5
    dup         v0.16b, v0.b[0]
pred16x16_dc_end:
.rept 16
    st1         {v0.16b}, [x0], x1
.endr
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_16x16_h_neon
//   x0 = pointer to the 16x16 destination block
//
// Horizontal prediction: each of the 16 rows is filled with the byte just
// left of it. Two rows are handled per unrolled step.
// x0 is advanced by 16*FDEC_STRIDE. Scratch: x1, x7, v0, v1.
function predict_16x16_h_neon, export=1
    sub         x1,  x0,  #1                    // x1 -> left neighbour of row 0
    mov         x7,  #FDEC_STRIDE
.rept 8
    ld1r        {v0.16b}, [x1], x7
    ld1r        {v1.16b}, [x1], x7
    st1         {v0.16b}, [x0], x7
    st1         {v1.16b}, [x0], x7
.endr
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_16x16_v_neon
//   x0 = pointer to the 16x16 destination block
//
// Vertical prediction: the 16 bytes above the block are copied to all 16
// rows. x0 is stepped back one row for the load; the load's post-increment
// puts it back on row 0 before the stores.
// x0 is advanced by 16*FDEC_STRIDE. Scratch: x7, v0.
function predict_16x16_v_neon, export=1
    sub         x0,  x0,  #FDEC_STRIDE
    mov         x7,  #FDEC_STRIDE
    ld1         {v0.16b}, [x0], x7              // top row; x0 -> row 0 again
.rept 16
    st1         {v0.16b}, [x0], x7
.endr
    ret
endfunc
|
||||||
|
|
||||||
|
// predict_16x16_p_neon
//   x0 = pointer to the 16x16 destination block
//
// Plane prediction. With t[] the row above, l[] the left column and tl the
// top-left corner byte:
//   H = sum_{k=1..8} k * (t[7+k] - t[7-k])        (t[-1] = tl)
//   V = sum_{k=1..8} k * (l[7+k] - l[7-k])        (l[-1] = tl)
//   b = (5*H + 32) >> 6,   c = (5*V + 32) >> 6
//   a = 16 * (t15 + l15)
//   pixel(x, y) = saturate_u8( (a + 16 - 7*(b + c) + b*x + c*y) >> 5 )
// v1 carries columns 0-7 and v3 columns 8-15 of the current row; both are
// advanced by c after each row.
// x0 is advanced by 16*FDEC_STRIDE.
// Scratch: x1-x4, v0-v5, v7, flags.
function predict_16x16_p_neon, export=1
    sub         x3,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    add         x2,  x3,  #8                    // x2 -> t8
    sub         x3,  x3,  #1                    // x3 -> tl
    ld1         {v0.8b}, [x3]                   // tl, t0..t6
    ld1         {v2.8b}, [x2], x1               // t8..t15
    ldcol.8     v1,  x3,  x1                    // tl, l0..l6
    add         x3,  x3,  x1                    // skip l7
    ldcol.8     v3,  x3,  x1                    // l8..l15
    rev64       v0.8b,  v0.8b                   // t6..t0, tl
    rev64       v1.8b,  v1.8b                   // l6..l0, tl
    movrel      x4,  p16weight
    uaddl       v4.8h,  v2.8b,  v3.8b           // t[8+i] + l[8+i]
    ld1         {v7.8h}, [x4]                   // weights 1..8
    usubl       v2.8h,  v2.8b,  v0.8b
    usubl       v3.8h,  v3.8b,  v1.8b
    mul         v2.8h,  v2.8h,  v7.8h
    mul         v3.8h,  v3.8h,  v7.8h
    saddlp      v2.4s,  v2.8h
    saddlp      v3.4s,  v3.8h
    addp        v2.4s,  v2.4s,  v3.4s
    addp        v2.4s,  v2.4s,  v2.4s           // H, V, H, V
    shl         v3.2s,  v2.2s,  #2
    add         v2.2s,  v2.2s,  v3.2s           // 5 * {H, V}
    rshrn       v5.4h,  v2.4s,  #6              // b, c, x, x
    addp        v2.4h,  v5.4h,  v5.4h           // b + c
    shl         v3.4h,  v2.4h,  #3
    sub         v3.4h,  v3.4h,  v2.4h           // 7 * (b + c)
    ext         v4.16b, v4.16b, v4.16b, #14     // lane 0 = t15 + l15
    add         v4.4h,  v4.4h,  v7.4h           // lane 0 = t15 + l15 + 1
    shl         v2.4h,  v4.4h,  #4              // a (+16 from the +1 above)
    sub         v2.4h,  v2.4h,  v3.4h           // a - 7 * (b + c) + 16
    ext         v7.16b, v7.16b, v7.16b, #14
    mov         v7.h[0], wzr                    // v7 = 0,1,2,3,4,5,6,7
    dup         v3.8h,  v5.h[0]                 // b
    mul         v0.8h,  v7.8h,  v5.h[0]         // 0,1,2,3,4,5,6,7 * b
    dup         v1.8h,  v2.h[0]                 // pix
    dup         v2.8h,  v5.h[1]                 // c
    shl         v3.8h,  v3.8h,  #3              // 8 * b
    add         v1.8h,  v1.8h,  v0.8h           // pix + x*b
    add         v3.8h,  v3.8h,  v1.8h           // pix + x{8-15}*b
    mov         x3,  #16                        // row counter
1:
    subs        x3,  x3,  #1
    sqshrun     v0.8b,  v1.8h,  #5              // columns 0-7
    add         v1.8h,  v1.8h,  v2.8h
    sqshrun2    v0.16b, v3.8h,  #5              // columns 8-15
    add         v3.8h,  v3.8h,  v2.8h
    st1         {v0.16b}, [x0], x1
    b.ne        1b
    ret
endfunc
|
||||||
116
common/aarch64/predict-c.c
Normal file
116
common/aarch64/predict-c.c
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* predict.c: aarch64 intra prediction
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
* Janne Grunau <janne-x264@jannau.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common/common.h"
|
||||||
|
#include "predict.h"
|
||||||
|
#include "pixel.h"
|
||||||
|
|
||||||
|
/* Install the AArch64 4x4 intra predictors into pf[].
 *
 * cpu: bitmask of X264_CPU_* capability flags.
 * pf:  predictor table, indexed by I_PRED_4x4_*; only the entries that
 *      have an assembly implementation are overwritten.
 *
 * In HIGH_BIT_DEPTH builds nothing is installed. */
void x264_predict_4x4_init_aarch64( uint32_t cpu, x264_predict_t pf[12] )
{
#if !HIGH_BIT_DEPTH
    /* General-purpose-register versions. */
    if( cpu&X264_CPU_ARMV8 )
    {
        pf[I_PRED_4x4_H]  = x264_predict_4x4_h_aarch64;
        pf[I_PRED_4x4_V]  = x264_predict_4x4_v_aarch64;
    }

    /* The remaining versions need NEON. */
    if( !(cpu&X264_CPU_NEON) )
        return;

    pf[I_PRED_4x4_DC]     = x264_predict_4x4_dc_neon;
    pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon;
    pf[I_PRED_4x4_DDL]    = x264_predict_4x4_ddl_neon;
    pf[I_PRED_4x4_DDR]    = x264_predict_4x4_ddr_neon;
#endif // !HIGH_BIT_DEPTH
}
|
||||||
|
|
||||||
|
/* Install the AArch64 8x8 chroma intra predictors into pf[].
 *
 * cpu: bitmask of X264_CPU_* capability flags.
 * pf:  predictor table, indexed by I_PRED_CHROMA_*.
 *
 * The vertical predictor only needs X264_CPU_ARMV8; every other entry
 * needs X264_CPU_NEON. In HIGH_BIT_DEPTH builds nothing is installed. */
void x264_predict_8x8c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] )
{
#if !HIGH_BIT_DEPTH
    if( cpu&X264_CPU_ARMV8 )
        pf[I_PRED_CHROMA_V]   = x264_predict_8x8c_v_aarch64;

    if( cpu&X264_CPU_NEON )
    {
        pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
        pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
        pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
        pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_neon;
        pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_neon;
    }
#endif // !HIGH_BIT_DEPTH
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Install the AArch64 8x16 chroma intra predictors into pf[].
 *
 * cpu: bitmask of X264_CPU_* capability flags.
 * pf:  predictor table, indexed by I_PRED_CHROMA_*.
 *
 * All entries need X264_CPU_NEON; without it, or in HIGH_BIT_DEPTH builds,
 * pf[] is left untouched. */
void x264_predict_8x16c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] )
{
#if !HIGH_BIT_DEPTH
    if( cpu&X264_CPU_NEON )
    {
        pf[I_PRED_CHROMA_V ]     = x264_predict_8x16c_v_neon;
        pf[I_PRED_CHROMA_H ]     = x264_predict_8x16c_h_neon;
        pf[I_PRED_CHROMA_DC]     = x264_predict_8x16c_dc_neon;
        pf[I_PRED_CHROMA_P ]     = x264_predict_8x16c_p_neon;
        pf[I_PRED_CHROMA_DC_LEFT]= x264_predict_8x16c_dc_left_neon;
        pf[I_PRED_CHROMA_DC_TOP ]= x264_predict_8x16c_dc_top_neon;
    }
#endif // !HIGH_BIT_DEPTH
}
|
||||||
|
|
||||||
|
/* Fill the 8x8 luma intra prediction table with the NEON routines.
 * Without the NEON cpu flag, or in HIGH_BIT_DEPTH builds, the table is left
 * as it was.  predict_filter is accepted but never written by this function. */
void x264_predict_8x8_init_aarch64( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
{
    if( cpu & X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_8x8_V]   = x264_predict_8x8_v_neon;
        pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
        pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
        pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon;
        pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
        pf[I_PRED_8x8_VR]  = x264_predict_8x8_vr_neon;
        pf[I_PRED_8x8_HD]  = x264_predict_8x8_hd_neon;
        pf[I_PRED_8x8_VL]  = x264_predict_8x8_vl_neon;
        pf[I_PRED_8x8_HU]  = x264_predict_8x8_hu_neon;
#endif /* !HIGH_BIT_DEPTH */
    }
}
|
||||||
|
|
||||||
|
/* Fill the 16x16 luma intra prediction table with the NEON routines.
 * Without the NEON cpu flag, or in HIGH_BIT_DEPTH builds, the table is left
 * as it was. */
void x264_predict_16x16_init_aarch64( uint32_t cpu, x264_predict_t pf[7] )
{
    if( cpu & X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_16x16_V]       = x264_predict_16x16_v_neon;
        pf[I_PRED_16x16_H]       = x264_predict_16x16_h_neon;
        pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_neon;
        pf[I_PRED_16x16_P]       = x264_predict_16x16_p_neon;
        pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_neon;
        pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_neon;
#endif /* !HIGH_BIT_DEPTH */
    }
}
|
||||||
119
common/aarch64/predict.h
Normal file
119
common/aarch64/predict.h
Normal file
@@ -0,0 +1,119 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* predict.h: aarch64 intra prediction
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
* Janne Grunau <janne-x264@jannau.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_AARCH64_PREDICT_H
|
||||||
|
#define X264_AARCH64_PREDICT_H
|
||||||
|
|
||||||
|
#define x264_predict_4x4_h_aarch64 x264_template(predict_4x4_h_aarch64)
|
||||||
|
void x264_predict_4x4_h_aarch64( uint8_t *src );
|
||||||
|
#define x264_predict_4x4_v_aarch64 x264_template(predict_4x4_v_aarch64)
|
||||||
|
void x264_predict_4x4_v_aarch64( uint8_t *src );
|
||||||
|
#define x264_predict_8x8c_v_aarch64 x264_template(predict_8x8c_v_aarch64)
|
||||||
|
void x264_predict_8x8c_v_aarch64( uint8_t *src );
|
||||||
|
|
||||||
|
// for the merged 4x4 intra sad/satd which expects unified suffix
|
||||||
|
#define x264_predict_4x4_h_neon x264_predict_4x4_h_aarch64
|
||||||
|
#define x264_predict_4x4_v_neon x264_predict_4x4_v_aarch64
|
||||||
|
#define x264_predict_8x8c_v_neon x264_predict_8x8c_v_aarch64
|
||||||
|
|
||||||
|
#define x264_predict_4x4_dc_top_neon x264_template(predict_4x4_dc_top_neon)
|
||||||
|
void x264_predict_4x4_dc_top_neon( uint8_t *src );
|
||||||
|
#define x264_predict_4x4_ddr_neon x264_template(predict_4x4_ddr_neon)
|
||||||
|
void x264_predict_4x4_ddr_neon( uint8_t *src );
|
||||||
|
#define x264_predict_4x4_ddl_neon x264_template(predict_4x4_ddl_neon)
|
||||||
|
void x264_predict_4x4_ddl_neon( uint8_t *src );
|
||||||
|
|
||||||
|
#define x264_predict_8x8c_dc_top_neon x264_template(predict_8x8c_dc_top_neon)
|
||||||
|
void x264_predict_8x8c_dc_top_neon( uint8_t *src );
|
||||||
|
#define x264_predict_8x8c_dc_left_neon x264_template(predict_8x8c_dc_left_neon)
|
||||||
|
void x264_predict_8x8c_dc_left_neon( uint8_t *src );
|
||||||
|
#define x264_predict_8x8c_p_neon x264_template(predict_8x8c_p_neon)
|
||||||
|
void x264_predict_8x8c_p_neon( uint8_t *src );
|
||||||
|
|
||||||
|
#define x264_predict_8x16c_dc_left_neon x264_template(predict_8x16c_dc_left_neon)
|
||||||
|
void x264_predict_8x16c_dc_left_neon( uint8_t *src );
|
||||||
|
#define x264_predict_8x16c_dc_top_neon x264_template(predict_8x16c_dc_top_neon)
|
||||||
|
void x264_predict_8x16c_dc_top_neon( uint8_t *src );
|
||||||
|
#define x264_predict_8x16c_p_neon x264_template(predict_8x16c_p_neon)
|
||||||
|
void x264_predict_8x16c_p_neon( uint8_t *src );
|
||||||
|
|
||||||
|
#define x264_predict_8x8_ddl_neon x264_template(predict_8x8_ddl_neon)
|
||||||
|
void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
|
||||||
|
#define x264_predict_8x8_ddr_neon x264_template(predict_8x8_ddr_neon)
|
||||||
|
void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
|
||||||
|
#define x264_predict_8x8_vl_neon x264_template(predict_8x8_vl_neon)
|
||||||
|
void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
|
||||||
|
#define x264_predict_8x8_vr_neon x264_template(predict_8x8_vr_neon)
|
||||||
|
void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
|
||||||
|
#define x264_predict_8x8_hd_neon x264_template(predict_8x8_hd_neon)
|
||||||
|
void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
|
||||||
|
#define x264_predict_8x8_hu_neon x264_template(predict_8x8_hu_neon)
|
||||||
|
void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
|
||||||
|
|
||||||
|
#define x264_predict_16x16_dc_top_neon x264_template(predict_16x16_dc_top_neon)
|
||||||
|
void x264_predict_16x16_dc_top_neon( uint8_t *src );
|
||||||
|
#define x264_predict_16x16_dc_left_neon x264_template(predict_16x16_dc_left_neon)
|
||||||
|
void x264_predict_16x16_dc_left_neon( uint8_t *src );
|
||||||
|
#define x264_predict_16x16_p_neon x264_template(predict_16x16_p_neon)
|
||||||
|
void x264_predict_16x16_p_neon( uint8_t *src );
|
||||||
|
|
||||||
|
#define x264_predict_4x4_dc_neon x264_template(predict_4x4_dc_neon)
|
||||||
|
void x264_predict_4x4_dc_neon( uint8_t *src );
|
||||||
|
#define x264_predict_8x8_v_neon x264_template(predict_8x8_v_neon)
|
||||||
|
void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
|
||||||
|
#define x264_predict_8x8_h_neon x264_template(predict_8x8_h_neon)
|
||||||
|
void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
|
||||||
|
#define x264_predict_8x8_dc_neon x264_template(predict_8x8_dc_neon)
|
||||||
|
void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
|
||||||
|
#define x264_predict_8x8c_dc_neon x264_template(predict_8x8c_dc_neon)
|
||||||
|
void x264_predict_8x8c_dc_neon( uint8_t *src );
|
||||||
|
#define x264_predict_8x8c_h_neon x264_template(predict_8x8c_h_neon)
|
||||||
|
void x264_predict_8x8c_h_neon( uint8_t *src );
|
||||||
|
#define x264_predict_8x16c_v_neon x264_template(predict_8x16c_v_neon)
|
||||||
|
void x264_predict_8x16c_v_neon( uint8_t *src );
|
||||||
|
#define x264_predict_8x16c_h_neon x264_template(predict_8x16c_h_neon)
|
||||||
|
void x264_predict_8x16c_h_neon( uint8_t *src );
|
||||||
|
#define x264_predict_8x16c_dc_neon x264_template(predict_8x16c_dc_neon)
|
||||||
|
void x264_predict_8x16c_dc_neon( uint8_t *src );
|
||||||
|
#define x264_predict_16x16_v_neon x264_template(predict_16x16_v_neon)
|
||||||
|
void x264_predict_16x16_v_neon( uint8_t *src );
|
||||||
|
#define x264_predict_16x16_h_neon x264_template(predict_16x16_h_neon)
|
||||||
|
void x264_predict_16x16_h_neon( uint8_t *src );
|
||||||
|
#define x264_predict_16x16_dc_neon x264_template(predict_16x16_dc_neon)
|
||||||
|
void x264_predict_16x16_dc_neon( uint8_t *src );
|
||||||
|
|
||||||
|
#define x264_predict_4x4_init_aarch64 x264_template(predict_4x4_init_aarch64)
|
||||||
|
void x264_predict_4x4_init_aarch64( uint32_t cpu, x264_predict_t pf[12] );
|
||||||
|
#define x264_predict_8x8_init_aarch64 x264_template(predict_8x8_init_aarch64)
|
||||||
|
void x264_predict_8x8_init_aarch64( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
|
||||||
|
#define x264_predict_8x8c_init_aarch64 x264_template(predict_8x8c_init_aarch64)
|
||||||
|
void x264_predict_8x8c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] );
|
||||||
|
#define x264_predict_8x16c_init_aarch64 x264_template(predict_8x16c_init_aarch64)
|
||||||
|
void x264_predict_8x16c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] );
|
||||||
|
#define x264_predict_16x16_init_aarch64 x264_template(predict_16x16_init_aarch64)
|
||||||
|
void x264_predict_16x16_init_aarch64( uint32_t cpu, x264_predict_t pf[7] );
|
||||||
|
|
||||||
|
#endif /* X264_AARCH64_PREDICT_H */
|
||||||
1169
common/aarch64/quant-a.S
Normal file
1169
common/aarch64/quant-a.S
Normal file
File diff suppressed because it is too large
Load Diff
95
common/aarch64/quant.h
Normal file
95
common/aarch64/quant.h
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* quant.h: aarch64 quantization and level-run
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2005-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
* Janne Grunau <janne-x264@jannau.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_AARCH64_QUANT_H
|
||||||
|
#define X264_AARCH64_QUANT_H
|
||||||
|
|
||||||
|
#define x264_quant_2x2_dc_aarch64 x264_template(quant_2x2_dc_aarch64)
|
||||||
|
int x264_quant_2x2_dc_aarch64( int16_t dct[4], int mf, int bias );
|
||||||
|
|
||||||
|
#define x264_quant_2x2_dc_neon x264_template(quant_2x2_dc_neon)
|
||||||
|
int x264_quant_2x2_dc_neon( dctcoef dct[4], int mf, int bias );
|
||||||
|
#define x264_quant_4x4_dc_neon x264_template(quant_4x4_dc_neon)
|
||||||
|
int x264_quant_4x4_dc_neon( dctcoef dct[16], int mf, int bias );
|
||||||
|
#define x264_quant_4x4_neon x264_template(quant_4x4_neon)
|
||||||
|
int x264_quant_4x4_neon( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
|
||||||
|
#define x264_quant_4x4x4_neon x264_template(quant_4x4x4_neon)
|
||||||
|
int x264_quant_4x4x4_neon( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
|
||||||
|
#define x264_quant_8x8_neon x264_template(quant_8x8_neon)
|
||||||
|
int x264_quant_8x8_neon( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
|
||||||
|
|
||||||
|
#define x264_dequant_4x4_dc_neon x264_template(dequant_4x4_dc_neon)
|
||||||
|
void x264_dequant_4x4_dc_neon( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
|
||||||
|
#define x264_dequant_4x4_neon x264_template(dequant_4x4_neon)
|
||||||
|
void x264_dequant_4x4_neon( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
|
||||||
|
#define x264_dequant_8x8_neon x264_template(dequant_8x8_neon)
|
||||||
|
void x264_dequant_8x8_neon( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
|
||||||
|
|
||||||
|
#define x264_decimate_score15_neon x264_template(decimate_score15_neon)
|
||||||
|
int x264_decimate_score15_neon( dctcoef * );
|
||||||
|
#define x264_decimate_score16_neon x264_template(decimate_score16_neon)
|
||||||
|
int x264_decimate_score16_neon( dctcoef * );
|
||||||
|
#define x264_decimate_score64_neon x264_template(decimate_score64_neon)
|
||||||
|
int x264_decimate_score64_neon( dctcoef * );
|
||||||
|
|
||||||
|
// BIT DEPTH = 8
|
||||||
|
#define x264_coeff_last4_aarch64 x264_template(coeff_last4_aarch64)
|
||||||
|
int x264_coeff_last4_aarch64( dctcoef * );
|
||||||
|
#define x264_coeff_last8_aarch64 x264_template(coeff_last8_aarch64)
|
||||||
|
int x264_coeff_last8_aarch64( dctcoef * );
|
||||||
|
|
||||||
|
// BIT DEPTH = 10
|
||||||
|
#define x264_coeff_last4_neon x264_template(coeff_last4_neon)
|
||||||
|
int x264_coeff_last4_neon( dctcoef * );
|
||||||
|
#define x264_coeff_last8_neon x264_template(coeff_last8_neon)
|
||||||
|
int x264_coeff_last8_neon( dctcoef * );
|
||||||
|
|
||||||
|
#define x264_coeff_last15_neon x264_template(coeff_last15_neon)
|
||||||
|
int x264_coeff_last15_neon( dctcoef * );
|
||||||
|
#define x264_coeff_last16_neon x264_template(coeff_last16_neon)
|
||||||
|
int x264_coeff_last16_neon( dctcoef * );
|
||||||
|
#define x264_coeff_last64_neon x264_template(coeff_last64_neon)
|
||||||
|
int x264_coeff_last64_neon( dctcoef * );
|
||||||
|
|
||||||
|
// BIT_DEPTH = 8
|
||||||
|
#define x264_coeff_level_run4_aarch64 x264_template(coeff_level_run4_aarch64)
|
||||||
|
int x264_coeff_level_run4_aarch64( dctcoef *, x264_run_level_t * );
|
||||||
|
|
||||||
|
// BIT_DEPTH = 10
|
||||||
|
#define x264_coeff_level_run4_neon x264_template(coeff_level_run4_neon)
|
||||||
|
int x264_coeff_level_run4_neon( dctcoef *, x264_run_level_t * );
|
||||||
|
|
||||||
|
#define x264_coeff_level_run8_neon x264_template(coeff_level_run8_neon)
|
||||||
|
int x264_coeff_level_run8_neon( dctcoef *, x264_run_level_t * );
|
||||||
|
#define x264_coeff_level_run15_neon x264_template(coeff_level_run15_neon)
|
||||||
|
int x264_coeff_level_run15_neon( dctcoef *, x264_run_level_t * );
|
||||||
|
#define x264_coeff_level_run16_neon x264_template(coeff_level_run16_neon)
|
||||||
|
int x264_coeff_level_run16_neon( dctcoef *, x264_run_level_t * );
|
||||||
|
|
||||||
|
#define x264_denoise_dct_neon x264_template(denoise_dct_neon)
|
||||||
|
void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
|
||||||
|
|
||||||
|
#endif
|
||||||
263
common/arm/asm.S
Normal file
263
common/arm/asm.S
Normal file
@@ -0,0 +1,263 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* asm.S: arm utility macros
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2008-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Mans Rullgard <mans@mansr.com>
|
||||||
|
* David Conrad <lessen42@gmail.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "config.h"
|
||||||
|
|
||||||
|
.syntax unified
|
||||||
|
|
||||||
|
#ifdef __ELF__
|
||||||
|
.arch armv7-a
|
||||||
|
.fpu neon
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define GLUE(a, b) a ## b
|
||||||
|
#define JOIN(a, b) GLUE(a, b)
|
||||||
|
|
||||||
|
#ifdef PREFIX
|
||||||
|
# define BASE _x264_
|
||||||
|
# define SYM_PREFIX _
|
||||||
|
#else
|
||||||
|
# define BASE x264_
|
||||||
|
# define SYM_PREFIX
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef BIT_DEPTH
|
||||||
|
# define EXTERN_ASM JOIN(JOIN(BASE, BIT_DEPTH), _)
|
||||||
|
#else
|
||||||
|
# define EXTERN_ASM BASE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define X(s) JOIN(EXTERN_ASM, s)
|
||||||
|
#define X264(s) JOIN(BASE, s)
|
||||||
|
#define EXT(s) JOIN(SYM_PREFIX, s)
|
||||||
|
|
||||||
|
#ifdef __ELF__
|
||||||
|
# define ELF
|
||||||
|
#else
|
||||||
|
# define ELF @
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __MACH__
|
||||||
|
# define MACH
|
||||||
|
# define NONMACH @
|
||||||
|
#else
|
||||||
|
# define MACH @
|
||||||
|
# define NONMACH
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if HAVE_AS_FUNC
|
||||||
|
# define FUNC
|
||||||
|
#else
|
||||||
|
# define FUNC @
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if SYS_LINUX || SYS_OPENBSD
|
||||||
|
#define HAVE_SECTION_DATA_REL_RO 1
|
||||||
|
#else
|
||||||
|
#define HAVE_SECTION_DATA_REL_RO 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// require8 [val]: emit EABI build attribute tag 24 with the given value
// (default 1).  Has an effect on ELF targets only.
.macro  require8, val=1
ELF     .eabi_attribute 24, \val
.endm
|
||||||
|
|
||||||
|
// preserve8 [val]: emit EABI build attribute tag 25 with the given value
// (default 1).  Has an effect on ELF targets only.
.macro  preserve8, val=1
ELF     .eabi_attribute 25, \val
.endm
|
||||||
|
|
||||||
|
// function name [, export=1]
// Opens a function in .text, aligned with ".align 2".
//   export == 1 : the label is EXTERN_ASM followed by name, declared global
//                 (and hidden on ELF)
//   otherwise   : the label is the bare name (hidden on ELF, not global)
// A one-shot endfunc macro is defined alongside; it emits the ELF size
// directive and the optional .endfunc, then purges itself so that the
// next function can define it again.
.macro  function name, export=1
    .macro endfunc
.if \export
ELF     .size   EXTERN_ASM\name, . - EXTERN_ASM\name
.else
ELF     .size   \name, . - \name
.endif
FUNC    .endfunc
        .purgem endfunc
    .endm
        .text
        .align  2
.if \export == 1
        .global EXTERN_ASM\name
ELF     .hidden EXTERN_ASM\name
ELF     .type   EXTERN_ASM\name, %function
FUNC    .func   EXTERN_ASM\name
EXTERN_ASM\name:
.else
ELF     .hidden \name
ELF     .type   \name, %function
FUNC    .func   \name
\name:
.endif
.endm
|
||||||
|
|
||||||
|
// const name [, align=2] [, relocate=0]
// Opens a constant data object called name.
//   relocate set and HAVE_SECTION_DATA_REL_RO : section .data.rel.ro
//   otherwise : .rodata, or .const_data on Mach-O
// A one-shot endconst macro is defined alongside; it emits the ELF size
// directive and then purges itself.
.macro  const name, align=2, relocate=0
    .macro endconst
ELF     .size   \name, . - \name
        .purgem endconst
    .endm
.if HAVE_SECTION_DATA_REL_RO && \relocate
        .section        .data.rel.ro
.else
NONMACH .section        .rodata
MACH    .const_data
.endif
        .align          \align
\name:
.endm
|
||||||
|
|
||||||
|
// movrel rd, val: load the address of symbol val into rd.
//   PIC          : pc-relative offset kept in an inline literal word that is
//                  branched over, then added to pc (the +8 presumably
//                  matches the ARM-state pc read-ahead; see the FIXME)
//   HAVE_ARMV6T2 : movw/movt immediate pair
//   otherwise    : literal-pool load
.macro  movrel rd, val
#if defined(PIC)
        ldr             \rd,  1f
        b               2f
1:
@ FIXME: thumb
        .word           \val - (2f + 8)
2:
        add             \rd,  \rd,  pc
#elif HAVE_ARMV6T2
        movw            \rd, #:lower16:\val
        movt            \rd, #:upper16:\val
#else
        ldr             \rd, =\val
#endif
.endm
|
||||||
|
|
||||||
|
// movrelx rd, val, got: load the address of symbol val into rd.
//   PIC on ELF   : reads the entry for val out of the global offset table;
//                  the got register is overwritten with the table address
//   PIC on Apple : reads a non-lazy symbol pointer emitted for val
//   otherwise    : falls back to movrel and leaves got unused
.macro  movrelx rd, val, got
#if defined(PIC) && defined(__ELF__)
        ldr             \got, 2f
        ldr             \rd,  1f
        b               3f
1:
@ FIXME: thumb
        .word \val(GOT)
2:
        .word _GLOBAL_OFFSET_TABLE_ - (3f + 8)
3:
        add             \got, \got, pc
        ldr             \rd,  [\got, \rd]
#elif defined(PIC) && defined(__APPLE__)
        ldr             \rd,  1f
        b               2f
1:
@ FIXME: thumb
        .word           3f - (2f + 8)
2:
        ldr             \rd,  [pc, \rd]
        .non_lazy_symbol_pointer
3:
        .indirect_symbol \val
        .word           0
        .text
#else
        movrel          \rd,  \val
#endif
.endm
|
||||||
|
|
||||||
|
// movconst rd, val: load the constant val into rd.
//   HAVE_ARMV6T2 : movw for the low half; movt is emitted only when
//                  (val >> 16) is non-zero
//   otherwise    : literal-pool load
.macro  movconst rd, val
#if HAVE_ARMV6T2
        movw            \rd, #:lower16:\val
.if \val >> 16
        movt            \rd, #:upper16:\val
.endif
#else
        ldr             \rd, =\val
#endif
.endm
|
||||||
|
|
||||||
|
/* Stride constants for the assembly sources (FENC/FDEC presumably name
 * the encode and decode/reconstruction buffers; confirm against the C side). */
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
|
||||||
|
|
||||||
|
// HORIZ_ADD dest, a [, b]
// When b is given, first adds b into a as u16 lanes.  Then does two
// pairwise widening adds: u16 to u32 in place on a, and u32 to u64 into
// dest.  The a register is overwritten.
.macro  HORIZ_ADD dest, a, b
.ifnb \b
        vadd.u16        \a, \a, \b
.endif
        vpaddl.u16      \a, \a
        vpaddl.u32      \dest, \a
.endm
|
||||||
|
|
||||||
|
// SUMSUB_AB sum, diff, a, b: on s16 lanes, sum = a + b and then
// diff = a - b.  The add is issued first, so sum must not alias a or b
// if the difference is to use the original values.
.macro  SUMSUB_AB sum, diff, a, b
        vadd.s16        \sum,  \a, \b
        vsub.s16        \diff, \a, \b
.endm
|
||||||
|
|
||||||
|
// SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d: two back-to-back SUMSUB_AB
// steps, (s1, d1) from (a, b) and then (s2, d2) from (c, d).
.macro  SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
        SUMSUB_AB       \s1, \d1, \a, \b
        SUMSUB_AB       \s2, \d2, \c, \d
.endm
|
||||||
|
|
||||||
|
// ABS2 a b: replace both registers with their lane-wise s16 absolute value.
.macro  ABS2 a b
        vabs.s16        \a, \a
        vabs.s16        \b, \b
.endm
|
||||||
|
|
||||||
|
// HADAMARD dist, op, d1, d2, s1, s2
//   dist  = distance in elements (0 for vertical pass, 1/2 for horizontal
//           passes); 1 interleaves with vtrn.16, any other value with vtrn.32
//   op    = sumsub/amax (sum and diff / maximum of absolutes)
//   d1/2  = destination registers (amax writes d1 only)
//   s1/2  = source registers; overwritten by the vtrn, and by vabs for amax
.macro  HADAMARD dist, op, d1, d2, s1, s2
.if \dist == 1
        vtrn.16         \s1, \s2
.else
        vtrn.32         \s1, \s2
.endif
.ifc \op, sumsub
        SUMSUB_AB       \d1, \d2, \s1, \s2
.else
        vabs.s16        \s1, \s1
        vabs.s16        \s2, \s2
        vmax.s16        \d1, \s1, \s2
.endif
.endm
|
||||||
|
|
||||||
|
// TRANSPOSE8x8 r0 .. r7: in-place transpose across eight registers,
// built from three vtrn rounds at 32-, 16- and 8-bit granularity
// (register pairs four, two and one apart respectively).
.macro  TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r4
        vtrn.32         \r1, \r5
        vtrn.32         \r2, \r6
        vtrn.32         \r3, \r7

        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.16         \r4, \r6
        vtrn.16         \r5, \r7

        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        vtrn.8          \r4, \r5
        vtrn.8          \r6, \r7
.endm
|
||||||
|
|
||||||
|
// TRANSPOSE4x4 r0 .. r3: in-place transpose across four registers using
// a vtrn.16 round (registers two apart) and then a vtrn.8 round
// (adjacent registers).
.macro  TRANSPOSE4x4 r0 r1 r2 r3
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3

        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
.endm
|
||||||
|
|
||||||
|
// TRANSPOSE4x4_16 d0 .. d3: in-place transpose of 16-bit elements across
// four registers using a vtrn.32 round (registers two apart) and then a
// vtrn.16 round (adjacent registers).
.macro  TRANSPOSE4x4_16 d0 d1 d2 d3
        vtrn.32         \d0, \d2
        vtrn.32         \d1, \d3

        vtrn.16         \d0, \d1
        vtrn.16         \d2, \d3
.endm
|
||||||
84
common/arm/bitstream-a.S
Normal file
84
common/arm/bitstream-a.S
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* bitstream-a.S: arm bitstream functions
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2014-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Janne Grunau <janne-x264@jannau.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "asm.S"
|
||||||
|
|
||||||
|
// nal_escape_neon: r0 = dst, r1 = src, r2 = end (first three arguments).
// Copies the bytes in [src, end) to dst and writes an extra byte 3 in
// front of every byte <= 3 that follows two zero bytes in the output
// history.  The history starts out as 0xff, 0xff.  On return r0 points
// just past the last byte written.
//
// Register use:
//   q0    previous 16-byte vector; its top two lanes are the history
//   q8    constant 4 in every byte lane
//   r3    constant 3, the byte that gets inserted
//   lr    negative count of bytes left (src - end), also the scalar index
//   r5    scalar history, newest byte in the low 8 bits
function nal_escape_neon
        push            {r4-r5,lr}
        vmov.u8         q0,  #0xff
        vmov.u8         q8,  #4
        mov             r3,  #3
        subs            lr,  r1,  r2
        beq             99f                     // empty input
0:
        cmn             lr,  #15
        blt             16f                     // 16 or more bytes left: vector path
        mov             r1,  r2                 // short tail: index backwards from end
        b               100f
16:
        vld1.8          {q1}, [r1]!
        vext.8          q2,  q0,  q1,  #14      // each lane: the byte two positions back
        vext.8          q3,  q0,  q1,  #15      // each lane: the byte one position back
        vcgt.u8         q11, q8,  q1            // current byte < 4
        vceq.u8         q9,  q2,  #0
        vceq.u8         q10, q3,  #0
        vand            q9,  q9,  q11
        vand            q9,  q9,  q10           // lanes that need a byte inserted
        vshrn.u16       d22, q9,  #4
        vmov            ip,  lr,  d22
        orrs            ip,  ip,  lr
        beq             16f                     // nothing to insert: store the vector as is
        mov             lr,  #-16               // redo these 16 bytes one at a time
100:
        vmov.u8         r5,  d1[6]
        vmov.u8         r4,  d1[7]
        orr             r5,  r4,  r5,  lsl #8   // r5 = last two bytes of history
101:
        ldrb            r4,  [r1, lr]
        orr             ip,  r4,  r5,  lsl #16  // two history bytes above the new byte
        cmp             ip,  #3
        bhi             102f
        strb            r3,  [r0], #1           // insert the byte 3
        orr             r5,  r3,  r5,  lsl #8
102:
        adds            lr,  lr,  #1
        strb            r4,  [r0], #1
        orr             r5,  r4,  r5,  lsl #8
        blt             101b
        subs            lr,  r1,  r2
        lsr             ip,  r5,  #8
        vmov.u8         d1[6], ip               // write the history back into q0
        vmov.u8         d1[7], r5
        blt             0b

        pop             {r4-r5,pc}
16:
        subs            lr,  r1,  r2
        vst1.8          {q1}, [r0]!
        vmov            q0,  q1
        blt             0b
99:
        pop             {r4-r5,pc}
endfunc
|
||||||
32
common/arm/bitstream.h
Normal file
32
common/arm/bitstream.h
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* bitstream.h: arm bitstream functions
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2017-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_ARM_BITSTREAM_H
|
||||||
|
#define X264_ARM_BITSTREAM_H
|
||||||
|
|
||||||
|
#define x264_nal_escape_neon x264_template(nal_escape_neon)
|
||||||
|
uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end );
|
||||||
|
|
||||||
|
#endif
|
||||||
108
common/arm/cpu-a.S
Normal file
108
common/arm/cpu-a.S
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* cpu-a.S: arm cpu detection
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "asm.S"
|
||||||
|
|
||||||
|
.align 2
|
||||||
|
|
||||||
|
// done in gas because .fpu neon overrides the refusal to assemble
|
||||||
|
// instructions the selected -march/-mcpu doesn't support
|
||||||
|
function cpu_neon_test
|
||||||
|
vadd.i16 q0, q0, q0
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// cpu_enable_armv7_counter (file-local): turn on the performance counters
// and the cycle counter.
// return: 0 on success
//         1 if counters were already enabled
//         9 if lo-res counters were already enabled
// PMNC is rewritten only when the counters were not enabled on entry.
// Changes r0 and r2.
function cpu_enable_armv7_counter, export=0
        mrc             p15, 0, r2, c9, c12, 0      // read PMNC
        ands            r0,  r2,  #1                // Z set: counters were off
        andne           r0,  r2,  #9                // already on: report bits 0 and 3

        orr             r2,  r2,  #1                // enable counters
        bic             r2,  r2,  #8                // full resolution
        mcreq           p15, 0, r2, c9, c12, 0      // write PMNC
        mov             r2,  #1 << 31               // enable cycle counter
        mcr             p15, 0, r2, c9, c12, 1      // write CNTENS
        bx              lr
endfunc
|
||||||
|
|
||||||
|
// cpu_disable_armv7_counter (file-local): clear the enable bit in PMNC.
// Changes r0.
function cpu_disable_armv7_counter, export=0
        mrc             p15, 0, r0, c9, c12, 0      // read PMNC
        bic             r0,  r0,  #1                // disable counters
        mcr             p15, 0, r0, c9, c12, 0      // write PMNC
        bx              lr
endfunc
|
||||||
|
|
||||||
|
|
||||||
|
// READ_TIME r: read coprocessor register c9, c13, 0 into r
// (presumably the cycle counter that cpu_enable_armv7_counter turns on).
.macro  READ_TIME r
        mrc             p15, 0, \r, c9, c13, 0
.endm
|
||||||
|
|
||||||
|
// cpu_fast_neon_mrc_test
// return: 0 if neon -> arm transfers take more than 10 cycles
//         nonzero otherwise
// Also returns 0 straight away when the register read at entry is zero.
//
// Register use in the timing loop:
//   r0    result of cpu_enable_armv7_counter
//   r3    sum of the accepted timings
//   ip    accepted samples still wanted (starts at 4)
//   r6    attempts still allowed (starts at 4)
//   r5    inner repeat count: 1, or 64 when the lo-res bit was set
function cpu_fast_neon_mrc_test
        // check for user access to performance counters
        mrc             p15, 0, r0, c9, c14, 0
        cmp             r0,  #0
        bxeq            lr

        push            {r4-r6,lr}
        bl              cpu_enable_armv7_counter
        ands            r1,  r0,  #8                // were lo-res counters in use
        mov             r3,  #0
        mov             ip,  #4
        mov             r6,  #4
        moveq           r5,  #1
        movne           r5,  #64

average_loop:
        mov             r4,  r5
        READ_TIME       r1
1:      subs            r4,  r4,  #1
.rept 8
        vmov.u32        lr,  d0[0]
        add             lr,  lr,  lr
.endr
        bgt             1b
        READ_TIME       r2

        subs            r6,  r6,  #1
        sub             r2,  r2,  r1
        cmpgt           r2,  #30 << 3               // assume context switch if it took over 30 cycles
        addle           r3,  r3,  r2
        subsle          ip,  ip,  #1
        bgt             average_loop

        // disable counters if we enabled them
        ands            r0,  r0,  #1
        bleq            cpu_disable_armv7_counter

        lsr             r0,  r3,  #5                // sum / 32
        cmp             r0,  #10
        movgt           r0,  #0
        pop             {r4-r6,pc}
endfunc
|
||||||
764
common/arm/dct-a.S
Normal file
764
common/arm/dct-a.S
Normal file
@@ -0,0 +1,764 @@
|
|||||||
|
/****************************************************************************
|
||||||
|
* dct-a.S: arm transform and zigzag
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
* Martin Storsjo <martin@martin.st>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "asm.S"
|
||||||
|
|
||||||
|
// Byte-shuffle table used by zigzag_scan_4x4_frame_neon.
// Four rows of 8 byte indices, one row per vtbl.8 lookup; each pair of
// bytes selects one 16-bit coefficient. Indices are relative to the first
// register of the table list of the corresponding vtbl (d0, d1, d0, d2).
const scan4x4_frame, align=4
|
||||||
|
.byte 0,1, 8,9, 2,3, 4,5
|
||||||
|
.byte 2,3, 8,9, 16,17, 10,11
|
||||||
|
.byte 12,13, 6,7, 14,15, 20,21
|
||||||
|
.byte 10,11, 12,13, 6,7, 14,15
|
||||||
|
endconst
|
||||||
|
|
||||||
|
.text
|
||||||
|
|
||||||
|
// sum = a + (b>>shift) sub = (a>>shift) - b
|
||||||
|
// Signed 16-bit lanes, arithmetic shift. t0 and t1 are scratch registers
// and are clobbered. Both shifts are computed before sum or sub is written,
// so sub may alias a or b and sum may alias a; sum must not alias b because
// b is read again by the final vsub.
.macro SUMSUB_SHR shift sum sub a b t0 t1
|
||||||
|
vshr.s16 \t0, \b, #\shift
|
||||||
|
vshr.s16 \t1, \a, #\shift
|
||||||
|
vadd.s16 \sum, \a, \t0
|
||||||
|
vsub.s16 \sub, \t1, \b
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// sum = (a>>shift) + b sub = a - (b>>shift)
|
||||||
|
// Signed 16-bit lanes, arithmetic shift. t0 and t1 are scratch registers
// and are clobbered. sub may alias a or b and sum may alias b; sum must not
// alias a because a is read again by the final vsub.
.macro SUMSUB_SHR2 shift sum sub a b t0 t1
|
||||||
|
vshr.s16 \t0, \a, #\shift
|
||||||
|
vshr.s16 \t1, \b, #\shift
|
||||||
|
vadd.s16 \sum, \t0, \b
|
||||||
|
vsub.s16 \sub, \a, \t1
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// a += 1.5*ma b -= 1.5*mb
|
||||||
|
// The factor 1.5*x is formed as x + (x>>1) on signed 16-bit lanes, so the
// half is rounded towards minus infinity. a and b are updated in place;
// ma and mb are only read; t0 and t1 are scratch and are clobbered.
.macro SUMSUB_15 a b ma mb t0 t1
|
||||||
|
vshr.s16 \t0, \ma, #1
|
||||||
|
vshr.s16 \t1, \mb, #1
|
||||||
|
vadd.s16 \t0, \t0, \ma
|
||||||
|
vadd.s16 \t1, \t1, \mb
|
||||||
|
vadd.s16 \a, \a, \t0
|
||||||
|
vsub.s16 \b, \b, \t1
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
// void dct4x4dc_neon( int16_t d[16] ) (prototype in dct.h)
// In place: loads the 16 coefficients at r0 (128-bit aligned access) into
// d0-d3, transforms them and stores the result back to the same address.
// The last stage halves with rounding: vrhadd gives (a + b + 1) >> 1 and
// the vhsub lanes use a copy of a with 1 added, giving (a - b + 1) >> 1.
// SUMSUB_ABCD and HADAMARD are defined outside this chunk; their exact
// register semantics are not visible here.
// Uses d0-d7, d16, d17 and d31 plus whatever the helper macros clobber.
function dct4x4dc_neon
|
||||||
|
vld1.64 {d0-d3}, [r0,:128]
|
||||||
|
SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3
|
||||||
|
SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7
|
||||||
|
|
||||||
|
vmov.s16 d31, #1 // rounding term for the halving subtracts below
|
||||||
|
HADAMARD 1, sumsub, q2, q3, q0, q1
|
||||||
|
vtrn.32 d4, d5
|
||||||
|
vadd.s16 d16, d4, d31
|
||||||
|
vtrn.32 d6, d7
|
||||||
|
vadd.s16 d17, d6, d31
|
||||||
|
vrhadd.s16 d0, d4, d5
|
||||||
|
vhsub.s16 d1, d16, d5
|
||||||
|
vhsub.s16 d2, d17, d7
|
||||||
|
vrhadd.s16 d3, d6, d7
|
||||||
|
vst1.64 {d0-d3}, [r0,:128]
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// void idct4x4dc_neon( int16_t d[16] ) (prototype in dct.h)
// In place: loads the 16 coefficients at r0 (128-bit aligned access),
// applies the same two SUMSUB_ABCD passes as dct4x4dc_neon followed by
// three HADAMARD steps, and stores the result back. Unlike dct4x4dc_neon
// there is no halving or rounding stage.
// SUMSUB_ABCD and HADAMARD are defined outside this chunk.
function idct4x4dc_neon
|
||||||
|
vld1.64 {d0-d3}, [r0,:128]
|
||||||
|
SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3
|
||||||
|
SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7
|
||||||
|
|
||||||
|
HADAMARD 1, sumsub, q2, q3, q0, q1
|
||||||
|
HADAMARD 2, sumsub, d0, d1, d4, d5
|
||||||
|
HADAMARD 2, sumsub, d3, d2, d6, d7
|
||||||
|
vst1.64 {d0-d3}, [r0,:128]
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
|
||||||
|
// One 4-point forward transform pass on 16-bit lanes.
// Outputs: d0-d3. Inputs: d4-d7 (all four are overwritten as scratch).
// Assuming SUMSUB_AB sum, diff, a, b yields sum = a + b and diff = a - b
// (the macro is defined outside this chunk), with x0..x3 = d4..d7:
//   d0 = (x0 + x3) + (x1 + x2)
//   d1 = 2*(x0 - x3) + (x1 - x2)
//   d2 = (x0 + x3) - (x1 + x2)
//   d3 = (x0 - x3) - 2*(x1 - x2)
// The output registers must be distinct from the input registers.
.macro DCT_1D d0 d1 d2 d3 d4 d5 d6 d7
|
||||||
|
SUMSUB_AB \d1, \d6, \d5, \d6
|
||||||
|
SUMSUB_AB \d3, \d7, \d4, \d7
|
||||||
|
vadd.s16 \d0, \d3, \d1
|
||||||
|
vadd.s16 \d4, \d7, \d7
|
||||||
|
vadd.s16 \d5, \d6, \d6
|
||||||
|
vsub.s16 \d2, \d3, \d1
|
||||||
|
vadd.s16 \d1, \d4, \d6
|
||||||
|
vsub.s16 \d3, \d7, \d5
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// void sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 )
// (prototype in dct.h; r0 = dct, r1 = pix1, r2 = pix2)
// Loads four rows of 4 pixels from pix1 (row stride FENC_STRIDE) and pix2
// (row stride FDEC_STRIDE), forms the widened 16-bit differences
// pix1 - pix2, runs DCT_1D, a 4x4 transpose and DCT_1D again, then stores
// the 16 coefficients to r0 (128-bit aligned access).
// Each 32-bit row is loaded replicated into both halves of a d register;
// only the low half of each difference (d16, d18, d20, d22) is used.
// r1 and r2 are advanced by four rows; r3 and ip are clobbered.
// TRANSPOSE4x4_16 is defined outside this chunk.
function sub4x4_dct_neon
|
||||||
|
mov r3, #FENC_STRIDE
|
||||||
|
mov ip, #FDEC_STRIDE
|
||||||
|
vld1.32 {d0[]}, [r1,:32], r3
|
||||||
|
vld1.32 {d1[]}, [r2,:32], ip
|
||||||
|
vld1.32 {d2[]}, [r1,:32], r3
|
||||||
|
vsubl.u8 q8, d0, d1
|
||||||
|
vld1.32 {d3[]}, [r2,:32], ip
|
||||||
|
vld1.32 {d4[]}, [r1,:32], r3
|
||||||
|
vsubl.u8 q9, d2, d3
|
||||||
|
vld1.32 {d5[]}, [r2,:32], ip
|
||||||
|
vld1.32 {d6[]}, [r1,:32], r3
|
||||||
|
vsubl.u8 q10, d4, d5
|
||||||
|
vld1.32 {d7[]}, [r2,:32], ip
|
||||||
|
vsubl.u8 q11, d6, d7
|
||||||
|
|
||||||
|
DCT_1D d0, d1, d2, d3, d16, d18, d20, d22
|
||||||
|
TRANSPOSE4x4_16 d0, d1, d2, d3
|
||||||
|
DCT_1D d4, d5, d6, d7, d0, d1, d2, d3
|
||||||
|
vst1.64 {d4-d7}, [r0,:128]
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
function sub8x4_dct_neon, export=0
|
||||||
|
vld1.64 {d0}, [r1,:64], r3
|
||||||
|
vld1.64 {d1}, [r2,:64], ip
|
||||||
|
vsubl.u8 q8, d0, d1
|
||||||
|
vld1.64 {d2}, [r1,:64], r3
|
||||||
|
vld1.64 {d3}, [r2,:64], ip
|
||||||
|
vsubl.u8 q9, d2, d3
|
||||||
|
vld1.64 {d4}, [r1,:64], r3
|
||||||
|
vld1.64 {d5}, [r2,:64], ip
|
||||||
|
vsubl.u8 q10, d4, d5
|
||||||
|
vld1.64 {d6}, [r1,:64], r3
|
||||||
|
vld1.64 {d7}, [r2,:64], ip
|
||||||
|
vsubl.u8 q11, d6, d7
|
||||||
|
|
||||||
|
DCT_1D q0, q1, q2, q3, q8, q9, q10, q11
|
||||||
|
TRANSPOSE4x4_16 q0, q1, q2, q3
|
||||||
|
|
||||||
|
SUMSUB_AB q8, q12, q0, q3
|
||||||
|
SUMSUB_AB q9, q10, q1, q2
|
||||||
|
vadd.i16 q13, q12, q12
|
||||||
|
vadd.i16 q11, q10, q10
|
||||||
|
vadd.i16 d0, d16, d18
|
||||||
|
vadd.i16 d1, d26, d20
|
||||||
|
vsub.i16 d2, d16, d18
|
||||||
|
vsub.i16 d3, d24, d22
|
||||||
|
vst1.64 {d0-d1}, [r0,:128]!
|
||||||
|
vadd.i16 d4, d17, d19
|
||||||
|
vadd.i16 d5, d27, d21
|
||||||
|
vst1.64 {d2-d3}, [r0,:128]!
|
||||||
|
vsub.i16 d6, d17, d19
|
||||||
|
vsub.i16 d7, d25, d23
|
||||||
|
vst1.64 {d4-d5}, [r0,:128]!
|
||||||
|
vst1.64 {d6-d7}, [r0,:128]!
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// void sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 )
// (prototype in dct.h; r0 = dct, r1 = pix1, r2 = pix2)
// Runs sub8x4_dct_neon twice: once for rows 0-3 and once for rows 4-7.
// The helper expects r3 = FENC_STRIDE and ip = FDEC_STRIDE and leaves
// r0, r1 and r2 advanced past the rows it processed, so the second call
// continues where the first stopped. The second call is a tail branch
// that returns straight to the caller of this function.
function sub8x8_dct_neon
|
||||||
|
push {lr}
|
||||||
|
mov r3, #FENC_STRIDE
|
||||||
|
mov ip, #FDEC_STRIDE
|
||||||
|
bl sub8x4_dct_neon
|
||||||
|
pop {lr}
|
||||||
|
b sub8x4_dct_neon
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// void sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 )
// (prototype in dct.h; r0 = dct, r1 = pix1, r2 = pix2)
// Processes the 16x16 block as four 8x8 quadrants, each with two calls to
// sub8x4_dct_neon, in the order top-left, top-right, bottom-left,
// bottom-right. Between quadrants r1 and r2 are stepped back by hand:
// minus (8 rows - 8) moves up 8 rows and right 8 pixels, minus 8 moves
// back to column 0 of the next 8 rows. r0 is advanced by the helper only.
// The last call is a tail branch.
function sub16x16_dct_neon
|
||||||
|
push {lr}
|
||||||
|
mov r3, #FENC_STRIDE
|
||||||
|
mov ip, #FDEC_STRIDE
|
||||||
|
bl sub8x4_dct_neon
|
||||||
|
bl sub8x4_dct_neon
|
||||||
|
sub r1, r1, #8*FENC_STRIDE-8
|
||||||
|
sub r2, r2, #8*FDEC_STRIDE-8
|
||||||
|
bl sub8x4_dct_neon
|
||||||
|
bl sub8x4_dct_neon
|
||||||
|
sub r1, r1, #8
|
||||||
|
sub r2, r2, #8
|
||||||
|
bl sub8x4_dct_neon
|
||||||
|
bl sub8x4_dct_neon
|
||||||
|
sub r1, r1, #8*FENC_STRIDE-8
|
||||||
|
sub r2, r2, #8*FDEC_STRIDE-8
|
||||||
|
bl sub8x4_dct_neon
|
||||||
|
pop {lr}
|
||||||
|
b sub8x4_dct_neon
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
|
||||||
|
.macro DCT8_1D type
|
||||||
|
SUMSUB_AB q2, q1, q11, q12 // s34/d34
|
||||||
|
SUMSUB_AB q3, q11, q10, q13 // s25/d25
|
||||||
|
SUMSUB_AB q13, q10, q9, q14 // s16/d16
|
||||||
|
SUMSUB_AB q14, q8, q8, q15 // s07/d07
|
||||||
|
|
||||||
|
SUMSUB_AB q9, q2, q14, q2 // a0/a2
|
||||||
|
SUMSUB_AB q12, q14, q13, q3 // a1/a3
|
||||||
|
|
||||||
|
SUMSUB_AB q3, q13, q8, q1 // a6/a5
|
||||||
|
vshr.s16 q0, q10, #1
|
||||||
|
vshr.s16 q15, q11, #1
|
||||||
|
vadd.s16 q0, q0, q10
|
||||||
|
vadd.s16 q15, q15, q11
|
||||||
|
vsub.s16 q3, q3, q0
|
||||||
|
vsub.s16 q13, q13, q15
|
||||||
|
|
||||||
|
SUMSUB_AB q0, q15, q10, q11 // a4/a7
|
||||||
|
vshr.s16 q10, q8, #1
|
||||||
|
vshr.s16 q11, q1, #1
|
||||||
|
vadd.s16 q10, q10, q8
|
||||||
|
vadd.s16 q11, q11, q1
|
||||||
|
vadd.s16 q10, q0, q10
|
||||||
|
vadd.s16 q15, q15, q11
|
||||||
|
|
||||||
|
SUMSUB_AB q8, q12, q9, q12
|
||||||
|
SUMSUB_SHR 2, q9, q15, q10, q15, q0, q1
|
||||||
|
SUMSUB_SHR 1, q10, q14, q2, q14, q0, q1
|
||||||
|
SUMSUB_SHR2 2, q11, q13, q3, q13, q0, q1
|
||||||
|
.endm
|
||||||
|
|
||||||
|
function sub8x8_dct8_neon
|
||||||
|
mov r3, #FENC_STRIDE
|
||||||
|
mov ip, #FDEC_STRIDE
|
||||||
|
vld1.64 {d16}, [r1,:64], r3
|
||||||
|
vld1.64 {d17}, [r2,:64], ip
|
||||||
|
vsubl.u8 q8, d16, d17
|
||||||
|
vld1.64 {d18}, [r1,:64], r3
|
||||||
|
vld1.64 {d19}, [r2,:64], ip
|
||||||
|
vsubl.u8 q9, d18, d19
|
||||||
|
vld1.64 {d20}, [r1,:64], r3
|
||||||
|
vld1.64 {d21}, [r2,:64], ip
|
||||||
|
vsubl.u8 q10, d20, d21
|
||||||
|
vld1.64 {d22}, [r1,:64], r3
|
||||||
|
vld1.64 {d23}, [r2,:64], ip
|
||||||
|
vsubl.u8 q11, d22, d23
|
||||||
|
vld1.64 {d24}, [r1,:64], r3
|
||||||
|
vld1.64 {d25}, [r2,:64], ip
|
||||||
|
vsubl.u8 q12, d24, d25
|
||||||
|
vld1.64 {d26}, [r1,:64], r3
|
||||||
|
vld1.64 {d27}, [r2,:64], ip
|
||||||
|
vsubl.u8 q13, d26, d27
|
||||||
|
vld1.64 {d28}, [r1,:64], r3
|
||||||
|
vld1.64 {d29}, [r2,:64], ip
|
||||||
|
vsubl.u8 q14, d28, d29
|
||||||
|
vld1.64 {d30}, [r1,:64], r3
|
||||||
|
vld1.64 {d31}, [r2,:64], ip
|
||||||
|
vsubl.u8 q15, d30, d31
|
||||||
|
|
||||||
|
DCT8_1D row
|
||||||
|
vswp d17, d24 // 8, 12
|
||||||
|
vswp d21, d28 // 10,14
|
||||||
|
vtrn.32 q8, q10
|
||||||
|
vtrn.32 q12, q14
|
||||||
|
|
||||||
|
vswp d19, d26 // 9, 13
|
||||||
|
vswp d23, d30 // 11,15
|
||||||
|
vtrn.32 q9, q11
|
||||||
|
vtrn.32 q13, q15
|
||||||
|
|
||||||
|
vtrn.16 q10, q11
|
||||||
|
vtrn.16 q12, q13
|
||||||
|
vtrn.16 q8, q9
|
||||||
|
vtrn.16 q14, q15
|
||||||
|
DCT8_1D col
|
||||||
|
|
||||||
|
vst1.64 {d16-d19}, [r0,:128]!
|
||||||
|
vst1.64 {d20-d23}, [r0,:128]!
|
||||||
|
vst1.64 {d24-d27}, [r0,:128]!
|
||||||
|
vst1.64 {d28-d31}, [r0,:128]!
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// void sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
// (prototype in dct.h; r0 = dct, r1 = pix1, r2 = pix2)
// Calls sub8x8_dct8_neon once per 8x8 quadrant in the order top-left,
// top-right, bottom-left, bottom-right. The callee sets up its own strides
// and leaves r0 past the 64 coefficients it wrote and r1, r2 eight rows
// further down, so only r1 and r2 are repositioned between the calls.
// The last call is a tail branch taken after lr has been restored.
// X() wraps the callee name; it is defined outside this chunk.
function sub16x16_dct8_neon
|
||||||
|
push {lr}
|
||||||
|
bl X(sub8x8_dct8_neon)
|
||||||
|
sub r1, r1, #FENC_STRIDE*8 - 8
|
||||||
|
sub r2, r2, #FDEC_STRIDE*8 - 8
|
||||||
|
bl X(sub8x8_dct8_neon)
|
||||||
|
sub r1, r1, #8
|
||||||
|
sub r2, r2, #8
|
||||||
|
bl X(sub8x8_dct8_neon)
|
||||||
|
pop {lr}
|
||||||
|
sub r1, r1, #FENC_STRIDE*8 - 8
|
||||||
|
sub r2, r2, #FDEC_STRIDE*8 - 8
|
||||||
|
b X(sub8x8_dct8_neon)
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
|
||||||
|
// First part of IDCT (minus final SUMSUB_AB)
|
||||||
|
// Outputs come first in the parameter list (d4-d7), inputs last (d0-d3).
// Assuming SUMSUB_AB sum, diff, a, b yields sum = a + b and diff = a - b
// (the macro is defined outside this chunk):
//   d4 = d0 + d2          d5 = d0 - d2
//   d6 = d1 + (d3 >> 1)   d7 = (d1 >> 1) - d3
// The inputs are only read. Callers in this file finish the pass with
// SUMSUB_AB on the d4-d7 results.
.macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
|
||||||
|
SUMSUB_AB \d4, \d5, \d0, \d2
|
||||||
|
vshr.s16 \d7, \d1, #1
|
||||||
|
vshr.s16 \d6, \d3, #1
|
||||||
|
vsub.s16 \d7, \d7, \d3
|
||||||
|
vadd.s16 \d6, \d6, \d1
|
||||||
|
.endm
|
||||||
|
|
||||||
|
function add4x4_idct_neon
|
||||||
|
mov r2, #FDEC_STRIDE
|
||||||
|
vld1.64 {d0-d3}, [r1,:128]
|
||||||
|
|
||||||
|
IDCT_1D d4, d5, d6, d7, d0, d1, d2, d3
|
||||||
|
vld1.32 {d30[0]}, [r0,:32], r2
|
||||||
|
SUMSUB_AB q0, q1, q2, q3
|
||||||
|
|
||||||
|
TRANSPOSE4x4_16 d0, d1, d3, d2
|
||||||
|
|
||||||
|
IDCT_1D d4, d5, d6, d7, d0, d1, d3, d2
|
||||||
|
vld1.32 {d30[1]}, [r0,:32], r2
|
||||||
|
SUMSUB_AB q0, q1, q2, q3
|
||||||
|
|
||||||
|
vrshr.s16 q0, q0, #6
|
||||||
|
vld1.32 {d31[1]}, [r0,:32], r2
|
||||||
|
vrshr.s16 q1, q1, #6
|
||||||
|
vld1.32 {d31[0]}, [r0,:32], r2
|
||||||
|
|
||||||
|
sub r0, r0, r2, lsl #2
|
||||||
|
vaddw.u8 q0, q0, d30
|
||||||
|
vaddw.u8 q1, q1, d31
|
||||||
|
vqmovun.s16 d0, q0
|
||||||
|
vqmovun.s16 d2, q1
|
||||||
|
|
||||||
|
vst1.32 {d0[0]}, [r0,:32], r2
|
||||||
|
vst1.32 {d0[1]}, [r0,:32], r2
|
||||||
|
vst1.32 {d2[1]}, [r0,:32], r2
|
||||||
|
vst1.32 {d2[0]}, [r0,:32], r2
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
function add8x4_idct_neon, export=0
|
||||||
|
vld1.64 {d0-d3}, [r1,:128]!
|
||||||
|
IDCT_1D d16, d18, d20, d22, d0, d1, d2, d3
|
||||||
|
vld1.64 {d4-d7}, [r1,:128]!
|
||||||
|
IDCT_1D d17, d19, d21, d23, d4, d5, d6, d7
|
||||||
|
SUMSUB_AB q0, q3, q8, q10
|
||||||
|
SUMSUB_AB q1, q2, q9, q11
|
||||||
|
|
||||||
|
TRANSPOSE4x4_16 q0, q1, q2, q3
|
||||||
|
|
||||||
|
IDCT_1D q8, q9, q10, q11, q0, q1, q2, q3
|
||||||
|
SUMSUB_AB q0, q3, q8, q10
|
||||||
|
SUMSUB_AB q1, q2, q9, q11
|
||||||
|
|
||||||
|
vrshr.s16 q0, q0, #6
|
||||||
|
vld1.32 {d28}, [r0,:64], r2
|
||||||
|
vrshr.s16 q1, q1, #6
|
||||||
|
vld1.32 {d29}, [r0,:64], r2
|
||||||
|
vrshr.s16 q2, q2, #6
|
||||||
|
vld1.32 {d30}, [r0,:64], r2
|
||||||
|
vrshr.s16 q3, q3, #6
|
||||||
|
vld1.32 {d31}, [r0,:64], r2
|
||||||
|
|
||||||
|
sub r0, r0, r2, lsl #2
|
||||||
|
vaddw.u8 q0, q0, d28
|
||||||
|
vaddw.u8 q1, q1, d29
|
||||||
|
vaddw.u8 q2, q2, d30
|
||||||
|
vaddw.u8 q3, q3, d31
|
||||||
|
|
||||||
|
vqmovun.s16 d0, q0
|
||||||
|
vqmovun.s16 d1, q1
|
||||||
|
vst1.32 {d0}, [r0,:64], r2
|
||||||
|
vqmovun.s16 d2, q2
|
||||||
|
vst1.32 {d1}, [r0,:64], r2
|
||||||
|
vqmovun.s16 d3, q3
|
||||||
|
vst1.32 {d2}, [r0,:64], r2
|
||||||
|
vst1.32 {d3}, [r0,:64], r2
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// void add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] )
// (prototype in dct.h; r0 = p_dst, r1 = dct)
// Runs add8x4_idct_neon twice, for rows 0-3 and rows 4-7. The helper
// expects r2 = FDEC_STRIDE, consumes 64 bytes of coefficients from r1 and
// leaves r0 four rows further down. The return address is parked in ip,
// which the helper does not touch, instead of on the stack; the second
// call is a tail branch.
function add8x8_idct_neon
|
||||||
|
mov r2, #FDEC_STRIDE
|
||||||
|
mov ip, lr
|
||||||
|
bl add8x4_idct_neon
|
||||||
|
mov lr, ip
|
||||||
|
b add8x4_idct_neon
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// void add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] )
// (prototype in dct.h; r0 = p_dst, r1 = dct)
// Processes the four 8x8 quadrants in the order top-left, top-right,
// bottom-left, bottom-right, two add8x4_idct_neon calls each. r1 walks
// linearly through the coefficients inside the helper; r0 is stepped
// between quadrants: minus (8 rows - 8) moves up 8 rows and right
// 8 pixels, minus 8 returns to column 0 of the lower half.
// The return address is kept in ip; the last call is a tail branch.
function add16x16_idct_neon
|
||||||
|
mov r2, #FDEC_STRIDE
|
||||||
|
mov ip, lr
|
||||||
|
bl add8x4_idct_neon
|
||||||
|
bl add8x4_idct_neon
|
||||||
|
sub r0, r0, #8*FDEC_STRIDE-8
|
||||||
|
bl add8x4_idct_neon
|
||||||
|
bl add8x4_idct_neon
|
||||||
|
sub r0, r0, #8
|
||||||
|
bl add8x4_idct_neon
|
||||||
|
bl add8x4_idct_neon
|
||||||
|
sub r0, r0, #8*FDEC_STRIDE-8
|
||||||
|
bl add8x4_idct_neon
|
||||||
|
mov lr, ip
|
||||||
|
b add8x4_idct_neon
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
|
||||||
|
.macro IDCT8_1D type
|
||||||
|
.ifc \type, col
|
||||||
|
vswp d21, d28
|
||||||
|
.endif
|
||||||
|
SUMSUB_AB q0, q1, q8, q12 // a0/a2
|
||||||
|
.ifc \type, row
|
||||||
|
vld1.64 {d28-d31}, [r1,:128]!
|
||||||
|
.else
|
||||||
|
vswp d19, d26
|
||||||
|
.endif
|
||||||
|
SUMSUB_SHR 1, q2, q3, q10, q14, q8, q12 // a6/a4
|
||||||
|
.ifc \type, col
|
||||||
|
vswp d23, d30
|
||||||
|
.endif
|
||||||
|
SUMSUB_AB q8, q10, q13, q11
|
||||||
|
SUMSUB_15 q8, q10, q9, q15, q12, q14 // a7/a1
|
||||||
|
SUMSUB_AB q14, q15, q15, q9
|
||||||
|
SUMSUB_15 q15, q14, q13, q11, q12, q9 // a5/a3
|
||||||
|
|
||||||
|
SUMSUB_SHR 2, q13, q14, q14, q15, q11, q9 // b3/b5
|
||||||
|
SUMSUB_SHR2 2, q12, q15, q8, q10, q11, q9 // b1/b7
|
||||||
|
|
||||||
|
SUMSUB_AB q10, q2, q0, q2 // b0/b6
|
||||||
|
SUMSUB_AB q11, q3, q1, q3 // b2/b4
|
||||||
|
|
||||||
|
SUMSUB_AB q8, q15, q10, q15
|
||||||
|
SUMSUB_AB q9, q14, q11, q14
|
||||||
|
SUMSUB_AB q10, q13, q3, q13
|
||||||
|
.ifc \type, row
|
||||||
|
vtrn.16 q8, q9
|
||||||
|
.endif
|
||||||
|
SUMSUB_AB q11, q12, q2, q12
|
||||||
|
.endm
|
||||||
|
|
||||||
|
function add8x8_idct8_neon
|
||||||
|
mov r2, #FDEC_STRIDE
|
||||||
|
vld1.64 {d16-d19}, [r1,:128]!
|
||||||
|
vld1.64 {d20-d23}, [r1,:128]!
|
||||||
|
vld1.64 {d24-d27}, [r1,:128]!
|
||||||
|
|
||||||
|
IDCT8_1D row
|
||||||
|
vtrn.16 q10, q11
|
||||||
|
vtrn.16 q12, q13
|
||||||
|
vtrn.16 q14, q15
|
||||||
|
vtrn.32 q8, q10
|
||||||
|
vtrn.32 q9, q11
|
||||||
|
vtrn.32 q12, q14
|
||||||
|
vtrn.32 q13, q15
|
||||||
|
vswp d17, d24
|
||||||
|
IDCT8_1D col
|
||||||
|
|
||||||
|
vld1.64 {d0}, [r0,:64], r2
|
||||||
|
vrshr.s16 q8, q8, #6
|
||||||
|
vld1.64 {d1}, [r0,:64], r2
|
||||||
|
vrshr.s16 q9, q9, #6
|
||||||
|
vld1.64 {d2}, [r0,:64], r2
|
||||||
|
vrshr.s16 q10, q10, #6
|
||||||
|
vld1.64 {d3}, [r0,:64], r2
|
||||||
|
vrshr.s16 q11, q11, #6
|
||||||
|
vld1.64 {d4}, [r0,:64], r2
|
||||||
|
vrshr.s16 q12, q12, #6
|
||||||
|
vld1.64 {d5}, [r0,:64], r2
|
||||||
|
vrshr.s16 q13, q13, #6
|
||||||
|
vld1.64 {d6}, [r0,:64], r2
|
||||||
|
vrshr.s16 q14, q14, #6
|
||||||
|
vld1.64 {d7}, [r0,:64], r2
|
||||||
|
vrshr.s16 q15, q15, #6
|
||||||
|
sub r0, r0, r2, lsl #3
|
||||||
|
|
||||||
|
vaddw.u8 q8, q8, d0
|
||||||
|
vaddw.u8 q9, q9, d1
|
||||||
|
vaddw.u8 q10, q10, d2
|
||||||
|
vqmovun.s16 d0, q8
|
||||||
|
vqmovun.s16 d1, q9
|
||||||
|
vqmovun.s16 d2, q10
|
||||||
|
vaddw.u8 q11, q11, d3
|
||||||
|
vst1.64 {d0}, [r0,:64], r2
|
||||||
|
vaddw.u8 q12, q12, d4
|
||||||
|
vst1.64 {d1}, [r0,:64], r2
|
||||||
|
vaddw.u8 q13, q13, d5
|
||||||
|
vst1.64 {d2}, [r0,:64], r2
|
||||||
|
vqmovun.s16 d3, q11
|
||||||
|
vqmovun.s16 d4, q12
|
||||||
|
vaddw.u8 q14, q14, d6
|
||||||
|
vaddw.u8 q15, q15, d7
|
||||||
|
vst1.64 {d3}, [r0,:64], r2
|
||||||
|
vqmovun.s16 d5, q13
|
||||||
|
vst1.64 {d4}, [r0,:64], r2
|
||||||
|
vqmovun.s16 d6, q14
|
||||||
|
vqmovun.s16 d7, q15
|
||||||
|
vst1.64 {d5}, [r0,:64], r2
|
||||||
|
vst1.64 {d6}, [r0,:64], r2
|
||||||
|
vst1.64 {d7}, [r0,:64], r2
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// void add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] )
// (prototype in dct.h; r0 = p_dst, r1 = dct)
// Calls add8x8_idct8_neon once per 8x8 quadrant in the order top-left,
// top-right, bottom-left, bottom-right. The callee sets r2 itself, reads
// its coefficients through r1 with post-increment and leaves r0 eight rows
// further down, so only r0 is repositioned between the calls.
// The return address is kept in ip, which the callee does not touch;
// the last call is a tail branch.
function add16x16_idct8_neon
|
||||||
|
mov ip, lr
|
||||||
|
bl X(add8x8_idct8_neon)
|
||||||
|
sub r0, r0, #8*FDEC_STRIDE-8
|
||||||
|
bl X(add8x8_idct8_neon)
|
||||||
|
sub r0, r0, #8
|
||||||
|
bl X(add8x8_idct8_neon)
|
||||||
|
sub r0, r0, #8*FDEC_STRIDE-8
|
||||||
|
mov lr, ip
|
||||||
|
b X(add8x8_idct8_neon)
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
|
||||||
|
function add8x8_idct_dc_neon
|
||||||
|
mov r2, #FDEC_STRIDE
|
||||||
|
vld1.64 {d16}, [r1,:64]
|
||||||
|
vrshr.s16 d16, d16, #6
|
||||||
|
vld1.64 {d0}, [r0,:64], r2
|
||||||
|
vmov.i16 q15, #0
|
||||||
|
vld1.64 {d1}, [r0,:64], r2
|
||||||
|
vld1.64 {d2}, [r0,:64], r2
|
||||||
|
vdup.16 d20, d16[0]
|
||||||
|
vld1.64 {d3}, [r0,:64], r2
|
||||||
|
vdup.16 d21, d16[1]
|
||||||
|
vld1.64 {d4}, [r0,:64], r2
|
||||||
|
vdup.16 d22, d16[2]
|
||||||
|
vld1.64 {d5}, [r0,:64], r2
|
||||||
|
vdup.16 d23, d16[3]
|
||||||
|
vld1.64 {d6}, [r0,:64], r2
|
||||||
|
vsub.s16 q12, q15, q10
|
||||||
|
vld1.64 {d7}, [r0,:64], r2
|
||||||
|
vsub.s16 q13, q15, q11
|
||||||
|
|
||||||
|
sub r0, r0, #8*FDEC_STRIDE
|
||||||
|
|
||||||
|
vqmovun.s16 d20, q10
|
||||||
|
vqmovun.s16 d22, q11
|
||||||
|
vqmovun.s16 d24, q12
|
||||||
|
vqmovun.s16 d26, q13
|
||||||
|
|
||||||
|
vmov d21, d20
|
||||||
|
vqadd.u8 q0, q0, q10
|
||||||
|
vmov d23, d22
|
||||||
|
vqadd.u8 q1, q1, q10
|
||||||
|
vmov d25, d24
|
||||||
|
vqadd.u8 q2, q2, q11
|
||||||
|
vmov d27, d26
|
||||||
|
vqadd.u8 q3, q3, q11
|
||||||
|
vqsub.u8 q0, q0, q12
|
||||||
|
vqsub.u8 q1, q1, q12
|
||||||
|
vqsub.u8 q2, q2, q13
|
||||||
|
|
||||||
|
vst1.64 {d0}, [r0,:64], r2
|
||||||
|
vqsub.u8 q3, q3, q13
|
||||||
|
vst1.64 {d1}, [r0,:64], r2
|
||||||
|
vst1.64 {d2}, [r0,:64], r2
|
||||||
|
vst1.64 {d3}, [r0,:64], r2
|
||||||
|
vst1.64 {d4}, [r0,:64], r2
|
||||||
|
vst1.64 {d5}, [r0,:64], r2
|
||||||
|
vst1.64 {d6}, [r0,:64], r2
|
||||||
|
vst1.64 {d7}, [r0,:64], r2
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// ADD16x4_IDCT_DC dc: add four DC values to four rows of 16 pixels.
// dc is a d register holding four signed 16-bit DC values; value i is
// applied to pixel columns 4*i .. 4*i+3 of every row.
// Expects: r0 = load pointer, r2 = store pointer, r3 = row stride,
// q15 = 0 (used to negate the DC values). r0 and r2 advance by four rows.
// The signed add is done with unsigned saturating byte ops: q2 ends up
// holding the DC values saturated to 0..255 and q3 the negated DC values
// saturated to 0..255; each row gets q2 added and q3 subtracted.
// Clobbers q2, q3 and q8-q13.
.macro ADD16x4_IDCT_DC dc
|
||||||
|
vld1.64 {d16-d17}, [r0,:128], r3
|
||||||
|
vld1.64 {d18-d19}, [r0,:128], r3
|
||||||
|
vdup.16 d4, \dc[0]
|
||||||
|
vdup.16 d5, \dc[1]
|
||||||
|
vld1.64 {d20-d21}, [r0,:128], r3
|
||||||
|
vdup.16 d6, \dc[2]
|
||||||
|
vdup.16 d7, \dc[3]
|
||||||
|
vld1.64 {d22-d23}, [r0,:128], r3
|
||||||
|
vsub.s16 q12, q15, q2
|
||||||
|
vsub.s16 q13, q15, q3
|
||||||
|
|
||||||
|
vqmovun.s16 d4, q2
|
||||||
|
vqmovun.s16 d5, q3
|
||||||
|
vqmovun.s16 d6, q12
|
||||||
|
vqmovun.s16 d7, q13
|
||||||
|
|
||||||
|
vqadd.u8 q8, q8, q2
|
||||||
|
vqadd.u8 q9, q9, q2
|
||||||
|
vqadd.u8 q10, q10, q2
|
||||||
|
vqadd.u8 q11, q11, q2
|
||||||
|
|
||||||
|
vqsub.u8 q8, q8, q3
|
||||||
|
vqsub.u8 q9, q9, q3
|
||||||
|
vqsub.u8 q10, q10, q3
|
||||||
|
vst1.64 {d16-d17}, [r2,:128], r3
|
||||||
|
vqsub.u8 q11, q11, q3
|
||||||
|
vst1.64 {d18-d19}, [r2,:128], r3
|
||||||
|
vst1.64 {d20-d21}, [r2,:128], r3
|
||||||
|
vst1.64 {d22-d23}, [r2,:128], r3
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// void add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] )
// (prototype in dct.h; r0 = p_dst, r1 = dct)
// Loads the 16 DC values, scales each with a rounding shift right by 6,
// then applies them to the 16x16 block four rows at a time with
// ADD16x4_IDCT_DC: d0 covers rows 0-3, d1 rows 4-7, d2 rows 8-11 and
// d3 rows 12-15. r0 is the load pointer and r2 (a copy of r0) the store
// pointer; r3 holds the row stride and q15 the zero constant the macro
// expects.
function add16x16_idct_dc_neon
|
||||||
|
mov r2, r0
|
||||||
|
mov r3, #FDEC_STRIDE
|
||||||
|
vmov.i16 q15, #0
|
||||||
|
|
||||||
|
vld1.64 {d0-d3}, [r1,:64]
|
||||||
|
vrshr.s16 q0, #6
|
||||||
|
vrshr.s16 q1, #6
|
||||||
|
|
||||||
|
ADD16x4_IDCT_DC d0
|
||||||
|
ADD16x4_IDCT_DC d1
|
||||||
|
ADD16x4_IDCT_DC d2
|
||||||
|
ADD16x4_IDCT_DC d3
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
function sub8x8_dct_dc_neon
|
||||||
|
mov r3, #FENC_STRIDE
|
||||||
|
mov ip, #FDEC_STRIDE
|
||||||
|
vld1.64 {d16}, [r1,:64], r3
|
||||||
|
vld1.64 {d17}, [r2,:64], ip
|
||||||
|
vsubl.u8 q8, d16, d17
|
||||||
|
vld1.64 {d18}, [r1,:64], r3
|
||||||
|
vld1.64 {d19}, [r2,:64], ip
|
||||||
|
vsubl.u8 q9, d18, d19
|
||||||
|
vld1.64 {d20}, [r1,:64], r3
|
||||||
|
vld1.64 {d21}, [r2,:64], ip
|
||||||
|
vsubl.u8 q10, d20, d21
|
||||||
|
vld1.64 {d22}, [r1,:64], r3
|
||||||
|
vadd.s16 q0, q8, q9
|
||||||
|
vld1.64 {d23}, [r2,:64], ip
|
||||||
|
vsubl.u8 q11, d22, d23
|
||||||
|
vld1.64 {d24}, [r1,:64], r3
|
||||||
|
vadd.s16 q0, q0, q10
|
||||||
|
vld1.64 {d25}, [r2,:64], ip
|
||||||
|
vsubl.u8 q12, d24, d25
|
||||||
|
vld1.64 {d26}, [r1,:64], r3
|
||||||
|
vadd.s16 q0, q0, q11
|
||||||
|
vld1.64 {d27}, [r2,:64], ip
|
||||||
|
vsubl.u8 q13, d26, d27
|
||||||
|
vld1.64 {d28}, [r1,:64], r3
|
||||||
|
vld1.64 {d29}, [r2,:64], ip
|
||||||
|
vsubl.u8 q14, d28, d29
|
||||||
|
vld1.64 {d30}, [r1,:64], r3
|
||||||
|
vadd.s16 q1, q12, q13
|
||||||
|
vld1.64 {d31}, [r2,:64], ip
|
||||||
|
vsubl.u8 q15, d30, d31
|
||||||
|
vadd.s16 q1, q1, q14
|
||||||
|
|
||||||
|
vadd.s16 d4, d0, d1
|
||||||
|
vadd.s16 q1, q1, q15
|
||||||
|
vsub.s16 d5, d0, d1
|
||||||
|
vadd.s16 d6, d2, d3
|
||||||
|
vsub.s16 d7, d2, d3
|
||||||
|
vadd.s16 q0, q2, q3
|
||||||
|
vsub.s16 q1, q2, q3
|
||||||
|
|
||||||
|
vpadd.s16 d0, d0, d2
|
||||||
|
vpadd.s16 d1, d1, d3
|
||||||
|
vpadd.s16 d0, d0, d1
|
||||||
|
vst1.64 {d0}, [r0,:64]
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
function sub8x16_dct_dc_neon
|
||||||
|
mov r3, #FENC_STRIDE
|
||||||
|
mov ip, #FDEC_STRIDE
|
||||||
|
vld1.64 {d16}, [r1,:64], r3
|
||||||
|
vld1.64 {d17}, [r2,:64], ip
|
||||||
|
vsubl.u8 q8, d16, d17
|
||||||
|
vld1.64 {d18}, [r1,:64], r3
|
||||||
|
vld1.64 {d19}, [r2,:64], ip
|
||||||
|
vsubl.u8 q9, d18, d19
|
||||||
|
vld1.64 {d20}, [r1,:64], r3
|
||||||
|
vld1.64 {d21}, [r2,:64], ip
|
||||||
|
vsubl.u8 q10, d20, d21
|
||||||
|
vld1.64 {d22}, [r1,:64], r3
|
||||||
|
vadd.s16 q0, q8, q9
|
||||||
|
vld1.64 {d23}, [r2,:64], ip
|
||||||
|
vsubl.u8 q11, d22, d23
|
||||||
|
vld1.64 {d24}, [r1,:64], r3
|
||||||
|
vadd.s16 q0, q0, q10
|
||||||
|
vld1.64 {d25}, [r2,:64], ip
|
||||||
|
vsubl.u8 q12, d24, d25
|
||||||
|
vld1.64 {d26}, [r1,:64], r3
|
||||||
|
vadd.s16 q0, q0, q11
|
||||||
|
vld1.64 {d27}, [r2,:64], ip
|
||||||
|
vsubl.u8 q13, d26, d27
|
||||||
|
vld1.64 {d28}, [r1,:64], r3
|
||||||
|
vld1.64 {d29}, [r2,:64], ip
|
||||||
|
vsubl.u8 q14, d28, d29
|
||||||
|
vld1.64 {d30}, [r1,:64], r3
|
||||||
|
vadd.s16 q1, q12, q13
|
||||||
|
vld1.64 {d31}, [r2,:64], ip
|
||||||
|
vsubl.u8 q15, d30, d31
|
||||||
|
|
||||||
|
vld1.64 {d16}, [r1,:64], r3
|
||||||
|
vadd.s16 q1, q1, q14
|
||||||
|
vld1.64 {d17}, [r2,:64], ip
|
||||||
|
vadd.s16 q1, q1, q15
|
||||||
|
vld1.64 {d18}, [r1,:64], r3
|
||||||
|
vsubl.u8 q8, d16, d17
|
||||||
|
vld1.64 {d19}, [r2,:64], ip
|
||||||
|
vsubl.u8 q9, d18, d19
|
||||||
|
vld1.64 {d20}, [r1,:64], r3
|
||||||
|
vld1.64 {d21}, [r2,:64], ip
|
||||||
|
vsubl.u8 q10, d20, d21
|
||||||
|
vld1.64 {d22}, [r1,:64], r3
|
||||||
|
vadd.s16 q2, q8, q9
|
||||||
|
vld1.64 {d23}, [r2,:64], ip
|
||||||
|
vsubl.u8 q11, d22, d23
|
||||||
|
vld1.64 {d24}, [r1,:64], r3
|
||||||
|
vadd.s16 q2, q2, q10
|
||||||
|
vld1.64 {d25}, [r2,:64], ip
|
||||||
|
vsubl.u8 q12, d24, d25
|
||||||
|
vld1.64 {d26}, [r1,:64], r3
|
||||||
|
vadd.s16 q2, q2, q11
|
||||||
|
vld1.64 {d27}, [r2,:64], ip
|
||||||
|
vsubl.u8 q13, d26, d27
|
||||||
|
vld1.64 {d28}, [r1,:64], r3
|
||||||
|
vld1.64 {d29}, [r2,:64], ip
|
||||||
|
vsubl.u8 q14, d28, d29
|
||||||
|
vld1.64 {d30}, [r1,:64], r3
|
||||||
|
vadd.s16 q3, q12, q13
|
||||||
|
vld1.64 {d31}, [r2,:64], ip
|
||||||
|
vsubl.u8 q15, d30, d31
|
||||||
|
vadd.s16 q3, q3, q14
|
||||||
|
|
||||||
|
vadd.s16 d16, d0, d1 @ b0
|
||||||
|
vadd.s16 q3, q3, q15
|
||||||
|
vsub.s16 d17, d0, d1 @ b4
|
||||||
|
vadd.s16 d18, d2, d3 @ b1
|
||||||
|
vsub.s16 d19, d2, d3 @ b5
|
||||||
|
vadd.s16 d20, d4, d5 @ b2
|
||||||
|
vsub.s16 d21, d4, d5 @ b6
|
||||||
|
vadd.s16 d22, d6, d7 @ b3
|
||||||
|
vsub.s16 d23, d6, d7 @ b7
|
||||||
|
vadd.s16 q0, q8, q9 @ b0 + b1, b4 + b5; a0, a2
|
||||||
|
vsub.s16 q1, q8, q9 @ b0 - b1, b4 - b5; a4, a6
|
||||||
|
vadd.s16 q2, q10, q11 @ b2 + b3, b6 + b7; a1, a3
|
||||||
|
vsub.s16 q3, q10, q11 @ b2 - b3, b6 - b7; a5, a7
|
||||||
|
|
||||||
|
vadd.s16 q8, q0, q2 @ a0 + a1, a2 + a3
|
||||||
|
vsub.s16 q9, q0, q2 @ a0 - a1, a2 - a3
|
||||||
|
vsub.s16 q10, q1, q3 @ a4 - a5, a6 - a7
|
||||||
|
vadd.s16 q11, q1, q3 @ a4 + a5, a6 + a7
|
||||||
|
|
||||||
|
vpadd.s16 d0, d16, d17
|
||||||
|
vpadd.s16 d1, d18, d19
|
||||||
|
vpadd.s16 d2, d20, d21
|
||||||
|
vpadd.s16 d3, d22, d23
|
||||||
|
vpadd.s16 d0, d0, d1
|
||||||
|
vpadd.s16 d1, d2, d3
|
||||||
|
vst1.64 {q0}, [r0,:64]
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
|
||||||
|
// void zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] )
// (prototype in dct.h; r0 = level, r1 = dct)
// Reorders the 16 coefficients at r1 into r0 with four byte-table lookups.
// d16-d19 hold the four index rows of scan4x4_frame; each vtbl.8 produces
// four output coefficients, with indices relative to the first register of
// its table list. Both buffers are accessed with a 128-bit alignment hint.
// r2 is clobbered by the table address load.
function zigzag_scan_4x4_frame_neon
|
||||||
|
movrel r2, scan4x4_frame
|
||||||
|
vld1.64 {d0-d3}, [r1,:128]
|
||||||
|
vld1.64 {d16-d19}, [r2,:128]
|
||||||
|
vtbl.8 d4, {d0-d1}, d16
|
||||||
|
vtbl.8 d5, {d1-d3}, d17
|
||||||
|
vtbl.8 d6, {d0-d2}, d18
|
||||||
|
vtbl.8 d7, {d2-d3}, d19
|
||||||
|
vst1.64 {d4-d7}, [r0,:128]
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
70
common/arm/dct.h
Normal file
70
common/arm/dct.h
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* dct.h: arm transform and zigzag
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_ARM_DCT_H
|
||||||
|
#define X264_ARM_DCT_H
|
||||||
|
|
||||||
|
#define x264_dct4x4dc_neon x264_template(dct4x4dc_neon)
|
||||||
|
void x264_dct4x4dc_neon( int16_t d[16] );
|
||||||
|
#define x264_idct4x4dc_neon x264_template(idct4x4dc_neon)
|
||||||
|
void x264_idct4x4dc_neon( int16_t d[16] );
|
||||||
|
|
||||||
|
#define x264_sub4x4_dct_neon x264_template(sub4x4_dct_neon)
|
||||||
|
void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
|
||||||
|
#define x264_sub8x8_dct_neon x264_template(sub8x8_dct_neon)
|
||||||
|
void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
|
||||||
|
#define x264_sub16x16_dct_neon x264_template(sub16x16_dct_neon)
|
||||||
|
void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
|
||||||
|
|
||||||
|
#define x264_add4x4_idct_neon x264_template(add4x4_idct_neon)
|
||||||
|
void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
|
||||||
|
#define x264_add8x8_idct_neon x264_template(add8x8_idct_neon)
|
||||||
|
void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
|
||||||
|
#define x264_add16x16_idct_neon x264_template(add16x16_idct_neon)
|
||||||
|
void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
|
||||||
|
|
||||||
|
#define x264_add8x8_idct_dc_neon x264_template(add8x8_idct_dc_neon)
|
||||||
|
void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
|
||||||
|
#define x264_add16x16_idct_dc_neon x264_template(add16x16_idct_dc_neon)
|
||||||
|
void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
|
||||||
|
#define x264_sub8x8_dct_dc_neon x264_template(sub8x8_dct_dc_neon)
|
||||||
|
void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
|
||||||
|
#define x264_sub8x16_dct_dc_neon x264_template(sub8x16_dct_dc_neon)
|
||||||
|
void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 );
|
||||||
|
|
||||||
|
#define x264_sub8x8_dct8_neon x264_template(sub8x8_dct8_neon)
|
||||||
|
void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
|
||||||
|
#define x264_sub16x16_dct8_neon x264_template(sub16x16_dct8_neon)
|
||||||
|
void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
|
||||||
|
|
||||||
|
#define x264_add8x8_idct8_neon x264_template(add8x8_idct8_neon)
|
||||||
|
void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
|
||||||
|
#define x264_add16x16_idct8_neon x264_template(add16x16_idct8_neon)
|
||||||
|
void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );
|
||||||
|
|
||||||
|
#define x264_zigzag_scan_4x4_frame_neon x264_template(zigzag_scan_4x4_frame_neon)
|
||||||
|
void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
|
||||||
|
|
||||||
|
#endif
|
||||||
795
common/arm/deblock-a.S
Normal file
795
common/arm/deblock-a.S
Normal file
@@ -0,0 +1,795 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* deblock-a.S: arm deblocking
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Mans Rullgard <mans@mansr.com>
|
||||||
|
* Martin Storsjo <martin@martin.st>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "asm.S"
|
||||||
|
|
||||||
|
@ h264_loop_filter_start: shared entry sequence for the tc0-based filters.
@ Loads the pointer stored at [sp] (the fifth argument, tc0, going by the
@ prototypes in deblock.h and assuming the standard ARM calling convention),
@ reads four bytes through it and broadcasts that word into both lanes of d24.
@ It then ANDs the four bytes together in the top byte of ip and returns to
@ the caller when the sign bit of that AND is set, i.e. when all four tc0
@ bytes are negative.
@ Clobbers: ip, d24, flags. Must run before anything is pushed, since it
@ reads [sp] directly and may return through lr.
.macro h264_loop_filter_start
|
||||||
|
ldr ip, [sp]
|
||||||
|
ldr ip, [ip]
|
||||||
|
vdup.32 d24, ip
|
||||||
|
and ip, ip, ip, lsl #16
|
||||||
|
ands ip, ip, ip, lsl #8
|
||||||
|
@ Test N only: a logical ANDS leaves V untouched, so an lt condition
@ (N != V) would depend on a stale V flag inherited from the caller.
bxmi lr
|
||||||
|
.endm
|
||||||
|
|
||||||
|
@ align_push_regs: save d8-d15 on a 16-byte aligned stack area.
@ ip = (sp & 15) + 32 is subtracted from sp, which leaves sp 16-byte aligned
@ with room for d12-d15; a further 32 bytes are reserved for d8-d11.
@ On exit ip still holds the first adjustment; align_pop_regs uses it to
@ restore sp, so ip must not be modified between the two macros.
.macro align_push_regs
|
||||||
|
and ip, sp, #15
|
||||||
|
add ip, ip, #32
|
||||||
|
sub sp, sp, ip
|
||||||
|
vst1.64 {d12-d15}, [sp,:128]
|
||||||
|
sub sp, sp, #32
|
||||||
|
vst1.64 {d8-d11}, [sp,:128]
|
||||||
|
.endm
|
||||||
|
|
||||||
|
@ align_pop_regs: inverse of align_push_regs.
@ Reloads d8-d11 (sp advances by the 32 bytes read), then reloads d12-d15 and
@ advances sp by ip, the adjustment computed by align_push_regs.
.macro align_pop_regs
|
||||||
|
vld1.64 {d8-d11}, [sp,:128]!
|
||||||
|
vld1.64 {d12-d15}, [sp,:128], ip
|
||||||
|
.endm
|
||||||
|
|
||||||
|
@ h264_loop_filter_luma: tc0-clipped luma edge filter on 16 pixels at once.
@ Inputs (per the inline comments): q10 = p2, q9 = p1, q8 = p0,
@ q0 = q0, q1 = q1, q2 = q2, r2 = alpha, r3 = beta, d24 = the tc0 word
@ prepared by h264_loop_filter_start.
@ Outputs: q8 = filtered p0, q0 = filtered q0, q4 = p1 row to store,
@ q5 = q1 row to store (both selected with vbsl against the originals).
@ Writes q2, q4-q7, q10-q12, q14 and q15 as scratch; since q4-q7 are
@ overwritten, the callers in this file wrap it in
@ align_push_regs / align_pop_regs.
.macro h264_loop_filter_luma
|
||||||
|
vdup.8 q11, r2 @ alpha
|
||||||
|
@ the vmovl/vsli steps below spread each tc0 byte over four byte lanes of q12
vmovl.u8 q12, d24
|
||||||
|
vabd.u8 q6, q8, q0 @ abs(p0 - q0)
|
||||||
|
vmovl.u16 q12, d24
|
||||||
|
vabd.u8 q14, q9, q8 @ abs(p1 - p0)
|
||||||
|
vsli.16 q12, q12, #8
|
||||||
|
vabd.u8 q15, q1, q0 @ abs(q1 - q0)
|
||||||
|
vsli.32 q12, q12, #16
|
||||||
|
vclt.u8 q6, q6, q11 @ < alpha
|
||||||
|
vdup.8 q11, r3 @ beta
|
||||||
|
@ q7 = lanes whose tc0 is negative; they are removed from the filter mask
vclt.s8 q7, q12, #0
|
||||||
|
vclt.u8 q14, q14, q11 @ < beta
|
||||||
|
vclt.u8 q15, q15, q11 @ < beta
|
||||||
|
vbic q6, q6, q7
|
||||||
|
vabd.u8 q4, q10, q8 @ abs(p2 - p0)
|
||||||
|
vand q6, q6, q14
|
||||||
|
vabd.u8 q5, q2, q0 @ abs(q2 - q0)
|
||||||
|
vclt.u8 q4, q4, q11 @ < beta
|
||||||
|
vand q6, q6, q15
|
||||||
|
vclt.u8 q5, q5, q11 @ < beta
|
||||||
|
vand q4, q4, q6
|
||||||
|
vand q5, q5, q6
|
||||||
|
vand q12, q12, q6
|
||||||
|
vrhadd.u8 q14, q8, q0
|
||||||
|
@ the masks in q4/q5 are all-ones (-1), so each subtraction adds 1 to the clip value
vsub.i8 q6, q12, q4
|
||||||
|
vqadd.u8 q7, q9, q12
|
||||||
|
vhadd.u8 q10, q10, q14
|
||||||
|
vsub.i8 q6, q6, q5
|
||||||
|
vhadd.u8 q14, q2, q14
|
||||||
|
vmin.u8 q7, q7, q10
|
||||||
|
vqsub.u8 q11, q9, q12
|
||||||
|
vqadd.u8 q2, q1, q12
|
||||||
|
vmax.u8 q7, q7, q11
|
||||||
|
vqsub.u8 q11, q1, q12
|
||||||
|
vmin.u8 q14, q2, q14
|
||||||
|
@ 16-bit delta: ((q0 - p0) * 4 + p1 - q1), rounded and shifted right by 3 below
vmovl.u8 q2, d0
|
||||||
|
vmax.u8 q14, q14, q11
|
||||||
|
vmovl.u8 q10, d1
|
||||||
|
vsubw.u8 q2, q2, d16
|
||||||
|
vsubw.u8 q10, q10, d17
|
||||||
|
vshl.i16 q2, q2, #2
|
||||||
|
vshl.i16 q10, q10, #2
|
||||||
|
vaddw.u8 q2, q2, d18
|
||||||
|
vaddw.u8 q10, q10, d19
|
||||||
|
vsubw.u8 q2, q2, d2
|
||||||
|
vsubw.u8 q10, q10, d3
|
||||||
|
vrshrn.i16 d4, q2, #3
|
||||||
|
vrshrn.i16 d5, q10, #3
|
||||||
|
vbsl q4, q7, q9
|
||||||
|
vbsl q5, q14, q1
|
||||||
|
vneg.s8 q7, q6
|
||||||
|
vmovl.u8 q14, d16
|
||||||
|
@ clamp the delta to [-q6, +q6]
vmin.s8 q2, q2, q6
|
||||||
|
vmovl.u8 q6, d17
|
||||||
|
vmax.s8 q2, q2, q7
|
||||||
|
vmovl.u8 q11, d0
|
||||||
|
vmovl.u8 q12, d1
|
||||||
|
@ p0 + delta and q0 - delta, narrowed with unsigned saturation
vaddw.s8 q14, q14, d4
|
||||||
|
vaddw.s8 q6, q6, d5
|
||||||
|
vsubw.s8 q11, q11, d4
|
||||||
|
vsubw.s8 q12, q12, d5
|
||||||
|
vqmovun.s16 d16, q14
|
||||||
|
vqmovun.s16 d17, q6
|
||||||
|
vqmovun.s16 d0, q11
|
||||||
|
vqmovun.s16 d1, q12
|
||||||
|
.endm
|
||||||
|
|
||||||
|
@ deblock_v_luma_neon( pix, stride, alpha, beta, tc0 ) per deblock.h.
@ r0 = pix, r1 = stride, r2 = alpha, r3 = beta; tc0 is read from the stack
@ by h264_loop_filter_start, which may also return early.
@ Filters 16 pixels across a horizontal edge: three rows are loaded starting
@ at pix (q0, q1, q2) and three rows starting at pix - 3*stride (p2, p1, p0).
@ Four rows are written back, starting at pix - 2*stride.
@ All row accesses carry a 128-bit alignment hint.
function deblock_v_luma_neon
|
||||||
|
h264_loop_filter_start
|
||||||
|
|
||||||
|
vld1.64 {d0, d1}, [r0,:128], r1
|
||||||
|
vld1.64 {d2, d3}, [r0,:128], r1
|
||||||
|
vld1.64 {d4, d5}, [r0,:128], r1
|
||||||
|
@ step back 4 + 2 = 6 rows, from pix + 3*stride to pix - 3*stride
sub r0, r0, r1, lsl #2
|
||||||
|
sub r0, r0, r1, lsl #1
|
||||||
|
vld1.64 {d20,d21}, [r0,:128], r1
|
||||||
|
vld1.64 {d18,d19}, [r0,:128], r1
|
||||||
|
vld1.64 {d16,d17}, [r0,:128], r1
|
||||||
|
|
||||||
|
align_push_regs
|
||||||
|
|
||||||
|
h264_loop_filter_luma
|
||||||
|
|
||||||
|
@ r0 is back at pix here; rewind two rows and store q4, q8, q0, q5
sub r0, r0, r1, lsl #1
|
||||||
|
vst1.64 {d8, d9}, [r0,:128], r1
|
||||||
|
vst1.64 {d16,d17}, [r0,:128], r1
|
||||||
|
vst1.64 {d0, d1}, [r0,:128], r1
|
||||||
|
vst1.64 {d10,d11}, [r0,:128]
|
||||||
|
|
||||||
|
align_pop_regs
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
@ deblock_h_luma_neon( pix, stride, alpha, beta, tc0 ) per deblock.h.
@ Vertical-edge variant: reads 8 bytes from each of 16 rows starting at
@ pix - 4, transposes them so that the shared luma macro sees the same
@ register layout as in the vertical filter, then transposes the four result
@ registers back and writes 4 bytes per row at pix - 2.
@ TRANSPOSE8x8 and TRANSPOSE4x4 are macros defined outside this section.
function deblock_h_luma_neon
|
||||||
|
h264_loop_filter_start
|
||||||
|
|
||||||
|
sub r0, r0, #4
|
||||||
|
vld1.64 {d6}, [r0], r1
|
||||||
|
vld1.64 {d20}, [r0], r1
|
||||||
|
vld1.64 {d18}, [r0], r1
|
||||||
|
vld1.64 {d16}, [r0], r1
|
||||||
|
vld1.64 {d0}, [r0], r1
|
||||||
|
vld1.64 {d2}, [r0], r1
|
||||||
|
vld1.64 {d4}, [r0], r1
|
||||||
|
vld1.64 {d26}, [r0], r1
|
||||||
|
vld1.64 {d7}, [r0], r1
|
||||||
|
vld1.64 {d21}, [r0], r1
|
||||||
|
vld1.64 {d19}, [r0], r1
|
||||||
|
vld1.64 {d17}, [r0], r1
|
||||||
|
vld1.64 {d1}, [r0], r1
|
||||||
|
vld1.64 {d3}, [r0], r1
|
||||||
|
vld1.64 {d5}, [r0], r1
|
||||||
|
vld1.64 {d27}, [r0], r1
|
||||||
|
|
||||||
|
TRANSPOSE8x8 q3, q10, q9, q8, q0, q1, q2, q13
|
||||||
|
|
||||||
|
align_push_regs
|
||||||
|
|
||||||
|
h264_loop_filter_luma
|
||||||
|
|
||||||
|
TRANSPOSE4x4 q4, q8, q0, q5
|
||||||
|
|
||||||
|
@ rewind the 16 rows consumed by the loads and move 2 bytes right (pix - 2)
sub r0, r0, r1, lsl #4
|
||||||
|
add r0, r0, #2
|
||||||
|
vst1.32 {d8[0]}, [r0], r1
|
||||||
|
vst1.32 {d16[0]}, [r0], r1
|
||||||
|
vst1.32 {d0[0]}, [r0], r1
|
||||||
|
vst1.32 {d10[0]}, [r0], r1
|
||||||
|
vst1.32 {d8[1]}, [r0], r1
|
||||||
|
vst1.32 {d16[1]}, [r0], r1
|
||||||
|
vst1.32 {d0[1]}, [r0], r1
|
||||||
|
vst1.32 {d10[1]}, [r0], r1
|
||||||
|
vst1.32 {d9[0]}, [r0], r1
|
||||||
|
vst1.32 {d17[0]}, [r0], r1
|
||||||
|
vst1.32 {d1[0]}, [r0], r1
|
||||||
|
vst1.32 {d11[0]}, [r0], r1
|
||||||
|
vst1.32 {d9[1]}, [r0], r1
|
||||||
|
vst1.32 {d17[1]}, [r0], r1
|
||||||
|
vst1.32 {d1[1]}, [r0], r1
|
||||||
|
vst1.32 {d11[1]}, [r0], r1
|
||||||
|
|
||||||
|
align_pop_regs
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
@ h264_loop_filter_luma_intra: intra luma edge filter on 16 pixels at once.
@ Inputs (per the inline comments): q11 = p3, q10 = p2, q9 = p1, q8 = p0,
@ q0 = q0, q1 = q1, q2 = q2, q3 = q3, r2 = alpha, r3 = beta.
@ Outputs: updated q10, q9, q8 (p side) and q0, q1, q2 (q side).
@ Side effects a caller must allow for:
@  - r2 and lr are overwritten (vmov r2, lr, d28), so lr must be saved first;
@  - if no lane passes the if_1 test the macro branches forward to a local
@    label 9 that the including function has to provide;
@  - 32 bytes below sp are used as scratch with a 128-bit alignment hint and
@    released again by the final post-incrementing load;
@  - q4-q7 and q12-q15 are used as scratch.
.macro h264_loop_filter_luma_intra
|
||||||
|
vdup.8 q14, r2 @ alpha
|
||||||
|
vabd.u8 q4, q8, q0 @ abs(p0 - q0)
|
||||||
|
vabd.u8 q5, q9, q8 @ abs(p1 - p0)
|
||||||
|
vabd.u8 q6, q1, q0 @ abs(q1 - q0)
|
||||||
|
vdup.8 q15, r3 @ beta
|
||||||
|
vmov.u8 q13, #2
|
||||||
|
vclt.u8 q7, q4, q14 @ < alpha
|
||||||
|
vshr.u8 q14, q14, #2 @ alpha >> 2
|
||||||
|
vclt.u8 q5, q5, q15 @ < beta
|
||||||
|
vadd.u8 q14, q14, q13 @ (alpha >> 2) + 2
|
||||||
|
vand q7, q7, q5
|
||||||
|
vclt.u8 q6, q6, q15 @ < beta
|
||||||
|
vclt.u8 q13, q4, q14 @ < (alpha >> 2) + 2 if_2
|
||||||
|
vand q12, q7, q6 @ if_1
|
||||||
|
@ compress the if_1 mask to 64 bits and leave early when it is all zero
vshrn.u16 d28, q12, #4
|
||||||
|
vmov r2, lr, d28
|
||||||
|
orrs r2, r2, lr
|
||||||
|
beq 9f
|
||||||
|
|
||||||
|
@ spill if_1 (q12) and if_2 (q13) so both registers can be reused
sub sp, sp, #32
|
||||||
|
vst1.8 {q12-q13}, [sp,:128]
|
||||||
|
|
||||||
|
vshll.u8 q4, d18, #1 @ 2*p1
|
||||||
|
vshll.u8 q5, d19, #1
|
||||||
|
vaddw.u8 q4, q4, d16 @ 2*p1 + p0
|
||||||
|
vaddw.u8 q5, q5, d17
|
||||||
|
vaddw.u8 q4, q4, d2 @ 2*p1 + p0 + q1
|
||||||
|
vaddw.u8 q5, q5, d3
|
||||||
|
vrshrn.u16 d24, q4, #2
|
||||||
|
vrshrn.u16 d25, q5, #2
|
||||||
|
|
||||||
|
vaddl.u8 q6, d20, d16 @ p2 + p0
|
||||||
|
vaddl.u8 q7, d21, d17
|
||||||
|
vaddw.u8 q6, q6, d0 @ p2 + p0 + q0
|
||||||
|
vaddw.u8 q7, q7, d1
|
||||||
|
vadd.u16 q4, q4, q6 @ p2 + 2*p1 + 2*p0 + q0 + q1
|
||||||
|
vadd.u16 q5, q5, q7
|
||||||
|
vaddw.u8 q4, q4, d0 @ p2 + 2*p1 + 2*p0 + 2*q0 + q1
|
||||||
|
vaddw.u8 q5, q5, d1
|
||||||
|
vrshrn.u16 d26, q4, #3 @ p0'_2
|
||||||
|
vrshrn.u16 d27, q5, #3
|
||||||
|
vaddw.u8 q6, q6, d18 @ p2 + p1 + p0 + q0
|
||||||
|
vaddw.u8 q7, q7, d19
|
||||||
|
vrshrn.u16 d28, q6, #2 @ p1'_2
|
||||||
|
vrshrn.u16 d29, q7, #2
|
||||||
|
vaddl.u8 q4, d22, d20 @ p3 + p2
|
||||||
|
vaddl.u8 q5, d23, d21
|
||||||
|
vshl.u16 q4, q4, #1 @ 2*p3 + 2*p2
|
||||||
|
vshl.u16 q5, q5, #1
|
||||||
|
vadd.u16 q4, q4, q6 @ 2*p3 + 3*p2 + p1 + p0 + q0
|
||||||
|
vadd.u16 q5, q5, q7
|
||||||
|
vrshrn.u16 d30, q4, #3 @ p2'_2
|
||||||
|
vrshrn.u16 d31, q5, #3
|
||||||
|
|
||||||
|
vdup.8 q4, r3 @ beta
|
||||||
|
vabd.u8 q5, q10, q8 @ abs(p2 - p0)
|
||||||
|
vld1.8 {q6-q7}, [sp,:128] @ if_1, if_2
|
||||||
|
vclt.u8 q5, q5, q4 @ < beta if_3
|
||||||
|
|
||||||
|
vand q7, q7, q5 @ if_2 && if_3
|
||||||
|
vmvn q4, q7
|
||||||
|
vand q7, q7, q6 @ if_1 && if_2 && if_3
|
||||||
|
vand q6, q4, q6 @ if_1 && !(if_2 && if_3)
|
||||||
|
|
||||||
|
@ copy p0 to q15 so it can be clobbered
|
||||||
|
vbit q10, q15, q7
|
||||||
|
vmov q15, q8
|
||||||
|
vbit q8, q12, q6
|
||||||
|
|
||||||
|
@ wait for q9 to clobber
|
||||||
|
vshll.u8 q4, d2, #1 @ 2*q1
|
||||||
|
vshll.u8 q5, d3, #1
|
||||||
|
|
||||||
|
@ NOTE(review): same operands as the vbit three instructions earlier and
@ none of q8, q12, q6 is written in between, so this repeat looks redundant.
vbit q8, q12, q6
|
||||||
|
|
||||||
|
vaddw.u8 q4, q4, d0 @ 2*q1 + q0
|
||||||
|
vaddw.u8 q5, q5, d1
|
||||||
|
|
||||||
|
vbit q8, q13, q7
|
||||||
|
|
||||||
|
vaddw.u8 q4, q4, d18 @ 2*q1 + q0 + p1
|
||||||
|
vaddw.u8 q5, q5, d19
|
||||||
|
|
||||||
|
vbit q9, q14, q7
|
||||||
|
|
||||||
|
vrshrn.u16 d24, q4, #2
|
||||||
|
vrshrn.u16 d25, q5, #2
|
||||||
|
|
||||||
|
@ q side: d30/d31 (q15) hold the unfiltered p0 saved above
vaddl.u8 q6, d4, d0 @ q2 + q0
|
||||||
|
vaddl.u8 q7, d5, d1
|
||||||
|
vaddw.u8 q6, q6, d30 @ q2 + q0 + p0
|
||||||
|
vaddw.u8 q7, q7, d31
|
||||||
|
vadd.u16 q4, q4, q6 @ q2 + 2*q1 + 2*q0 + p0 + p1
|
||||||
|
vadd.u16 q5, q5, q7
|
||||||
|
vaddw.u8 q4, q4, d30 @ q2 + 2*q1 + 2*q0 + 2*p0 + p1
|
||||||
|
vaddw.u8 q5, q5, d31
|
||||||
|
vrshrn.u16 d26, q4, #3 @ q0'_2
|
||||||
|
vrshrn.u16 d27, q5, #3
|
||||||
|
vaddw.u8 q6, q6, d2 @ q2 + q1 + q0 + p0
|
||||||
|
vaddw.u8 q7, q7, d3
|
||||||
|
vrshrn.u16 d28, q6, #2 @ q1'_2
|
||||||
|
vrshrn.u16 d29, q7, #2
|
||||||
|
vaddl.u8 q4, d6, d4 @ q3 + q2
|
||||||
|
vaddl.u8 q5, d7, d5
|
||||||
|
vshl.u16 q4, q4, #1 @ 2*q3 + 2*q2
|
||||||
|
vshl.u16 q5, q5, #1
|
||||||
|
vadd.u16 q4, q4, q6 @ 2*q3 + 3*q2 + q1 + q0 + p0
|
||||||
|
vadd.u16 q5, q5, q7
|
||||||
|
vrshrn.u16 d30, q4, #3 @ q2'_2
|
||||||
|
vrshrn.u16 d31, q5, #3
|
||||||
|
|
||||||
|
vdup.8 q4, r3 @ beta
|
||||||
|
vabd.u8 q5, q2, q0 @ abs(q2 - q0)
|
||||||
|
vld1.8 {q6-q7}, [sp,:128]! @ if_1, if_2
|
||||||
|
vclt.u8 q5, q5, q4 @ < beta if_4
|
||||||
|
|
||||||
|
vand q7, q7, q5 @ if_2 && if_4
|
||||||
|
vmvn q4, q7
|
||||||
|
vand q7, q6, q7 @ if_1 && if_2 && if_4
|
||||||
|
vand q6, q6, q4 @ if_1 && !(if_2 && if_4)
|
||||||
|
|
||||||
|
vbit q0, q12, q6
|
||||||
|
vbit q1, q14, q7
|
||||||
|
vbit q0, q13, q7
|
||||||
|
vbit q2, q15, q7
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
@ deblock_v_luma_intra_neon( pix, stride, alpha, beta ) per deblock.h.
@ r0 = pix, r1 = stride, r2 = alpha, r3 = beta.
@ Loads four rows starting at pix (q0..q3) and four rows starting at
@ pix - 4*stride (p3..p0), runs the intra luma macro and writes six rows
@ starting at pix - 3*stride. lr is pushed because the macro overwrites it.
@ Label 9 is the target of the macro's early exit; it skips the stores.
function deblock_v_luma_intra_neon
|
||||||
|
push {lr}
|
||||||
|
vld1.64 {d0, d1}, [r0,:128], r1
|
||||||
|
vld1.64 {d2, d3}, [r0,:128], r1
|
||||||
|
vld1.64 {d4, d5}, [r0,:128], r1
|
||||||
|
vld1.64 {d6, d7}, [r0,:128], r1
|
||||||
|
@ from pix + 4*stride back to pix - 4*stride
sub r0, r0, r1, lsl #3
|
||||||
|
vld1.64 {d22,d23}, [r0,:128], r1
|
||||||
|
vld1.64 {d20,d21}, [r0,:128], r1
|
||||||
|
vld1.64 {d18,d19}, [r0,:128], r1
|
||||||
|
vld1.64 {d16,d17}, [r0,:128]
|
||||||
|
|
||||||
|
align_push_regs
|
||||||
|
|
||||||
|
h264_loop_filter_luma_intra
|
||||||
|
|
||||||
|
@ r0 is at pix - stride (last load had no post-increment); go to pix - 3*stride
sub r0, r0, r1, lsl #1
|
||||||
|
vst1.64 {d20,d21}, [r0,:128], r1
|
||||||
|
vst1.64 {d18,d19}, [r0,:128], r1
|
||||||
|
vst1.64 {d16,d17}, [r0,:128], r1
|
||||||
|
vst1.64 {d0, d1}, [r0,:128], r1
|
||||||
|
vst1.64 {d2, d3}, [r0,:128], r1
|
||||||
|
vst1.64 {d4, d5}, [r0,:128]
|
||||||
|
9:
|
||||||
|
align_pop_regs
|
||||||
|
pop {pc}
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
@ deblock_h_luma_intra_neon( pix, stride, alpha, beta ) per deblock.h.
@ Vertical-edge variant of the intra luma filter: reads 8 bytes from each of
@ 16 rows starting at pix - 4, transposes, filters, transposes back and
@ rewrites all 16 rows in full. lr is pushed because the macro overwrites
@ it. Label 9 is the target of the macro's early exit; it skips the second
@ transpose and all stores. TRANSPOSE8x8 is defined outside this section.
function deblock_h_luma_intra_neon
|
||||||
|
push {lr}
|
||||||
|
sub r0, r0, #4
|
||||||
|
vld1.64 {d22}, [r0], r1
|
||||||
|
vld1.64 {d20}, [r0], r1
|
||||||
|
vld1.64 {d18}, [r0], r1
|
||||||
|
vld1.64 {d16}, [r0], r1
|
||||||
|
vld1.64 {d0}, [r0], r1
|
||||||
|
vld1.64 {d2}, [r0], r1
|
||||||
|
vld1.64 {d4}, [r0], r1
|
||||||
|
vld1.64 {d6}, [r0], r1
|
||||||
|
vld1.64 {d23}, [r0], r1
|
||||||
|
vld1.64 {d21}, [r0], r1
|
||||||
|
vld1.64 {d19}, [r0], r1
|
||||||
|
vld1.64 {d17}, [r0], r1
|
||||||
|
vld1.64 {d1}, [r0], r1
|
||||||
|
vld1.64 {d3}, [r0], r1
|
||||||
|
vld1.64 {d5}, [r0], r1
|
||||||
|
vld1.64 {d7}, [r0], r1
|
||||||
|
|
||||||
|
TRANSPOSE8x8 q11, q10, q9, q8, q0, q1, q2, q3
|
||||||
|
|
||||||
|
align_push_regs
|
||||||
|
|
||||||
|
h264_loop_filter_luma_intra
|
||||||
|
|
||||||
|
TRANSPOSE8x8 q11, q10, q9, q8, q0, q1, q2, q3
|
||||||
|
|
||||||
|
@ rewind the 16 rows consumed by the loads
sub r0, r0, r1, lsl #4
|
||||||
|
vst1.64 {d22}, [r0], r1
|
||||||
|
vst1.64 {d20}, [r0], r1
|
||||||
|
vst1.64 {d18}, [r0], r1
|
||||||
|
vst1.64 {d16}, [r0], r1
|
||||||
|
vst1.64 {d0}, [r0], r1
|
||||||
|
vst1.64 {d2}, [r0], r1
|
||||||
|
vst1.64 {d4}, [r0], r1
|
||||||
|
vst1.64 {d6}, [r0], r1
|
||||||
|
vst1.64 {d23}, [r0], r1
|
||||||
|
vst1.64 {d21}, [r0], r1
|
||||||
|
vst1.64 {d19}, [r0], r1
|
||||||
|
vst1.64 {d17}, [r0], r1
|
||||||
|
vst1.64 {d1}, [r0], r1
|
||||||
|
vst1.64 {d3}, [r0], r1
|
||||||
|
vst1.64 {d5}, [r0], r1
|
||||||
|
vst1.64 {d7}, [r0], r1
|
||||||
|
9:
|
||||||
|
align_pop_regs
|
||||||
|
pop {pc}
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// h264_loop_filter_chroma: tc0-clipped chroma edge filter on 16 bytes.
// Inputs (per the inline comments): q9 = p1, q8 = p0, q0 = q0, q1 = q1,
// r2 = alpha, r3 = beta, d24 = the tc0 word from h264_loop_filter_start.
// Outputs: q8 = filtered p0, q0 = filtered q0.
// Writes q2, q3 and q10-q15 as scratch; q4-q7 are not touched, so callers
// in this file use it without align_push_regs.
.macro h264_loop_filter_chroma
|
||||||
|
vdup.8 q11, r2 // alpha
|
||||||
|
vmovl.u8 q12, d24
|
||||||
|
vabd.u8 q13, q8, q0 // abs(p0 - q0)
|
||||||
|
vabd.u8 q14, q9, q8 // abs(p1 - p0)
|
||||||
|
// 16-bit delta: ((q0 - p0) * 4 + p1 - q1), rounded and shifted right by 3 below
vsubl.u8 q2, d0, d16
|
||||||
|
vsubl.u8 q3, d1, d17
|
||||||
|
vsli.16 q12, q12, #8
|
||||||
|
vshl.i16 q2, q2, #2
|
||||||
|
vshl.i16 q3, q3, #2
|
||||||
|
vabd.u8 q15, q1, q0 // abs(q1 - q0)
|
||||||
|
vmovl.u8 q12, d24
|
||||||
|
vaddw.u8 q2, q2, d18
|
||||||
|
vaddw.u8 q3, q3, d19
|
||||||
|
vclt.u8 q13, q13, q11 // < alpha
|
||||||
|
vsubw.u8 q2, q2, d2
|
||||||
|
vsubw.u8 q3, q3, d3
|
||||||
|
vsli.16 q12, q12, #8
|
||||||
|
vdup.8 q11, r3 // beta
|
||||||
|
// q10 = lanes whose tc0 is negative; they are removed from the filter mask
vclt.s8 q10, q12, #0
|
||||||
|
vrshrn.i16 d4, q2, #3
|
||||||
|
vrshrn.i16 d5, q3, #3
|
||||||
|
vclt.u8 q14, q14, q11 // < beta
|
||||||
|
vbic q13, q13, q10
|
||||||
|
vclt.u8 q15, q15, q11 // < beta
|
||||||
|
vand q13, q13, q14
|
||||||
|
vneg.s8 q10, q12
|
||||||
|
vand q13, q13, q15
|
||||||
|
// clamp the delta to [-tc, +tc]; masked-out lanes are zeroed in between
vmin.s8 q2, q2, q12
|
||||||
|
vmovl.u8 q14, d16
|
||||||
|
vand q2, q2, q13
|
||||||
|
vmovl.u8 q15, d17
|
||||||
|
vmax.s8 q2, q2, q10
|
||||||
|
vmovl.u8 q11, d0
|
||||||
|
vmovl.u8 q12, d1
|
||||||
|
vaddw.s8 q14, q14, d4
|
||||||
|
vaddw.s8 q15, q15, d5
|
||||||
|
vsubw.s8 q11, q11, d4
|
||||||
|
vsubw.s8 q12, q12, d5
|
||||||
|
vqmovun.s16 d16, q14
|
||||||
|
vqmovun.s16 d17, q15
|
||||||
|
vqmovun.s16 d0, q11
|
||||||
|
vqmovun.s16 d1, q12
|
||||||
|
.endm
|
||||||
|
|
||||||
|
@ deblock_v_chroma_neon( pix, stride, alpha, beta, tc0 ) per deblock.h.
@ Loads four 16-byte rows starting at pix - 2*stride (p1, p0, q0, q1),
@ runs the chroma macro and writes the two middle rows back
@ (pix - stride and pix). All accesses carry a 128-bit alignment hint.
function deblock_v_chroma_neon
|
||||||
|
h264_loop_filter_start
|
||||||
|
|
||||||
|
sub r0, r0, r1, lsl #1
|
||||||
|
vld1.8 {d18,d19}, [r0,:128], r1
|
||||||
|
vld1.8 {d16,d17}, [r0,:128], r1
|
||||||
|
vld1.8 {d0, d1}, [r0,:128], r1
|
||||||
|
vld1.8 {d2, d3}, [r0,:128]
|
||||||
|
|
||||||
|
h264_loop_filter_chroma
|
||||||
|
|
||||||
|
@ r0 is at pix + stride (last load had no post-increment); go to pix - stride
sub r0, r0, r1, lsl #1
|
||||||
|
vst1.8 {d16,d17}, [r0,:128], r1
|
||||||
|
vst1.8 {d0, d1}, [r0,:128], r1
|
||||||
|
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
@ deblock_h_chroma_neon( pix, stride, alpha, beta, tc0 ) per deblock.h.
@ Vertical-edge chroma filter over 8 rows: reads 8 bytes per row starting at
@ pix - 4, transposes in 16-bit units, filters, and writes 4 bytes per row
@ at pix - 2. On return r0 points 8 rows below the first stored row.
@ The inner label deblock_h_chroma is a second entry point used by
@ deblock_h_chroma_422_neon; it expects r0 already at pix - 4, d24 already
@ loaded with tc0, and returns through lr.
@ TRANSPOSE4x4_16 is defined outside this section.
function deblock_h_chroma_neon
|
||||||
|
h264_loop_filter_start
|
||||||
|
|
||||||
|
sub r0, r0, #4
|
||||||
|
deblock_h_chroma:
|
||||||
|
vld1.8 {d18}, [r0], r1
|
||||||
|
vld1.8 {d16}, [r0], r1
|
||||||
|
vld1.8 {d0}, [r0], r1
|
||||||
|
vld1.8 {d2}, [r0], r1
|
||||||
|
vld1.8 {d19}, [r0], r1
|
||||||
|
vld1.8 {d17}, [r0], r1
|
||||||
|
vld1.8 {d1}, [r0], r1
|
||||||
|
vld1.8 {d3}, [r0], r1
|
||||||
|
|
||||||
|
TRANSPOSE4x4_16 q9, q8, q0, q1
|
||||||
|
|
||||||
|
h264_loop_filter_chroma
|
||||||
|
|
||||||
|
vtrn.16 q8, q0
|
||||||
|
|
||||||
|
@ rewind the 8 rows consumed by the loads and move 2 bytes right (pix - 2)
sub r0, r0, r1, lsl #3
|
||||||
|
add r0, r0, #2
|
||||||
|
vst1.32 {d16[0]}, [r0], r1
|
||||||
|
vst1.32 {d0[0]}, [r0], r1
|
||||||
|
vst1.32 {d16[1]}, [r0], r1
|
||||||
|
vst1.32 {d0[1]}, [r0], r1
|
||||||
|
vst1.32 {d17[0]}, [r0], r1
|
||||||
|
vst1.32 {d1[0]}, [r0], r1
|
||||||
|
vst1.32 {d17[1]}, [r0], r1
|
||||||
|
vst1.32 {d1[1]}, [r0], r1
|
||||||
|
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
@ deblock_h_chroma_422_neon( pix, stride, alpha, beta, tc0 ) per deblock.h.
@ Covers 16 rows by running the deblock_h_chroma body (inside
@ deblock_h_chroma_neon) twice with the stride doubled: the first pass
@ starts at the first row, the second pass one original stride lower, so
@ the two passes handle alternating rows.
@ d24 is reloaded from tc0 before the second pass because the filter macro
@ overwrites q12.
function deblock_h_chroma_422_neon
|
||||||
|
h264_loop_filter_start
|
||||||
|
push {lr}
|
||||||
|
sub r0, r0, #4
|
||||||
|
@ r1 = 2 * stride for both passes
add r1, r1, r1
|
||||||
|
bl deblock_h_chroma
|
||||||
|
@ the tc0 pointer now sits 4 bytes higher because lr was pushed
ldr ip, [sp, #4]
|
||||||
|
ldr ip, [ip]
|
||||||
|
vdup.32 d24, ip
|
||||||
|
@ undo the first pass (8 doubled rows, +2 bytes) and step down one original stride
sub r0, r0, r1, lsl #3
|
||||||
|
add r0, r0, r1, lsr #1
|
||||||
|
sub r0, r0, #2
|
||||||
|
@ restore lr so the tail-called second pass returns straight to the caller
pop {lr}
|
||||||
|
b deblock_h_chroma
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
@ h264_loop_filter_chroma8: 8-byte (d register) form of the tc0-clipped
@ chroma filter.
@ Inputs (per the inline comments): d18 = p1, d16 = p0, d0 = q0, d2 = q1,
@ r2 = alpha, r3 = beta, d24 = the tc0 word from h264_loop_filter_start.
@ Outputs: d16 = filtered p0, d0 = filtered q0.
@ Writes q2, d20, q11, q12, d26, q14 and d30 as scratch.
.macro h264_loop_filter_chroma8
|
||||||
|
vdup.8 d22, r2 @ alpha
|
||||||
|
vmovl.u8 q12, d24
|
||||||
|
vabd.u8 d26, d16, d0 @ abs(p0 - q0)
|
||||||
|
vabd.u8 d28, d18, d16 @ abs(p1 - p0)
|
||||||
|
@ 16-bit delta: ((q0 - p0) * 4 + p1 - q1), rounded and shifted right by 3 below
vsubl.u8 q2, d0, d16
|
||||||
|
vsli.16 d24, d24, #8
|
||||||
|
vshl.i16 q2, q2, #2
|
||||||
|
vabd.u8 d30, d2, d0 @ abs(q1 - q0)
|
||||||
|
vaddw.u8 q2, q2, d18
|
||||||
|
vclt.u8 d26, d26, d22 @ < alpha
|
||||||
|
vsubw.u8 q2, q2, d2
|
||||||
|
vdup.8 d22, r3 @ beta
|
||||||
|
@ d20 = lanes whose tc0 is negative; they are removed from the filter mask
vclt.s8 d20, d24, #0
|
||||||
|
vrshrn.i16 d4, q2, #3
|
||||||
|
vclt.u8 d28, d28, d22 @ < beta
|
||||||
|
vbic d26, d26, d20
|
||||||
|
vclt.u8 d30, d30, d22 @ < beta
|
||||||
|
vand d26, d26, d28
|
||||||
|
vneg.s8 d20, d24
|
||||||
|
vand d26, d26, d30
|
||||||
|
@ clamp the delta to [-tc, +tc]; masked-out lanes are zeroed in between
vmin.s8 d4, d4, d24
|
||||||
|
vmovl.u8 q14, d16
|
||||||
|
vand d4, d4, d26
|
||||||
|
vmax.s8 d4, d4, d20
|
||||||
|
vmovl.u8 q11, d0
|
||||||
|
vaddw.s8 q14, q14, d4
|
||||||
|
vsubw.s8 q11, q11, d4
|
||||||
|
vqmovun.s16 d16, q14
|
||||||
|
vqmovun.s16 d0, q11
|
||||||
|
.endm
|
||||||
|
|
||||||
|
@ deblock_h_chroma_mbaff_neon( pix, stride, alpha, beta, tc0 ) per deblock.h.
@ Four-row version of the vertical-edge chroma filter: reads 8 bytes from
@ each of 4 rows starting at pix - 4, filters with the 8-byte macro and
@ writes 4 bytes per row at pix - 2.
@ TRANSPOSE4x4_16 is defined outside this section.
function deblock_h_chroma_mbaff_neon
|
||||||
|
h264_loop_filter_start
|
||||||
|
|
||||||
|
sub r0, r0, #4
|
||||||
|
vld1.8 {d18}, [r0], r1
|
||||||
|
vld1.8 {d16}, [r0], r1
|
||||||
|
vld1.8 {d0}, [r0], r1
|
||||||
|
vld1.8 {d2}, [r0], r1
|
||||||
|
|
||||||
|
TRANSPOSE4x4_16 d18, d16, d0, d2
|
||||||
|
|
||||||
|
h264_loop_filter_chroma8
|
||||||
|
|
||||||
|
vtrn.16 d16, d0
|
||||||
|
|
||||||
|
@ rewind the 4 rows consumed by the loads and move 2 bytes right (pix - 2)
sub r0, r0, r1, lsl #2
|
||||||
|
add r0, r0, #2
|
||||||
|
vst1.32 {d16[0]}, [r0], r1
|
||||||
|
vst1.32 {d0[0]}, [r0], r1
|
||||||
|
vst1.32 {d16[1]}, [r0], r1
|
||||||
|
vst1.32 {d0[1]}, [r0]
|
||||||
|
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
@ h264_loop_filter_chroma_intra: intra chroma edge filter.
@ Parameter width (default 16): with 16 both halves of each q register are
@ processed; with any other value the .ifc blocks are skipped and only the
@ low halves (d18, d16, d0, d2) contribute to the 16-bit sums.
@ Inputs (per the inline comments): q9 = p1, q8 = p0, q0 = q0, q1 = q1,
@ r2 = alpha, r3 = beta.
@ Outputs: q8 = (2*p1 + p0 + q1 + 2) >> 2 and q0 = (2*q1 + q0 + p1 + 2) >> 2
@ in the lanes that pass the alpha/beta tests; other lanes keep their value.
@ Writes q1, q2, q11, q13, q14 and q15 (plus q3, q10, q12 for width 16).
.macro h264_loop_filter_chroma_intra, width=16
|
||||||
|
vdup.8 q11, r2 @ alpha
|
||||||
|
vabd.u8 q13, q8, q0 @ abs(p0 - q0)
|
||||||
|
vabd.u8 q14, q9, q8 @ abs(p1 - p0)
|
||||||
|
vabd.u8 q15, q1, q0 @ abs(q1 - q0)
|
||||||
|
vclt.u8 q13, q13, q11 @ < alpha
|
||||||
|
vdup.8 q11, r3 @ beta
|
||||||
|
vclt.u8 q14, q14, q11 @ < beta
|
||||||
|
vclt.u8 q15, q15, q11 @ < beta
|
||||||
|
vand q13, q13, q14
|
||||||
|
vand q13, q13, q15
|
||||||
|
|
||||||
|
@ q14/q15 accumulate 2*p1 + p0 + q1, q2/q3 accumulate 2*q1 + q0 + p1
vshll.u8 q14, d18, #1
|
||||||
|
vshll.u8 q2, d2, #1
|
||||||
|
.ifc \width, 16
|
||||||
|
vshll.u8 q15, d19, #1
|
||||||
|
vshll.u8 q3, d3, #1
|
||||||
|
vaddl.u8 q12, d17, d3
|
||||||
|
vaddl.u8 q10, d1, d19
|
||||||
|
.endif
|
||||||
|
vaddl.u8 q11, d16, d2
|
||||||
|
vaddl.u8 q1, d18, d0 @ or vaddw q2, to not clobber q1
|
||||||
|
vadd.u16 q14, q14, q11
|
||||||
|
vadd.u16 q2, q2, q1
|
||||||
|
.ifc \width, 16
|
||||||
|
vadd.u16 q15, q15, q12
|
||||||
|
vadd.u16 q3, q3, q10
|
||||||
|
.endif
|
||||||
|
vqrshrn.u16 d28, q14, #2
|
||||||
|
vqrshrn.u16 d4, q2, #2
|
||||||
|
.ifc \width, 16
|
||||||
|
vqrshrn.u16 d29, q15, #2
|
||||||
|
vqrshrn.u16 d5, q3, #2
|
||||||
|
.endif
|
||||||
|
@ insert the filtered values only where the mask in q13 is set
vbit q8, q14, q13
|
||||||
|
vbit q0, q2, q13
|
||||||
|
.endm
|
||||||
|
|
||||||
|
@ deblock_v_chroma_intra_neon( pix, stride, alpha, beta ) per deblock.h.
@ Loads four 16-byte rows starting at pix - 2*stride (p1, p0, q0, q1) with
@ de-interleaving vld2.8, so even bytes land in the first d register of each
@ pair and odd bytes in the second; runs the intra chroma macro and
@ re-interleaves the two middle rows on store (pix - stride and pix).
function deblock_v_chroma_intra_neon
|
||||||
|
sub r0, r0, r1, lsl #1
|
||||||
|
vld2.8 {d18,d19}, [r0,:128], r1
|
||||||
|
vld2.8 {d16,d17}, [r0,:128], r1
|
||||||
|
vld2.8 {d0, d1}, [r0,:128], r1
|
||||||
|
vld2.8 {d2, d3}, [r0,:128]
|
||||||
|
|
||||||
|
h264_loop_filter_chroma_intra
|
||||||
|
|
||||||
|
@ r0 is at pix + stride (last load had no post-increment); go to pix - stride
sub r0, r0, r1, lsl #1
|
||||||
|
vst2.8 {d16,d17}, [r0,:128], r1
|
||||||
|
vst2.8 {d0, d1}, [r0,:128], r1
|
||||||
|
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
@ deblock_h_chroma_intra_neon( pix, stride, alpha, beta ) per deblock.h.
@ Vertical-edge intra chroma filter over 8 rows: reads 8 bytes per row
@ starting at pix - 4, transposes in 16-bit units, filters, and writes
@ 4 bytes per row at pix - 2.
@ On return r0 = pix - 2 + 8*stride; deblock_h_chroma_422_intra_neon relies
@ on this to continue with the next 8 rows.
@ TRANSPOSE4x4_16 is defined outside this section.
function deblock_h_chroma_intra_neon
|
||||||
|
sub r0, r0, #4
|
||||||
|
vld1.8 {d18}, [r0], r1
|
||||||
|
vld1.8 {d16}, [r0], r1
|
||||||
|
vld1.8 {d0}, [r0], r1
|
||||||
|
vld1.8 {d2}, [r0], r1
|
||||||
|
vld1.8 {d19}, [r0], r1
|
||||||
|
vld1.8 {d17}, [r0], r1
|
||||||
|
vld1.8 {d1}, [r0], r1
|
||||||
|
vld1.8 {d3}, [r0], r1
|
||||||
|
|
||||||
|
TRANSPOSE4x4_16 q9, q8, q0, q1
|
||||||
|
|
||||||
|
h264_loop_filter_chroma_intra
|
||||||
|
|
||||||
|
vtrn.16 q8, q0
|
||||||
|
|
||||||
|
@ rewind the 8 rows consumed by the loads and move 2 bytes right (pix - 2)
sub r0, r0, r1, lsl #3
|
||||||
|
add r0, r0, #2
|
||||||
|
vst1.32 {d16[0]}, [r0], r1
|
||||||
|
vst1.32 {d0[0]}, [r0], r1
|
||||||
|
vst1.32 {d16[1]}, [r0], r1
|
||||||
|
vst1.32 {d0[1]}, [r0], r1
|
||||||
|
vst1.32 {d17[0]}, [r0], r1
|
||||||
|
vst1.32 {d1[0]}, [r0], r1
|
||||||
|
vst1.32 {d17[1]}, [r0], r1
|
||||||
|
vst1.32 {d1[1]}, [r0], r1
|
||||||
|
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
@ deblock_h_chroma_422_intra_neon( pix, stride, alpha, beta ) per deblock.h.
@ Runs deblock_h_chroma_intra_neon twice to cover 16 rows. The first call
@ leaves r0 at pix - 2 + 8*stride, so adding 2 gives the pix value for the
@ next 8 rows. r1-r3 are reused unchanged for the second call (the filter
@ macro writes no core registers).
function deblock_h_chroma_422_intra_neon
|
||||||
|
push {lr}
|
||||||
|
bl X(deblock_h_chroma_intra_neon)
|
||||||
|
add r0, r0, #2
|
||||||
|
@ restore lr so the tail-called second pass returns straight to the caller
pop {lr}
|
||||||
|
b X(deblock_h_chroma_intra_neon)
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
@ deblock_h_chroma_intra_mbaff_neon( pix, stride, alpha, beta ) per deblock.h.
@ Four-row version of the vertical-edge intra chroma filter: reads 8 bytes
@ from each of 4 rows starting at pix - 4, filters with the macro in its
@ width=8 form and writes 4 bytes per row at pix - 2.
@ TRANSPOSE4x4_16 is defined outside this section.
function deblock_h_chroma_intra_mbaff_neon
|
||||||
|
sub r0, r0, #4
|
||||||
|
vld1.8 {d18}, [r0], r1
|
||||||
|
vld1.8 {d16}, [r0], r1
|
||||||
|
vld1.8 {d0}, [r0], r1
|
||||||
|
vld1.8 {d2}, [r0], r1
|
||||||
|
|
||||||
|
TRANSPOSE4x4_16 d18, d16, d0, d2
|
||||||
|
|
||||||
|
h264_loop_filter_chroma_intra width=8
|
||||||
|
|
||||||
|
vtrn.16 d16, d0
|
||||||
|
|
||||||
|
@ rewind the 4 rows consumed by the loads and move 2 bytes right (pix - 2)
sub r0, r0, r1, lsl #2
|
||||||
|
add r0, r0, #2
|
||||||
|
vst1.32 {d16[0]}, [r0], r1
|
||||||
|
vst1.32 {d0[0]}, [r0], r1
|
||||||
|
vst1.32 {d16[1]}, [r0], r1
|
||||||
|
vst1.32 {d0[1]}, [r0]
|
||||||
|
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
@ deblock_strength_neon( nnz, ref, mv, bs, mvy_limit, bframe ) per deblock.h.
@ r0 = nnz, r1 = ref, r2 = mv, r3 = bs; mvy_limit and bframe are read from
@ [sp] and [sp, 4] (assuming the standard ARM calling convention).
@ q8 and q9 accumulate per-edge flags for bs[0] and bs[1]:
@  - ref bytes that differ from their neighbour (veor),
@  - mv differences that survive the saturating subtract of q10.
@ q10 holds, in every 16-bit lane, 3 in the low byte and mvy_limit - 1 in the
@ high byte, so a narrowed difference stays non-zero only if it is at least
@ 4 (low byte) or at least mvy_limit (high byte).
@ The flags are then reduced to 0/1, combined with 2 where nnz is set
@ (vmax), and stored to bs[1] and bs[0].
function deblock_strength_neon
|
||||||
|
ldr ip, [sp]
|
||||||
|
vmov.i8 q8, #0
|
||||||
|
lsl ip, ip, #8
|
||||||
|
@ r3 now points 32 bytes into bs (labelled bs[1] at the store below)
add r3, r3, #32
|
||||||
|
@ ip = (mvy_limit << 8) - 253 = ((mvy_limit - 1) << 8) + 3
sub ip, ip, #(1<<8)-3
|
||||||
|
vmov.i8 q9, #0
|
||||||
|
vdup.16 q10, ip
|
||||||
|
@ ip = bframe, used as the loop control below
ldr ip, [sp, #4]
|
||||||
|
|
||||||
|
@ one pass per reference list; r1 and r2 simply keep advancing between passes
lists:
|
||||||
|
@ load bytes ref
|
||||||
|
vld1.8 {d31}, [r1]!
|
||||||
|
add r2, r2, #16
|
||||||
|
vld1.8 {q1}, [r1]!
|
||||||
|
vmov.i8 q0, #0
|
||||||
|
vld1.8 {q2}, [r1]!
|
||||||
|
vext.8 q3, q0, q1, #15
|
||||||
|
vext.8 q0, q0, q2, #15
|
||||||
|
vuzp.32 q1, q2
|
||||||
|
vuzp.32 q3, q0
|
||||||
|
vext.8 q1, q15, q2, #12
|
||||||
|
|
||||||
|
veor q0, q0, q2
|
||||||
|
veor q1, q1, q2
|
||||||
|
vorr q8, q8, q0
|
||||||
|
vorr q9, q9, q1
|
||||||
|
|
||||||
|
vld1.16 {q11}, [r2,:128]! @ mv + 0x10
|
||||||
|
vld1.16 {q3}, [r2,:128]! @ mv + 0x20
|
||||||
|
vld1.16 {q12}, [r2,:128]! @ mv + 0x30
|
||||||
|
vld1.16 {q2}, [r2,:128]! @ mv + 0x40
|
||||||
|
vld1.16 {q13}, [r2,:128]! @ mv + 0x50
|
||||||
|
vext.8 q3, q3, q12, #12
|
||||||
|
vext.8 q2, q2, q13, #12
|
||||||
|
vabd.s16 q0, q12, q3
|
||||||
|
vld1.16 {q3}, [r2,:128]! @ mv + 0x60
|
||||||
|
vabd.s16 q1, q13, q2
|
||||||
|
vld1.16 {q14}, [r2,:128]! @ mv + 0x70
|
||||||
|
vqmovn.u16 d0, q0
|
||||||
|
vld1.16 {q2}, [r2,:128]! @ mv + 0x80
|
||||||
|
vld1.16 {q15}, [r2,:128]! @ mv + 0x90
|
||||||
|
vqmovn.u16 d1, q1
|
||||||
|
vext.8 q3, q3, q14, #12
|
||||||
|
vext.8 q2, q2, q15, #12
|
||||||
|
vabd.s16 q3, q14, q3
|
||||||
|
vabd.s16 q2, q15, q2
|
||||||
|
vqmovn.u16 d2, q3
|
||||||
|
vqmovn.u16 d3, q2
|
||||||
|
|
||||||
|
@ threshold the narrowed differences, then fold each byte pair into one byte
vqsub.u8 q0, q0, q10
|
||||||
|
vqsub.u8 q1, q1, q10
|
||||||
|
vqmovn.u16 d0, q0
|
||||||
|
vqmovn.u16 d1, q1
|
||||||
|
|
||||||
|
vabd.s16 q1, q12, q13
|
||||||
|
vorr q8, q8, q0
|
||||||
|
|
||||||
|
vabd.s16 q0, q11, q12
|
||||||
|
vabd.s16 q2, q13, q14
|
||||||
|
vabd.s16 q3, q14, q15
|
||||||
|
vqmovn.u16 d0, q0
|
||||||
|
vqmovn.u16 d1, q1
|
||||||
|
vqmovn.u16 d2, q2
|
||||||
|
vqmovn.u16 d3, q3
|
||||||
|
|
||||||
|
vqsub.u8 q0, q0, q10
|
||||||
|
vqsub.u8 q1, q1, q10
|
||||||
|
vqmovn.u16 d0, q0
|
||||||
|
vqmovn.u16 d1, q1
|
||||||
|
@ repeat once more only if ip was 1 (result 0); any other value falls through
subs ip, ip, #1
|
||||||
|
vorr q9, q9, q0
|
||||||
|
beq lists
|
||||||
|
|
||||||
|
@ post-index for the bs[1] store: moves r3 back by 32 bytes to bs[0]
mov ip, #-32
|
||||||
|
@ load bytes nnz
|
||||||
|
vld1.8 {d31}, [r0]!
|
||||||
|
vld1.8 {q1}, [r0]!
|
||||||
|
vmov.i8 q0, #0
|
||||||
|
vld1.8 {q2}, [r0]
|
||||||
|
vext.8 q3, q0, q1, #15
|
||||||
|
vext.8 q0, q0, q2, #15
|
||||||
|
vuzp.32 q1, q2
|
||||||
|
vuzp.32 q3, q0
|
||||||
|
vext.8 q1, q15, q2, #12
|
||||||
|
|
||||||
|
vorr q0, q0, q2
|
||||||
|
vorr q1, q1, q2
|
||||||
|
vmov.u8 q10, #1
|
||||||
|
vmin.u8 q0, q0, q10
|
||||||
|
vmin.u8 q1, q1, q10
|
||||||
|
vmin.u8 q8, q8, q10 @ mv ? 1 : 0
|
||||||
|
vmin.u8 q9, q9, q10
|
||||||
|
vadd.u8 q0, q0, q0 @ nnz ? 2 : 0
|
||||||
|
vadd.u8 q1, q1, q1
|
||||||
|
vmax.u8 q8, q8, q0
|
||||||
|
vmax.u8 q9, q9, q1
|
||||||
|
@ q8 is rearranged (vzip/vtrn) before being stored as bs[0]; q9 is stored as is
vzip.16 d16, d17
|
||||||
|
vst1.8 {q9}, [r3,:128], ip @ bs[1]
|
||||||
|
vtrn.8 d16, d17
|
||||||
|
vtrn.32 d16, d17
|
||||||
|
|
||||||
|
vst1.8 {q8}, [r3,:128] @ bs[0]
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
58
common/arm/deblock.h
Normal file
58
common/arm/deblock.h
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* deblock.h: arm deblocking
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2017-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_ARM_DEBLOCK_H
|
||||||
|
#define X264_ARM_DEBLOCK_H
|
||||||
|
|
||||||
|
#define x264_deblock_v_luma_neon x264_template(deblock_v_luma_neon)
|
||||||
|
void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
#define x264_deblock_h_luma_neon x264_template(deblock_h_luma_neon)
|
||||||
|
void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
#define x264_deblock_v_chroma_neon x264_template(deblock_v_chroma_neon)
|
||||||
|
void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
#define x264_deblock_h_chroma_neon x264_template(deblock_h_chroma_neon)
|
||||||
|
void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
#define x264_deblock_strength_neon x264_template(deblock_strength_neon)
|
||||||
|
void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
|
||||||
|
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
|
||||||
|
int mvy_limit, int bframe );
|
||||||
|
#define x264_deblock_h_chroma_422_neon x264_template(deblock_h_chroma_422_neon)
|
||||||
|
void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
#define x264_deblock_h_chroma_mbaff_neon x264_template(deblock_h_chroma_mbaff_neon)
|
||||||
|
void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
#define x264_deblock_h_chroma_intra_mbaff_neon x264_template(deblock_h_chroma_intra_mbaff_neon)
|
||||||
|
void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
#define x264_deblock_h_chroma_intra_neon x264_template(deblock_h_chroma_intra_neon)
|
||||||
|
void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
#define x264_deblock_h_chroma_422_intra_neon x264_template(deblock_h_chroma_422_intra_neon)
|
||||||
|
void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
#define x264_deblock_v_chroma_intra_neon x264_template(deblock_v_chroma_intra_neon)
|
||||||
|
void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
#define x264_deblock_h_luma_intra_neon x264_template(deblock_h_luma_intra_neon)
|
||||||
|
void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
#define x264_deblock_v_luma_intra_neon x264_template(deblock_v_luma_intra_neon)
|
||||||
|
void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
|
||||||
|
#endif
|
||||||
1938
common/arm/mc-a.S
Normal file
1938
common/arm/mc-a.S
Normal file
File diff suppressed because it is too large
Load Diff
366
common/arm/mc-c.c
Normal file
366
common/arm/mc-c.c
Normal file
@@ -0,0 +1,366 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* mc-c.c: arm motion compensation
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
* Janne Grunau <janne-x264@jannau.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common/common.h"
|
||||||
|
#include "mc.h"
|
||||||
|
|
||||||
|
#define x264_prefetch_ref_arm x264_template(prefetch_ref_arm)
|
||||||
|
void x264_prefetch_ref_arm( uint8_t *, intptr_t, int );
|
||||||
|
#define x264_prefetch_fenc_arm x264_template(prefetch_fenc_arm)
|
||||||
|
void x264_prefetch_fenc_arm( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
|
||||||
|
#define x264_memcpy_aligned_neon x264_template(memcpy_aligned_neon)
|
||||||
|
void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
|
||||||
|
#define x264_memzero_aligned_neon x264_template(memzero_aligned_neon)
|
||||||
|
void x264_memzero_aligned_neon( void *dst, size_t n );
|
||||||
|
|
||||||
|
#define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
|
||||||
|
void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
|
||||||
|
void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
|
||||||
|
void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
|
||||||
|
void x264_pixel_avg_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
|
||||||
|
void x264_pixel_avg_8x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
|
||||||
|
void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
|
||||||
|
void x264_pixel_avg_4x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
|
||||||
|
void x264_pixel_avg_4x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
|
||||||
|
void x264_pixel_avg_4x2_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
|
||||||
|
#define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
|
||||||
|
void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||||
|
#define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
|
||||||
|
void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||||
|
#define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon)
|
||||||
|
void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||||
|
#define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
|
||||||
|
void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||||
|
|
||||||
|
#define x264_plane_copy_core_neon x264_template(plane_copy_core_neon)
|
||||||
|
void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
|
||||||
|
pixel *src, intptr_t i_src, int w, int h );
|
||||||
|
#define x264_plane_copy_deinterleave_neon x264_template(plane_copy_deinterleave_neon)
|
||||||
|
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
|
||||||
|
pixel *dstv, intptr_t i_dstv,
|
||||||
|
pixel *src, intptr_t i_src, int w, int h );
|
||||||
|
#define x264_plane_copy_deinterleave_rgb_neon x264_template(plane_copy_deinterleave_rgb_neon)
|
||||||
|
void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
|
||||||
|
pixel *dstb, intptr_t i_dstb,
|
||||||
|
pixel *dstc, intptr_t i_dstc,
|
||||||
|
pixel *src, intptr_t i_src, int pw, int w, int h );
|
||||||
|
#define x264_plane_copy_interleave_core_neon x264_template(plane_copy_interleave_core_neon)
|
||||||
|
void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst,
|
||||||
|
pixel *srcu, intptr_t i_srcu,
|
||||||
|
pixel *srcv, intptr_t i_srcv, int w, int h );
|
||||||
|
#define x264_plane_copy_swap_core_neon x264_template(plane_copy_swap_core_neon)
|
||||||
|
void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
|
||||||
|
pixel *src, intptr_t i_src, int w, int h );
|
||||||
|
|
||||||
|
#define x264_store_interleave_chroma_neon x264_template(store_interleave_chroma_neon)
|
||||||
|
void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
|
||||||
|
#define x264_load_deinterleave_chroma_fdec_neon x264_template(load_deinterleave_chroma_fdec_neon)
|
||||||
|
void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
|
||||||
|
#define x264_load_deinterleave_chroma_fenc_neon x264_template(load_deinterleave_chroma_fenc_neon)
|
||||||
|
void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
|
||||||
|
|
||||||
|
#define x264_mc_weight_w16_neon x264_template(mc_weight_w16_neon)
|
||||||
|
#define x264_mc_weight_w16_nodenom_neon x264_template(mc_weight_w16_nodenom_neon)
|
||||||
|
#define x264_mc_weight_w16_offsetadd_neon x264_template(mc_weight_w16_offsetadd_neon)
|
||||||
|
#define x264_mc_weight_w16_offsetsub_neon x264_template(mc_weight_w16_offsetsub_neon)
|
||||||
|
#define x264_mc_weight_w20_neon x264_template(mc_weight_w20_neon)
|
||||||
|
#define x264_mc_weight_w20_nodenom_neon x264_template(mc_weight_w20_nodenom_neon)
|
||||||
|
#define x264_mc_weight_w20_offsetadd_neon x264_template(mc_weight_w20_offsetadd_neon)
|
||||||
|
#define x264_mc_weight_w20_offsetsub_neon x264_template(mc_weight_w20_offsetsub_neon)
|
||||||
|
#define x264_mc_weight_w4_neon x264_template(mc_weight_w4_neon)
|
||||||
|
#define x264_mc_weight_w4_nodenom_neon x264_template(mc_weight_w4_nodenom_neon)
|
||||||
|
#define x264_mc_weight_w4_offsetadd_neon x264_template(mc_weight_w4_offsetadd_neon)
|
||||||
|
#define x264_mc_weight_w4_offsetsub_neon x264_template(mc_weight_w4_offsetsub_neon)
|
||||||
|
#define x264_mc_weight_w8_neon x264_template(mc_weight_w8_neon)
|
||||||
|
#define x264_mc_weight_w8_nodenom_neon x264_template(mc_weight_w8_nodenom_neon)
|
||||||
|
#define x264_mc_weight_w8_offsetadd_neon x264_template(mc_weight_w8_offsetadd_neon)
|
||||||
|
#define x264_mc_weight_w8_offsetsub_neon x264_template(mc_weight_w8_offsetsub_neon)
|
||||||
|
#if !HIGH_BIT_DEPTH
|
||||||
|
#define MC_WEIGHT(func)\
|
||||||
|
void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
|
||||||
|
void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
|
||||||
|
void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
|
||||||
|
void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
|
||||||
|
\
|
||||||
|
static weight_fn_t mc##func##_wtab_neon[6] =\
|
||||||
|
{\
|
||||||
|
x264_mc_weight_w4##func##_neon,\
|
||||||
|
x264_mc_weight_w4##func##_neon,\
|
||||||
|
x264_mc_weight_w8##func##_neon,\
|
||||||
|
x264_mc_weight_w16##func##_neon,\
|
||||||
|
x264_mc_weight_w16##func##_neon,\
|
||||||
|
x264_mc_weight_w20##func##_neon,\
|
||||||
|
};
|
||||||
|
|
||||||
|
MC_WEIGHT()
|
||||||
|
MC_WEIGHT(_nodenom)
|
||||||
|
MC_WEIGHT(_offsetadd)
|
||||||
|
MC_WEIGHT(_offsetsub)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define x264_mc_copy_w4_neon x264_template(mc_copy_w4_neon)
|
||||||
|
void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_mc_copy_w8_neon x264_template(mc_copy_w8_neon)
|
||||||
|
void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_mc_copy_w16_neon x264_template(mc_copy_w16_neon)
|
||||||
|
void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_mc_copy_w16_aligned_neon x264_template(mc_copy_w16_aligned_neon)
|
||||||
|
void x264_mc_copy_w16_aligned_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
|
||||||
|
#define x264_mc_chroma_neon x264_template(mc_chroma_neon)
|
||||||
|
void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
|
||||||
|
#define x264_frame_init_lowres_core_neon x264_template(frame_init_lowres_core_neon)
|
||||||
|
void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
|
||||||
|
|
||||||
|
#define x264_hpel_filter_v_neon x264_template(hpel_filter_v_neon)
|
||||||
|
void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int );
|
||||||
|
#define x264_hpel_filter_c_neon x264_template(hpel_filter_c_neon)
|
||||||
|
void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
|
||||||
|
#define x264_hpel_filter_h_neon x264_template(hpel_filter_h_neon)
|
||||||
|
void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
|
||||||
|
|
||||||
|
#define x264_integral_init4h_neon x264_template(integral_init4h_neon)
|
||||||
|
void x264_integral_init4h_neon( uint16_t *, uint8_t *, intptr_t );
|
||||||
|
#define x264_integral_init4v_neon x264_template(integral_init4v_neon)
|
||||||
|
void x264_integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
|
||||||
|
#define x264_integral_init8h_neon x264_template(integral_init8h_neon)
|
||||||
|
void x264_integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
|
||||||
|
#define x264_integral_init8v_neon x264_template(integral_init8v_neon)
|
||||||
|
void x264_integral_init8v_neon( uint16_t *, intptr_t );
|
||||||
|
|
||||||
|
#define x264_mbtree_propagate_cost_neon x264_template(mbtree_propagate_cost_neon)
|
||||||
|
void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
|
||||||
|
|
||||||
|
#define x264_mbtree_fix8_pack_neon x264_template(mbtree_fix8_pack_neon)
|
||||||
|
void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
|
||||||
|
#define x264_mbtree_fix8_unpack_neon x264_template(mbtree_fix8_unpack_neon)
|
||||||
|
void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
|
||||||
|
|
||||||
|
#if !HIGH_BIT_DEPTH
|
||||||
|
/* Select the weighting function table for *w and, for the offset-only case,
 * store the offset magnitude in w->cachea[0].
 *   i_scale == 1<<i_denom : offset-only tables (add for offset >= 0, sub otherwise)
 *   i_denom == 0          : mc_nodenom_wtab_neon
 *   anything else         : mc_wtab_neon
 * h is not used. */
static void weight_cache_neon( x264_t *h, x264_weight_t *w )
{
    /* Scale differs from 1<<i_denom: a multiplying kernel is required. */
    if( w->i_scale != 1<<w->i_denom )
    {
        w->weightfn = w->i_denom ? mc_wtab_neon : mc_nodenom_wtab_neon;
        return;
    }

    /* Offset-only weighting: choose by sign and cache the absolute offset. */
    int is_negative = w->i_offset < 0;
    if( is_negative )
    {
        w->weightfn  = mc_offsetsub_wtab_neon;
        w->cachea[0] = -w->i_offset;
        return;
    }
    w->weightfn  = mc_offsetadd_wtab_neon;
    w->cachea[0] = w->i_offset;
}
|
||||||
|
|
||||||
|
static void (* const pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
|
||||||
|
{
|
||||||
|
NULL,
|
||||||
|
x264_pixel_avg2_w4_neon,
|
||||||
|
x264_pixel_avg2_w8_neon,
|
||||||
|
x264_pixel_avg2_w16_neon, // no slower than w12, so no point in a separate function
|
||||||
|
x264_pixel_avg2_w16_neon,
|
||||||
|
x264_pixel_avg2_w20_neon,
|
||||||
|
};
|
||||||
|
|
||||||
|
static void (* const mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) =
|
||||||
|
{
|
||||||
|
NULL,
|
||||||
|
x264_mc_copy_w4_neon,
|
||||||
|
x264_mc_copy_w8_neon,
|
||||||
|
NULL,
|
||||||
|
x264_mc_copy_w16_neon,
|
||||||
|
};
|
||||||
|
|
||||||
|
static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride,
|
||||||
|
uint8_t *src[4], intptr_t i_src_stride,
|
||||||
|
int mvx, int mvy,
|
||||||
|
int i_width, int i_height, const x264_weight_t *weight )
|
||||||
|
{
|
||||||
|
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
|
||||||
|
intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
|
||||||
|
uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
|
||||||
|
if( (mvy&3) == 3 ) // explicit if() to force conditional add
|
||||||
|
src1 += i_src_stride;
|
||||||
|
|
||||||
|
if( qpel_idx & 5 ) /* qpel interpolation needed */
|
||||||
|
{
|
||||||
|
uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
|
||||||
|
pixel_avg_wtab_neon[i_width>>2](
|
||||||
|
dst, i_dst_stride, src1, i_src_stride,
|
||||||
|
src2, i_height );
|
||||||
|
if( weight->weightfn )
|
||||||
|
weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
|
||||||
|
}
|
||||||
|
else if( weight->weightfn )
|
||||||
|
weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
|
||||||
|
else
|
||||||
|
mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height );
|
||||||
|
}
|
||||||
|
|
||||||
|
static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride,
|
||||||
|
uint8_t *src[4], intptr_t i_src_stride,
|
||||||
|
int mvx, int mvy,
|
||||||
|
int i_width, int i_height, const x264_weight_t *weight )
|
||||||
|
{
|
||||||
|
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
|
||||||
|
intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
|
||||||
|
uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
|
||||||
|
if( (mvy&3) == 3 ) // explicit if() to force conditional add
|
||||||
|
src1 += i_src_stride;
|
||||||
|
|
||||||
|
if( qpel_idx & 5 ) /* qpel interpolation needed */
|
||||||
|
{
|
||||||
|
uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
|
||||||
|
pixel_avg_wtab_neon[i_width>>2](
|
||||||
|
dst, *i_dst_stride, src1, i_src_stride,
|
||||||
|
src2, i_height );
|
||||||
|
if( weight->weightfn )
|
||||||
|
weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
|
||||||
|
return dst;
|
||||||
|
}
|
||||||
|
else if( weight->weightfn )
|
||||||
|
{
|
||||||
|
weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
|
||||||
|
return dst;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
*i_dst_stride = i_src_stride;
|
||||||
|
return src1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Run the v, c and h half-pel filter passes over `height` rows.
 *
 * All four pointers are first moved back by (src & 15) bytes and width is
 * grown by the same amount, so src starts on a 16-byte boundary. The v pass
 * writes its intermediate row to buf+8, which the c pass then reads. */
static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
                              intptr_t stride, int width, int height, int16_t *buf )
{
    const intptr_t misalign = (intptr_t)src & 15;
    src   -= misalign;
    dstv  -= misalign;
    dstc  -= misalign;
    dsth  -= misalign;
    width += misalign;

    for( ; height; height-- )
    {
        int16_t *row_tmp = buf+8;

        x264_hpel_filter_v_neon( dstv, src, row_tmp, stride, width );
        x264_hpel_filter_c_neon( dstc, row_tmp, width );
        x264_hpel_filter_h_neon( dsth, src, width );

        /* Advance every plane by one row. */
        dsth += stride;
        dstv += stride;
        dstc += stride;
        src  += stride;
    }
}
|
||||||
|
|
||||||
|
// NOTE(review): these four macros are defined outside this file section.
// x264_mc_init_arm() installs plane_copy_neon, plane_copy_swap_neon,
// plane_copy_interleave_neon and mbtree_propagate_list_neon, none of which is
// defined explicitly here, so these instantiations presumably generate them
// (likely as wrappers around the x264_*_core_neon routines) -- confirm
// against the macro definitions.
PLANE_COPY(16, neon)
|
||||||
|
PLANE_COPY_SWAP(16, neon)
|
||||||
|
PLANE_INTERLEAVE(neon)
|
||||||
|
PROPAGATE_LIST(neon)
|
||||||
|
#endif // !HIGH_BIT_DEPTH
|
||||||
|
|
||||||
|
/* Install the ARM implementations into *pf according to the capability bits
 * in cpu.
 *   - without X264_CPU_ARMV6 nothing is changed;
 *   - with ARMv6 the prefetch hooks are set (8-bit-depth builds only);
 *   - with X264_CPU_NEON the NEON kernels are set as well; most of them are
 *     compiled only when HIGH_BIT_DEPTH is off, while the aligned memcpy and
 *     memzero hooks are set for every bit depth. */
void x264_mc_init_arm( uint32_t cpu, x264_mc_functions_t *pf )
{
    if( (cpu & X264_CPU_ARMV6) == 0 )
        return;

#if !HIGH_BIT_DEPTH
    pf->prefetch_ref      = x264_prefetch_ref_arm;
    pf->prefetch_fenc_420 = x264_prefetch_fenc_arm;
    pf->prefetch_fenc_422 = x264_prefetch_fenc_arm; /* FIXME */
#endif // !HIGH_BIT_DEPTH

    if( (cpu & X264_CPU_NEON) == 0 )
        return;

#if !HIGH_BIT_DEPTH
    /* Block copies. */
    pf->copy[PIXEL_16x16]    = x264_mc_copy_w16_aligned_neon;
    pf->copy[PIXEL_8x8]      = x264_mc_copy_w8_neon;
    pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_neon;
    pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;

    /* Plane copies. */
    pf->plane_copy                  = plane_copy_neon;
    pf->plane_copy_swap             = plane_copy_swap_neon;
    pf->plane_copy_interleave       = plane_copy_interleave_neon;
    pf->plane_copy_deinterleave     = x264_plane_copy_deinterleave_neon;
    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;

    /* Chroma interleave / deinterleave. */
    pf->store_interleave_chroma       = x264_store_interleave_chroma_neon;
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;

    /* Averaging, one kernel per block size. */
    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;

    /* Weighted prediction. */
    pf->weight_cache = weight_cache_neon;
    pf->weight       = mc_wtab_neon;
    pf->offsetadd    = mc_offsetadd_wtab_neon;
    pf->offsetsub    = mc_offsetsub_wtab_neon;

    /* Motion compensation and half-pel filtering. */
    pf->mc_luma                = mc_luma_neon;
    pf->mc_chroma              = x264_mc_chroma_neon;
    pf->get_ref                = get_ref_neon;
    pf->hpel_filter            = hpel_filter_neon;
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;

    /* Integral images. */
    pf->integral_init4h = x264_integral_init4h_neon;
    pf->integral_init4v = x264_integral_init4v_neon;
    pf->integral_init8h = x264_integral_init8h_neon;
    pf->integral_init8v = x264_integral_init8v_neon;

    /* Macroblock tree. */
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
    pf->mbtree_propagate_list = mbtree_propagate_list_neon;
    pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_neon;
    pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_neon;
#endif // !HIGH_BIT_DEPTH

    pf->memzero_aligned = x264_memzero_aligned_neon;
// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
#ifndef SYS_MACOSX
    pf->memcpy_aligned  = x264_memcpy_aligned_neon;
#endif
}
|
||||||
32
common/arm/mc.h
Normal file
32
common/arm/mc.h
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* mc.h: arm motion compensation
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/* Include guard for the ARM motion-compensation header. */
#ifndef X264_ARM_MC_H
|
||||||
|
#define X264_ARM_MC_H
|
||||||
|
|
||||||
|
/* The init routine's name is routed through x264_template(). */
#define x264_mc_init_arm x264_template(mc_init_arm)
|
||||||
|
/* Installs the ARM implementations into *pf according to the capability
 * bits in cpu. */
void x264_mc_init_arm( uint32_t cpu, x264_mc_functions_t *pf );
|
||||||
|
|
||||||
|
#endif /* X264_ARM_MC_H */
|
||||||
1535
common/arm/pixel-a.S
Normal file
1535
common/arm/pixel-a.S
Normal file
File diff suppressed because it is too large
Load Diff
160
common/arm/pixel.h
Normal file
160
common/arm/pixel.h
Normal file
@@ -0,0 +1,160 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* pixel.h: arm pixel metrics
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_ARM_PIXEL_H
|
||||||
|
#define X264_ARM_PIXEL_H
|
||||||
|
|
||||||
|
/* Name aliases (via x264_template) for the pixel kernels. */

/* avg2 / avg */
#define x264_pixel_avg2_w16_neon               x264_template(pixel_avg2_w16_neon)
#define x264_pixel_avg2_w20_neon               x264_template(pixel_avg2_w20_neon)
#define x264_pixel_avg2_w4_neon                x264_template(pixel_avg2_w4_neon)
#define x264_pixel_avg2_w8_neon                x264_template(pixel_avg2_w8_neon)
#define x264_pixel_avg_16x16_neon              x264_template(pixel_avg_16x16_neon)
#define x264_pixel_avg_16x8_neon               x264_template(pixel_avg_16x8_neon)
#define x264_pixel_avg_4x16_neon               x264_template(pixel_avg_4x16_neon)
#define x264_pixel_avg_4x2_neon                x264_template(pixel_avg_4x2_neon)
#define x264_pixel_avg_4x4_neon                x264_template(pixel_avg_4x4_neon)
#define x264_pixel_avg_4x8_neon                x264_template(pixel_avg_4x8_neon)
#define x264_pixel_avg_8x16_neon               x264_template(pixel_avg_8x16_neon)
#define x264_pixel_avg_8x4_neon                x264_template(pixel_avg_8x4_neon)
#define x264_pixel_avg_8x8_neon                x264_template(pixel_avg_8x8_neon)

/* sad */
#define x264_pixel_sad_16x16_neon              x264_template(pixel_sad_16x16_neon)
#define x264_pixel_sad_16x8_neon               x264_template(pixel_sad_16x8_neon)
#define x264_pixel_sad_4x4_armv6               x264_template(pixel_sad_4x4_armv6)
#define x264_pixel_sad_4x4_neon                x264_template(pixel_sad_4x4_neon)
#define x264_pixel_sad_4x8_armv6               x264_template(pixel_sad_4x8_armv6)
#define x264_pixel_sad_4x8_neon                x264_template(pixel_sad_4x8_neon)
#define x264_pixel_sad_8x16_neon               x264_template(pixel_sad_8x16_neon)
#define x264_pixel_sad_8x4_neon                x264_template(pixel_sad_8x4_neon)
#define x264_pixel_sad_8x8_neon                x264_template(pixel_sad_8x8_neon)

/* sad_aligned */
#define x264_pixel_sad_aligned_16x16_neon      x264_template(pixel_sad_aligned_16x16_neon)
#define x264_pixel_sad_aligned_16x16_neon_dual x264_template(pixel_sad_aligned_16x16_neon_dual)
#define x264_pixel_sad_aligned_16x8_neon       x264_template(pixel_sad_aligned_16x8_neon)
#define x264_pixel_sad_aligned_16x8_neon_dual  x264_template(pixel_sad_aligned_16x8_neon_dual)
#define x264_pixel_sad_aligned_4x4_neon        x264_template(pixel_sad_aligned_4x4_neon)
#define x264_pixel_sad_aligned_4x8_neon        x264_template(pixel_sad_aligned_4x8_neon)
#define x264_pixel_sad_aligned_8x16_neon       x264_template(pixel_sad_aligned_8x16_neon)
#define x264_pixel_sad_aligned_8x16_neon_dual  x264_template(pixel_sad_aligned_8x16_neon_dual)
#define x264_pixel_sad_aligned_8x4_neon        x264_template(pixel_sad_aligned_8x4_neon)
#define x264_pixel_sad_aligned_8x4_neon_dual   x264_template(pixel_sad_aligned_8x4_neon_dual)
#define x264_pixel_sad_aligned_8x8_neon        x264_template(pixel_sad_aligned_8x8_neon)
#define x264_pixel_sad_aligned_8x8_neon_dual   x264_template(pixel_sad_aligned_8x8_neon_dual)

/* sad_x3 */
#define x264_pixel_sad_x3_16x16_neon           x264_template(pixel_sad_x3_16x16_neon)
#define x264_pixel_sad_x3_16x8_neon            x264_template(pixel_sad_x3_16x8_neon)
#define x264_pixel_sad_x3_4x4_neon             x264_template(pixel_sad_x3_4x4_neon)
#define x264_pixel_sad_x3_4x8_neon             x264_template(pixel_sad_x3_4x8_neon)
#define x264_pixel_sad_x3_8x16_neon            x264_template(pixel_sad_x3_8x16_neon)
#define x264_pixel_sad_x3_8x4_neon             x264_template(pixel_sad_x3_8x4_neon)
#define x264_pixel_sad_x3_8x8_neon             x264_template(pixel_sad_x3_8x8_neon)

/* sad_x4 */
#define x264_pixel_sad_x4_16x16_neon           x264_template(pixel_sad_x4_16x16_neon)
#define x264_pixel_sad_x4_16x8_neon            x264_template(pixel_sad_x4_16x8_neon)
#define x264_pixel_sad_x4_4x4_neon             x264_template(pixel_sad_x4_4x4_neon)
#define x264_pixel_sad_x4_4x8_neon             x264_template(pixel_sad_x4_4x8_neon)
#define x264_pixel_sad_x4_8x16_neon            x264_template(pixel_sad_x4_8x16_neon)
#define x264_pixel_sad_x4_8x4_neon             x264_template(pixel_sad_x4_8x4_neon)
#define x264_pixel_sad_x4_8x8_neon             x264_template(pixel_sad_x4_8x8_neon)

/* satd */
#define x264_pixel_satd_16x16_neon             x264_template(pixel_satd_16x16_neon)
#define x264_pixel_satd_16x8_neon              x264_template(pixel_satd_16x8_neon)
#define x264_pixel_satd_4x4_neon               x264_template(pixel_satd_4x4_neon)
#define x264_pixel_satd_4x8_neon               x264_template(pixel_satd_4x8_neon)
#define x264_pixel_satd_8x16_neon              x264_template(pixel_satd_8x16_neon)
#define x264_pixel_satd_8x4_neon               x264_template(pixel_satd_8x4_neon)
#define x264_pixel_satd_8x8_neon               x264_template(pixel_satd_8x8_neon)

/* ssd */
#define x264_pixel_ssd_16x16_neon              x264_template(pixel_ssd_16x16_neon)
#define x264_pixel_ssd_16x8_neon               x264_template(pixel_ssd_16x8_neon)
#define x264_pixel_ssd_4x4_neon                x264_template(pixel_ssd_4x4_neon)
#define x264_pixel_ssd_4x8_neon                x264_template(pixel_ssd_4x8_neon)
#define x264_pixel_ssd_8x16_neon               x264_template(pixel_ssd_8x16_neon)
#define x264_pixel_ssd_8x4_neon                x264_template(pixel_ssd_8x4_neon)
#define x264_pixel_ssd_8x8_neon                x264_template(pixel_ssd_8x8_neon)
|
||||||
|
/* DECL_PIXELS( ret, name, suffix, args )
 * Declares x264_pixel_<name>_<W>x<H>_<suffix> for the seven block sizes
 * 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4. Each declaration returns `ret`
 * and takes the parenthesised parameter list `args`. */
#define DECL_PIXELS( ret, name, suffix, args ) \
    ret x264_pixel_##name##_16x16_##suffix args;\
    ret x264_pixel_##name##_16x8_##suffix args;\
    ret x264_pixel_##name##_8x16_##suffix args;\
    ret x264_pixel_##name##_8x8_##suffix args;\
    ret x264_pixel_##name##_8x4_##suffix args;\
    ret x264_pixel_##name##_4x8_##suffix args;\
    ret x264_pixel_##name##_4x4_##suffix args;

/* DECL_X1( name, suffix )
 * Declares the int-returning metrics that take two (uint8_t *, intptr_t)
 * pairs. The second and fourth parameters are intptr_t, the same type the
 * explicitly declared metrics with this layout use in this header
 * (x264_pixel_sad_4x4_armv6, x264_pixel_sa8d_8x8_neon, ...); declaring them
 * as plain int gives a different function type wherever intptr_t is not int. */
#define DECL_X1( name, suffix ) \
    DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) )

/* DECL_X4( name, suffix )
 * Declares both the <name>_x3 and the <name>_x4 variants: void functions
 * taking four (x3) or five (x4) uint8_t pointers, one intptr_t and an
 * int pointer. */
#define DECL_X4( name, suffix ) \
    DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
    DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
|
||||||
|
|
||||||
|
int x264_pixel_sad_4x4_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t );
|
||||||
|
int x264_pixel_sad_4x8_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t );
|
||||||
|
|
||||||
|
DECL_X1( sad, neon )
|
||||||
|
DECL_X1( sad_aligned, neon )
|
||||||
|
DECL_X1( sad_aligned, neon_dual )
|
||||||
|
DECL_X4( sad, neon )
|
||||||
|
DECL_X1( satd, neon )
|
||||||
|
DECL_X1( ssd, neon )
|
||||||
|
|
||||||
|
#define x264_pixel_ssd_nv12_core_neon x264_template(pixel_ssd_nv12_core_neon)
|
||||||
|
void x264_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * );
|
||||||
|
|
||||||
|
#define x264_pixel_vsad_neon x264_template(pixel_vsad_neon)
|
||||||
|
int x264_pixel_vsad_neon( uint8_t *, intptr_t, int );
|
||||||
|
|
||||||
|
#define x264_pixel_sa8d_8x8_neon x264_template(pixel_sa8d_8x8_neon)
|
||||||
|
int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
|
||||||
|
#define x264_pixel_sa8d_16x16_neon x264_template(pixel_sa8d_16x16_neon)
|
||||||
|
int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
|
||||||
|
#define x264_pixel_sa8d_satd_16x16_neon x264_template(pixel_sa8d_satd_16x16_neon)
|
||||||
|
uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
|
||||||
|
|
||||||
|
#define x264_pixel_var_8x8_neon x264_template(pixel_var_8x8_neon)
|
||||||
|
uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
|
||||||
|
#define x264_pixel_var_8x16_neon x264_template(pixel_var_8x16_neon)
|
||||||
|
uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
|
||||||
|
#define x264_pixel_var_16x16_neon x264_template(pixel_var_16x16_neon)
|
||||||
|
uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
|
||||||
|
#define x264_pixel_var2_8x8_neon x264_template(pixel_var2_8x8_neon)
|
||||||
|
int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * );
|
||||||
|
#define x264_pixel_var2_8x16_neon x264_template(pixel_var2_8x16_neon)
|
||||||
|
int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );
|
||||||
|
|
||||||
|
#define x264_pixel_hadamard_ac_8x8_neon x264_template(pixel_hadamard_ac_8x8_neon)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t );
|
||||||
|
#define x264_pixel_hadamard_ac_8x16_neon x264_template(pixel_hadamard_ac_8x16_neon)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
|
||||||
|
#define x264_pixel_hadamard_ac_16x8_neon x264_template(pixel_hadamard_ac_16x8_neon)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t );
|
||||||
|
#define x264_pixel_hadamard_ac_16x16_neon x264_template(pixel_hadamard_ac_16x16_neon)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t );
|
||||||
|
|
||||||
|
#define x264_pixel_ssim_4x4x2_core_neon x264_template(pixel_ssim_4x4x2_core_neon)
|
||||||
|
void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
|
||||||
|
const uint8_t *, intptr_t,
|
||||||
|
int sums[2][4] );
|
||||||
|
#define x264_pixel_ssim_end4_neon x264_template(pixel_ssim_end4_neon)
|
||||||
|
float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
|
||||||
|
|
||||||
|
#define x264_pixel_asd8_neon x264_template(pixel_asd8_neon)
|
||||||
|
int x264_pixel_asd8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
|
||||||
|
#endif
|
||||||
808
common/arm/predict-a.S
Normal file
808
common/arm/predict-a.S
Normal file
@@ -0,0 +1,808 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* predict.S: arm intra prediction
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
* Mans Rullgard <mans@mansr.com>
|
||||||
|
* Martin Storsjo <martin@martin.st>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "asm.S"
|
||||||
|
|
||||||
|
// Table of the 16-bit weights 1..8. The plane (P) predictors in this file
// load it to weight edge-pixel differences and to build per-column offsets.
const p16weight, align=4
    .short      1, 2, 3, 4, 5, 6, 7, 8
endconst
|
||||||
|
|
||||||
|
.text
|
||||||
|
|
||||||
|
// ldcol.8 rd, rs, rt, n=8, hi=0
// Gather one byte per step into consecutive lanes of rd. Every load reads
// the byte at [rs] and then post-increments rs by rt.
//   n == 8         : lanes 0-7 are filled (eight loads)
//   n != 8, hi = 0 : only lanes 0-3 are filled
//   n != 8, hi = 1 : only lanes 4-7 are filled
// Lanes that are not loaded keep their previous contents.
.macro ldcol.8 rd, rs, rt, n=8, hi=0
.if \n == 8 || \hi == 0
    vld1.8      {\rd[0]}, [\rs], \rt
    vld1.8      {\rd[1]}, [\rs], \rt
    vld1.8      {\rd[2]}, [\rs], \rt
    vld1.8      {\rd[3]}, [\rs], \rt
.endif
.if \n == 8 || \hi == 1
    vld1.8      {\rd[4]}, [\rs], \rt
    vld1.8      {\rd[5]}, [\rs], \rt
    vld1.8      {\rd[6]}, [\rs], \rt
    vld1.8      {\rd[7]}, [\rs], \rt
.endif
.endm
|
||||||
|
|
||||||
|
// ldcol.16 rd1, rd2, rs, rt, ru
// Gather sixteen bytes spaced rt apart: steps 0-7 go to lanes 0-7 of rd1
// (read through rs) and steps 8-15 go to lanes 0-7 of rd2 (read through ru,
// which is first set to rs + 8*rt). The two streams are interleaved; both
// rs and ru are post-incremented by rt after each load.
.macro ldcol.16 rd1, rd2, rs, rt, ru
    add         \ru, \rs, \rt, lsl #3
    vld1.8      {\rd1[0]}, [\rs], \rt
    vld1.8      {\rd2[0]}, [\ru], \rt
    vld1.8      {\rd1[1]}, [\rs], \rt
    vld1.8      {\rd2[1]}, [\ru], \rt
    vld1.8      {\rd1[2]}, [\rs], \rt
    vld1.8      {\rd2[2]}, [\ru], \rt
    vld1.8      {\rd1[3]}, [\rs], \rt
    vld1.8      {\rd2[3]}, [\ru], \rt
    vld1.8      {\rd1[4]}, [\rs], \rt
    vld1.8      {\rd2[4]}, [\ru], \rt
    vld1.8      {\rd1[5]}, [\rs], \rt
    vld1.8      {\rd2[5]}, [\ru], \rt
    vld1.8      {\rd1[6]}, [\rs], \rt
    vld1.8      {\rd2[6]}, [\ru], \rt
    vld1.8      {\rd1[7]}, [\rs], \rt
    vld1.8      {\rd2[7]}, [\ru], \rt
.endm
|
||||||
|
|
||||||
|
// add16x8 dq, dl, dh, rl, rh
// Sum the sixteen unsigned bytes held in rl and rh. The bytes are widened
// pairwise into the eight u16 lanes of dq, then reduced; on exit every u16
// lane of dl holds the total. The call sites in this file pass dl/dh as the
// low/high halves of dq, which the reduction relies on.
.macro add16x8 dq, dl, dh, rl, rh
    vaddl.u8    \dq, \rl, \rh               // 8 x u16: rl[i] + rh[i]
    vadd.u16    \dl, \dl, \dh               // 4 partial sums
    vpadd.u16   \dl, \dl, \dl               // 2 partial sums (duplicated)
    vpadd.u16   \dl, \dl, \dl               // total in all four lanes
.endm
|
||||||
|
|
||||||
|
|
||||||
|
// void predict_4x4_h_armv6( uint8_t *src )
// 4x4 horizontal prediction: every row is filled with the byte immediately
// to its left (src[y*FDEC_STRIDE - 1]).
//   r0 = src; r1-r3 and ip are scratch.
// Hand-written because gcc doesn't believe in using the free shift in add:
// the byte is splatted to a word with two shifted adds (x |= x<<8, x |= x<<16).
function predict_4x4_h_armv6
    ldrb        r1, [r0, #0*FDEC_STRIDE-1]
    ldrb        r2, [r0, #1*FDEC_STRIDE-1]
    ldrb        r3, [r0, #2*FDEC_STRIDE-1]
    ldrb        ip, [r0, #3*FDEC_STRIDE-1]
    add         r1, r1, r1, lsl #8          // byte -> halfword
    add         r2, r2, r2, lsl #8
    add         r3, r3, r3, lsl #8
    add         ip, ip, ip, lsl #8
    add         r1, r1, r1, lsl #16         // halfword -> word, then store the row
    str         r1, [r0, #0*FDEC_STRIDE]
    add         r2, r2, r2, lsl #16
    str         r2, [r0, #1*FDEC_STRIDE]
    add         r3, r3, r3, lsl #16
    str         r3, [r0, #2*FDEC_STRIDE]
    add         ip, ip, ip, lsl #16
    str         ip, [r0, #3*FDEC_STRIDE]
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_4x4_v_armv6( uint8_t *src )
// 4x4 vertical prediction: the four bytes of the row above the block are
// copied into each of the four rows.
//   r0 = src; r1 is scratch.
function predict_4x4_v_armv6
    ldr         r1, [r0, #0 - 1 * FDEC_STRIDE]     // word from the row above
    str         r1, [r0, #0 + 0 * FDEC_STRIDE]
    str         r1, [r0, #0 + 1 * FDEC_STRIDE]
    str         r1, [r0, #0 + 2 * FDEC_STRIDE]
    str         r1, [r0, #0 + 3 * FDEC_STRIDE]
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_4x4_dc_armv6( uint8_t *src )
// 4x4 DC prediction: dc = (sum of the 4 bytes above + sum of the 4 bytes to
// the left + 4) >> 3, written to all sixteen pixels.
//   r0 = src; r1-r3 and ip are scratch.
function predict_4x4_dc_armv6
    mov         ip, #0
    ldr         r1, [r0, #-FDEC_STRIDE]            // four bytes above
    ldrb        r2, [r0, #0*FDEC_STRIDE-1]
    ldrb        r3, [r0, #1*FDEC_STRIDE-1]
    usad8       r1, r1, ip                         // |x - 0| summed = sum of the top bytes
    add         r2, r2, #4                         // rounding term
    ldrb        ip, [r0, #2*FDEC_STRIDE-1]
    add         r2, r2, r3
    ldrb        r3, [r0, #3*FDEC_STRIDE-1]
    add         r2, r2, ip
    add         r2, r2, r3                         // r2 = left sum + 4
    add         r1, r1, r2
    lsr         r1, r1, #3                         // dc
    add         r1, r1, r1, lsl #8                 // splat dc across the word
    add         r1, r1, r1, lsl #16
    str         r1, [r0, #0*FDEC_STRIDE]
    str         r1, [r0, #1*FDEC_STRIDE]
    str         r1, [r0, #2*FDEC_STRIDE]
    str         r1, [r0, #3*FDEC_STRIDE]
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_4x4_dc_top_neon( uint8_t *src )
// 4x4 DC prediction from the top edge only: dc = (sum of the 4 bytes
// above + 2) >> 2, written to all sixteen pixels.
//   r0 = src (advanced by 4 rows on exit); r1, r12 and d1 are scratch.
function predict_4x4_dc_top_neon
    mov         r12, #FDEC_STRIDE
    sub         r1, r0, #FDEC_STRIDE
    vld1.32     {d1[]}, [r1,:32]                   // top word, duplicated in both halves
    vpaddl.u8   d1, d1                             // pair sums as u16
    vpadd.u16   d1, d1, d1                         // every lane = sum of the 4 top bytes
    vrshr.u16   d1, d1, #2                         // rounded divide by 4
    vdup.8      d1, d1[0]
    vst1.32     {d1[0]}, [r0,:32], r12
    vst1.32     {d1[0]}, [r0,:32], r12
    vst1.32     {d1[0]}, [r0,:32], r12
    vst1.32     {d1[0]}, [r0,:32], r12
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
// Two independent 3-tap filters applied to four packed bytes at a time:
//   a1 = (a1 + 2*b1 + c1 + 2) >> 2      a2 = (a2 + 2*b2 + c2 + 2) >> 2
// pb_1 must hold 0x01 in every byte. c1 and c2 are clobbered.
// Computed as t = (a+c)>>1, then the rounded-up average of t and b, which is
// formed as ((t+b)>>1) + ((t^b)&1) so that no byte ever overflows.
.macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
    uhadd8      \a1, \a1, \c1               // t = (a + c) >> 1
    uhadd8      \a2, \a2, \c2
    uhadd8      \c1, \a1, \b1               // (t + b) >> 1
    uhadd8      \c2, \a2, \b2
    eor         \a1, \a1, \b1               // low bit of t^b is the lost rounding bit
    eor         \a2, \a2, \b2
    and         \a1, \a1, \pb_1
    and         \a2, \a2, \pb_1
    uadd8       \a1, \a1, \c1               // add the rounding bit back per byte
    uadd8       \a2, \a2, \c2
.endm
|
||||||
|
|
||||||
|
// void predict_4x4_ddr_armv6( uint8_t *src )
// 4x4 diagonal-down-right prediction using packed-byte arithmetic.
//   r0 = src; r1-r3 and ip are scratch, r4-r6 are saved and restored.
// The top row, the top-left pixel and the four left pixels are packed into
// five words, each shifted by one edge sample relative to the previous one,
// so that PRED4x4_LOWPASS filters whole rows of neighbours at once.
function predict_4x4_ddr_armv6
    ldr         r1, [r0, # -FDEC_STRIDE]           // t0 t1 t2 t3
    ldrb        r2, [r0, # -FDEC_STRIDE-1]         // top-left
    ldrb        r3, [r0, #0*FDEC_STRIDE-1]         // l0
    push        {r4-r6,lr}
    add         r2, r2, r1, lsl #8                 // lt t0 t1 t2
    ldrb        r4, [r0, #1*FDEC_STRIDE-1]         // l1
    add         r3, r3, r2, lsl #8                 // l0 lt t0 t1
    ldrb        r5, [r0, #2*FDEC_STRIDE-1]         // l2
    ldrb        r6, [r0, #3*FDEC_STRIDE-1]         // l3
    add         r4, r4, r3, lsl #8                 // l1 l0 lt t0
    add         r5, r5, r4, lsl #8                 // l2 l1 l0 lt
    add         r6, r6, r5, lsl #8                 // l3 l2 l1 l0
    ldr         ip, =0x01010101
    PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip     // r1 = row 0, r4 = filtered left part
    str         r1, [r0, #0*FDEC_STRIDE]
    lsl         r2, r1, #8
    lsl         r3, r1, #16
    lsl         r4, r4, #8                         // drop the byte that duplicates row 0
    lsl         r5, r1, #24
    add         r2, r2, r4, lsr #24                // row 1 = row 0 shifted, 1 new byte
    str         r2, [r0, #1*FDEC_STRIDE]
    add         r3, r3, r4, lsr #16                // row 2: 2 new bytes
    str         r3, [r0, #2*FDEC_STRIDE]
    add         r5, r5, r4, lsr #8                 // row 3: 3 new bytes
    str         r5, [r0, #3*FDEC_STRIDE]
    pop         {r4-r6,pc}
endfunc
|
||||||
|
|
||||||
|
// void predict_4x4_ddl_neon( uint8_t *src )
// 4x4 diagonal-down-left prediction. The eight bytes t0..t7 of the row above
// are low-pass filtered as (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2, with t7
// repeated past the end; row y is the filtered vector starting at lane y.
//   r0 = src (advanced by 4 rows on exit); ip and d0-d3 are scratch.
function predict_4x4_ddl_neon
    sub         r0, r0, #FDEC_STRIDE
    mov         ip, #FDEC_STRIDE
    vld1.64     {d0}, [r0], ip                     // t0..t7; r0 is back on row 0
    vdup.8      d3, d0[7]
    vext.8      d1, d0, d0, #1                     // t[i+1]
    vext.8      d2, d0, d3, #2                     // t[i+2], padded with t7
    vhadd.u8    d0, d0, d2
    vrhadd.u8   d0, d0, d1                         // filtered edge
    vst1.32     {d0[0]}, [r0,:32], ip
    vext.8      d1, d0, d0, #1
    vext.8      d2, d0, d0, #2
    vst1.32     {d1[0]}, [r0,:32], ip
    vext.8      d3, d0, d0, #3
    vst1.32     {d2[0]}, [r0,:32], ip
    vst1.32     {d3[0]}, [r0,:32], ip
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] )
// 8x8 DC prediction: dc = (edge[7..14] + edge[16..23] summed + 8) >> 4,
// written to all 64 pixels. edge[15] is loaded with its neighbours but
// shifted out before summing.
//   r0 = src (advanced by 8 rows on exit), r1 = edge
//   r2, r3, ip and d0 are scratch; r4, r5 and lr are saved and restored.
function predict_8x8_dc_neon
    mov         ip, #0
    ldrd        r2, r3, [r1, #8]                   // edge[8..15]
    push        {r4-r5,lr}
    ldrd        r4, r5, [r1, #16]                  // edge[16..23]
    lsl         r3, r3, #8                         // discard edge[15] (top byte)
    ldrb        lr, [r1, #7]                       // edge[7]
    usad8       r2, r2, ip                         // byte sums via SAD against zero
    usad8       r3, r3, ip
    usada8      r2, r4, ip, r2
    add         lr, lr, #8                         // rounding term
    usada8      r3, r5, ip, r3
    add         r2, r2, lr
    mov         ip, #FDEC_STRIDE
    add         r2, r2, r3
    lsr         r2, r2, #4

    vdup.8      d0, r2
.rept 8
    vst1.64     {d0}, [r0,:64], ip
.endr
    pop         {r4-r5,pc}
endfunc
|
||||||
|
|
||||||
|
// void predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] )
// 8x8 horizontal prediction: row y is filled with edge[14 - y], i.e. the
// eight bytes edge[7..14] are consumed from last to first.
//   r0 = src (advanced by 8 rows on exit), r1 = edge
//   ip, d0-d7 and d16 are scratch.
function predict_8x8_h_neon
    add         r1, r1, #7
    mov         ip, #FDEC_STRIDE
    vld1.64     {d16}, [r1]                        // edge[7..14]
    vdup.8      d0, d16[7]
    vdup.8      d1, d16[6]
    vst1.64     {d0}, [r0,:64], ip
    vdup.8      d2, d16[5]
    vst1.64     {d1}, [r0,:64], ip
    vdup.8      d3, d16[4]
    vst1.64     {d2}, [r0,:64], ip
    vdup.8      d4, d16[3]
    vst1.64     {d3}, [r0,:64], ip
    vdup.8      d5, d16[2]
    vst1.64     {d4}, [r0,:64], ip
    vdup.8      d6, d16[1]
    vst1.64     {d5}, [r0,:64], ip
    vdup.8      d7, d16[0]
    vst1.64     {d6}, [r0,:64], ip
    vst1.64     {d7}, [r0,:64], ip
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] )
// 8x8 vertical prediction: the eight bytes edge[16..23] are copied into each
// of the eight rows.
//   r0 = src (advanced by 8 rows on exit), r1 = edge; r12 and d0 are scratch.
function predict_8x8_v_neon
    add         r1, r1, #16
    mov         r12, #FDEC_STRIDE
    vld1.8      {d0}, [r1,:64]
.rept 8
    vst1.8      {d0}, [r0,:64], r12
.endr
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] )
// 8x8 diagonal-down-left prediction from the sixteen bytes t0..t15 at
// edge[16..31]. Each lane i of the filtered vector is
// (t[i-1] + 2*t[i] + t[i+1] + 2) >> 2, with t15 repeated past the end;
// row y is the eight filtered lanes starting at lane y+1. Lane 0 (which
// mixes in a zero) is never stored.
//   r0 = src (advanced by 8 rows on exit), r1 = edge
//   r12, q0-q3 and q8 are scratch.
function predict_8x8_ddl_neon
    add         r1, r1, #16
    vld1.8      {d0, d1}, [r1,:128]                // t0..t15
    vmov.i8     q3, #0
    vrev64.8    d2, d1                             // d2[0] = t15, used as right padding
    vext.8      q8, q3, q0, #15                    // t[i-1]
    vext.8      q2, q0, q1, #1                     // t[i+1]
    vhadd.u8    q8, q8, q2
    mov         r12, #FDEC_STRIDE
    vrhadd.u8   q0, q0, q8                         // filtered edge
    vext.8      d2, d0, d1, #1
    vext.8      d3, d0, d1, #2
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #3
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d3, d0, d1, #4
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #5
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d3, d0, d1, #6
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #7
    vst1.8      {d3}, [r0,:64], r12
    vst1.8      {d2}, [r0,:64], r12
    vst1.8      {d1}, [r0,:64], r12
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] )
// 8x8 diagonal-down-right prediction. edge[0..31] is loaded and the 3-tap
// filter (e[k-1] + 2*e[k] + e[k+1] + 2) >> 2 is evaluated for k = 8..23.
// Rows are written bottom-up: row 7 starts at k = 8 and each row above it
// starts one sample later.
//   r0 = src (left one row above the block on exit), r1 = edge
//   r12 and q0-q3 are scratch.
function predict_8x8_ddr_neon
    vld1.8      {d0-d3}, [r1,:128]                 // edge[0..31]
    vext.8      q2, q0, q1, #7                     // e[k-1]
    vext.8      q3, q0, q1, #9                     // e[k+1]

    vhadd.u8    q2, q2, q3
    vrhadd.u8   d0, d1, d4                         // filtered, k = 8..15
    vrhadd.u8   d1, d2, d5                         // filtered, k = 16..23

    add         r0, r0, #7*FDEC_STRIDE             // start at the bottom row
    mov         r12, #-1*FDEC_STRIDE               // and walk upwards

    vext.8      d2, d0, d1, #1
    vst1.8      {d0}, [r0,:64], r12
    vext.8      d4, d0, d1, #2
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d5, d0, d1, #3
    vst1.8      {d4}, [r0,:64], r12
    vext.8      d4, d0, d1, #4
    vst1.8      {d5}, [r0,:64], r12
    vext.8      d5, d0, d1, #5
    vst1.8      {d4}, [r0,:64], r12
    vext.8      d4, d0, d1, #6
    vst1.8      {d5}, [r0,:64], r12
    vext.8      d5, d0, d1, #7
    vst1.8      {d4}, [r0,:64], r12
    vst1.8      {d5}, [r0,:64], r12
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] )
// 8x8 vertical-left prediction from t0..t15 at edge[16..31]. Two vectors are
// built: the 2-tap rounded average of t[i] and t[i+1] (q3) and the 3-tap
// filter centred on t[i] (q0). Even rows take the 2-tap vector and odd rows
// the 3-tap vector, each starting one lane further along every two rows.
// The vext instructions pull one byte from the not-yet-written q1/q2; those
// lanes (0 and 15) are never part of a stored row.
//   r0 = src (advanced by 8 rows on exit), r1 = edge
//   r12 and q0-q3 are scratch.
function predict_8x8_vl_neon
    add         r1, r1, #16
    mov         r12, #FDEC_STRIDE

    vld1.8      {d0, d1}, [r1,:128]                // t0..t15
    vext.8      q1, q1, q0, #15                    // t[i-1] (lane 0 undefined)
    vext.8      q2, q0, q2, #1                     // t[i+1] (lane 15 undefined)

    vrhadd.u8   q3, q0, q2                         // 2-tap average

    vhadd.u8    q1, q1, q2
    vrhadd.u8   q0, q0, q1                         // 3-tap filter

    vext.8      d2, d0, d1, #1
    vst1.8      {d6}, [r0,:64], r12
    vext.8      d3, d6, d7, #1
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #2
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d3, d6, d7, #2
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #3
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d3, d6, d7, #3
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #4
    vst1.8      {d3}, [r0,:64], r12
    vst1.8      {d2}, [r0,:64], r12
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] )
// 8x8 vertical-right prediction from the sixteen bytes at edge[8..23].
// q2 becomes the 2-tap rounded average of each byte with its predecessor and
// q0 the 3-tap filter centred on the predecessor. Row 0 is the upper half of
// the 2-tap vector and row 1 the upper half of the 3-tap vector; each later
// pair of rows is the pair above shifted right by one pixel, with the new
// left pixels taken from the de-interleaved low half of the 3-tap vector.
//   r0 = src (advanced by 8 rows on exit), r1 = edge
//   r12 and q0-q3 are scratch.
function predict_8x8_vr_neon
    add         r1, r1, #8
    mov         r12, #FDEC_STRIDE
    vld1.8      {d4,d5}, [r1,:64]                  // edge[8..23]

    vext.8      q1, q2, q2, #14                    // two samples back (rotated)
    vext.8      q0, q2, q2, #15                    // one sample back (rotated)

    vhadd.u8    q3, q2, q1
    vrhadd.u8   q2, q2, q0                         // 2-tap average
    vrhadd.u8   q0, q0, q3                         // 3-tap filter

    vmov        d2, d0

    vst1.8      {d5}, [r0,:64], r12                // row 0
    vuzp.8      d2, d0                             // d2 = even lanes, d0 = odd lanes
    vst1.8      {d1}, [r0,:64], r12                // row 1
    vext.8      d6, d0, d5, #7
    vext.8      d3, d2, d1, #7
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d6, d0, d5, #6
    vext.8      d3, d2, d1, #6
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d6, d0, d5, #5
    vext.8      d3, d2, d1, #5
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d3}, [r0,:64], r12
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] )
// 8x8 horizontal-down prediction from the sixteen bytes at edge[7..22].
// q8 holds the 2-tap rounded average of neighbouring samples and q0 the
// 3-tap filter centred one sample later. The low halves of the two are
// interleaved (vzip) and rows are cut from the resulting byte stream in
// steps of two bytes: row 7 starts at the beginning, row 0 furthest along.
//   r0 = src (advanced by 8 rows on exit), r1 = edge
//   r12, q0-q3 and q8 are scratch.
function predict_8x8_hd_neon
    mov         r12, #FDEC_STRIDE
    add         r1, r1, #7

    vld1.8      {d2,d3}, [r1]                      // edge[7..22]
    vext.8      q3, q1, q1, #1                     // next sample
    vext.8      q2, q1, q1, #2                     // sample after next

    vrhadd.u8   q8, q1, q3                         // 2-tap average

    vhadd.u8    q1, q1, q2
    vrhadd.u8   q0, q1, q3                         // 3-tap filter

    vzip.8      d16, d0                            // interleave 2-tap / 3-tap results

    vext.8      d2, d0, d1, #6
    vext.8      d3, d0, d1, #4
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #2
    vst1.8      {d3}, [r0,:64], r12
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d16, d0, #6
    vst1.8      {d0}, [r0,:64], r12
    vext.8      d3, d16, d0, #4
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d16, d0, #2
    vst1.8      {d3}, [r0,:64], r12
    vst1.8      {d2}, [r0,:64], r12
    vst1.8      {d16}, [r0,:64], r12

    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] )
// 8x8 horizontal-up prediction from the eight bytes edge[7..14], taken in
// reverse order (edge[14] first) and padded at the end with edge[7].
// 2-tap averages and 3-tap filtered values are interleaved into a 16-byte
// stream, which is then extended with copies of its last byte pair; row y is
// eight bytes of that stream starting at byte 2*y.
//   r0 = src (advanced by 7 rows on exit), r1 = edge
//   r12, q0-q3 and q8 are scratch.
function predict_8x8_hu_neon
    mov         r12, #FDEC_STRIDE
    add         r1, r1, #7
    vld1.8      {d7}, [r1]                         // edge[7..14]
    vdup.8      d6, d7[0]                          // padding value edge[7]
    vrev64.8    d7, d7                             // edge[14] .. edge[7]

    vext.8      d4, d7, d6, #2                     // two samples ahead
    vext.8      d2, d7, d6, #1                     // one sample ahead

    vhadd.u8    d16, d7, d4
    vrhadd.u8   d0, d2, d7                         // 2-tap average
    vrhadd.u8   d1, d16, d2                        // 3-tap filter

    vzip.8      d0, d1                             // q0 = interleaved stream

    vdup.16     q1, d1[3]                          // last byte pair, replicated

    vext.8      q2, q0, q1, #2
    vext.8      q3, q0, q1, #4
    vext.8      q8, q0, q1, #6
    vst1.8      {d0}, [r0,:64], r12
    vst1.8      {d4}, [r0,:64], r12
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d16}, [r0,:64], r12

    vst1.8      {d1}, [r0,:64], r12
    vst1.8      {d5}, [r0,:64], r12
    vst1.8      {d7}, [r0,:64], r12
    vst1.8      {d17}, [r0,:64]
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_8x8c_dc_top_neon( uint8_t *src )
// 8x8 chroma DC prediction from the top edge only. The left four columns get
// (t0+t1+t2+t3 + 2) >> 2 and the right four columns (t4+..+t7 + 2) >> 2, for
// all eight rows. Tail-branches into pred8x8_dc_end, which expects
// r0 = src, r1 = FDEC_STRIDE, d0 = pattern for rows 0-3, d1 = rows 4-7.
//   r0 = src; r1, r2 and q0 are scratch.
function predict_8x8c_dc_top_neon
    sub         r2, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    vld1.8      {d0}, [r2,:64]                     // t0..t7
    vpaddl.u8   d0, d0
    vpadd.u16   d0, d0, d0                         // lane 0 = t0..t3 sum, lane 1 = t4..t7 sum
    vrshrn.u16  d0, q0, #2                         // rounded /4, narrowed to bytes
    vdup.8      d1, d0[1]
    vdup.8      d0, d0[0]
    vtrn.32     d0, d1                             // both = left dc x4, right dc x4
    b           pred8x8_dc_end
endfunc
|
||||||
|
|
||||||
|
// void predict_8x8c_dc_left_neon( uint8_t *src )
// 8x8 chroma DC prediction from the left edge only. Rows 0-3 are filled with
// (l0+l1+l2+l3 + 2) >> 2 and rows 4-7 with (l4+..+l7 + 2) >> 2.
// Tail-branches into pred8x8_dc_end with d0 = rows 0-3, d1 = rows 4-7.
//   r0 = src; r1, r2 and q0 are scratch.
function predict_8x8c_dc_left_neon
    mov         r1, #FDEC_STRIDE
    sub         r2, r0, #1
    ldcol.8     d0, r2, r1                         // l0..l7
    vpaddl.u8   d0, d0
    vpadd.u16   d0, d0, d0                         // lane 0 = l0..l3 sum, lane 1 = l4..l7 sum
    vrshrn.u16  d0, q0, #2
    vdup.8      d1, d0[1]
    vdup.8      d0, d0[0]
    b           pred8x8_dc_end
endfunc
|
||||||
|
|
||||||
|
// void predict_8x8c_dc_neon( uint8_t *src )
// 8x8 chroma DC prediction using both edges, one DC per 4x4 quadrant.
// With T0/T1 the sums of the left/right four top pixels and L0/L1 the sums
// of the upper/lower four left pixels:
//   top-left     = (T0 + L0 + 4) >> 3      top-right    = (T1 + 2) >> 2
//   bottom-left  = (L1 + 2) >> 2           bottom-right = (T1 + L1 + 4) >> 3
// The shared tail pred8x8_dc_end (also entered from the dc_top and dc_left
// variants) stores d0 to rows 0-3 and d1 to rows 4-7, with r1 = FDEC_STRIDE.
//   r0 = src (advanced by 4 rows on exit); r1, r2 and q0-q2 are scratch.
function predict_8x8c_dc_neon
    sub         r2, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    vld1.8      {d0}, [r2,:64]                     // t0..t7
    sub         r2, r0, #1
    ldcol.8     d1, r2, r1                         // l0..l7
    vtrn.32     d0, d1                             // d0 = t0-3,l0-3   d1 = t4-7,l4-7
    vpaddl.u8   q0, q0
    vpadd.u16   d0, d0, d1                         // T0, L0, T1, L1
    vpadd.u16   d1, d0, d0                         // T0+L0, T1+L1, ...
    vrshrn.u16  d2, q0, #3                         // two-edge DCs live in bytes 4-5
    vrshrn.u16  d3, q0, #2                         // one-edge DCs live in bytes 0-3
    vdup.8      d0, d2[4]                          // top-left
    vdup.8      d1, d3[3]                          // bottom-left
    vdup.8      d4, d3[2]                          // top-right
    vdup.8      d5, d2[5]                          // bottom-right
    vtrn.32     q0, q2                             // d0 = TL,TR   d1 = BL,BR
pred8x8_dc_end:
    add         r2, r0, r1, lsl #2                 // r2 = row 4
.rept 4
    vst1.8      {d0}, [r0,:64], r1
    vst1.8      {d1}, [r2,:64], r1
.endr
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_8x8c_h_neon( uint8_t *src )
// 8x8 chroma horizontal prediction: each of the eight rows is filled with
// the byte immediately to its left. Two rows are handled per iteration.
//   r0 = src (advanced by 8 rows on exit); r1, ip, d0 and d2 are scratch.
function predict_8x8c_h_neon
    sub         r1, r0, #1
    mov         ip, #FDEC_STRIDE
.rept 4
    vld1.8      {d0[]}, [r1], ip                   // load and splat the left byte
    vld1.8      {d2[]}, [r1], ip
    vst1.64     {d0}, [r0,:64], ip
    vst1.64     {d2}, [r0,:64], ip
.endr
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_8x8c_v_neon( uint8_t *src )
// 8x8 chroma vertical prediction: the eight bytes of the row above the block
// are copied into each of the eight rows.
//   r0 = src (advanced by 8 rows on exit); ip and d0 are scratch.
function predict_8x8c_v_neon
    sub         r0, r0, #FDEC_STRIDE
    mov         ip, #FDEC_STRIDE
    vld1.64     {d0}, [r0,:64], ip                 // row above; r0 is back on row 0
.rept 8
    vst1.64     {d0}, [r0,:64], ip
.endr
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_8x8c_p_neon( uint8_t *src )
// 8x8 chroma plane prediction. With t[] the row above, l[] the column to the
// left and lt the top-left pixel (t[-1] = l[-1] = lt):
//   H = sum_{i=1..4} i * (t[3+i] - t[3-i])     V = same on l[]
//   b = (17*H + 16) >> 5                       c = (17*V + 16) >> 5
//   pix(x,y) = clip_u8( (16*(t7 + l7 + 1) - 3*(b + c) + b*x + c*y) >> 5 )
//   r0 = src (advanced by 8 rows on exit)
//   r1-r3, q0-q3 and q8 are scratch.
function predict_8x8c_p_neon
    sub         r3, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    add         r2, r3, #4
    sub         r3, r3, #1                         // r3 -> top-left pixel
    vld1.32     {d0[0]}, [r3]                      // lt t0 t1 t2
    vld1.32     {d2[0]}, [r2,:32], r1              // t4 t5 t6 t7
    ldcol.8     d0, r3, r1, 4, hi=1                // lanes 4-7: lt l0 l1 l2
    add         r3, r3, r1                         // skip l3
    ldcol.8     d3, r3, r1, 4                      // lanes 0-3: l4 l5 l6 l7
    vaddl.u8    q8, d2, d3                         // lane 3 = t7 + l7
    vrev32.8    d0, d0                             // t2 t1 t0 lt | l2 l1 l0 lt
    vtrn.32     d2, d3                             // d2 = t4..t7 | l4..l7
    vsubl.u8    q2, d2, d0                         // d4 = top diffs, d5 = left diffs
    movrel      r3, p16weight
    vld1.16     {q0}, [r3,:128]                    // 1..8
    vmul.s16    d4, d4, d0
    vmul.s16    d5, d5, d0
    vpadd.i16   d4, d4, d5
    vpaddl.s16  d4, d4                             // s32: H, V
    vshl.i32    d5, d4, #4
    vadd.s32    d4, d4, d5                         // 17*H, 17*V
    vrshrn.s32  d4, q2, #5                         // d4[0] = b, d4[1] = c
    mov         r3, #0
    vtrn.16     d4, d5                             // d4[0] = b, d5[0] = c
    vadd.i16    d2, d4, d5                         // b + c
    vshl.i16    d3, d2, #2
    vrev64.16   d16, d16                           // d16[0] = t7 + l7
    vsub.i16    d3, d3, d2                         // 3 * (b + c)
    vadd.i16    d16, d16, d0                       // d16[0] = t7 + l7 + 1
    vshl.i16    d2, d16, #4                        // 16 * (t7 + l7 + 1)
    vsub.i16    d2, d2, d3                         // d2[0] = value at (0,0) before the shift
    vext.16     q0, q0, q0, #7
    vmov.16     d0[0], r3                          // q0 = 0, 1, .., 7
    vmul.i16    q0, q0, d4[0]                      // b * x
    vdup.16     q1, d2[0]
    vdup.16     q3, d5[0]                          // per-row increment c
    vadd.i16    q1, q1, q0                         // row 0 accumulators
    mov         r3, #8
1:
    vqshrun.s16 d0, q1, #5                         // >> 5 with saturation to u8
    vadd.i16    q1, q1, q3
    vst1.8      {d0}, [r0,:64], r1
    subs        r3, r3, #1
    bne         1b
    bx          lr
endfunc
|
||||||
|
|
||||||
|
|
||||||
|
// void predict_8x16c_dc_top_neon( uint8_t *src )
// 8x16 chroma DC prediction from the top edge only. In all sixteen rows the
// left four columns get (t0+t1+t2+t3 + 2) >> 2 and the right four columns
// (t4+..+t7 + 2) >> 2.
//   r0 = src (advanced by 12 rows on exit); r1, r2 and q0 are scratch.
function predict_8x16c_dc_top_neon
    sub         r2, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    vld1.8      {d0}, [r2,:64]                     // t0..t7
    vpaddl.u8   d0, d0
    vpadd.u16   d0, d0, d0                         // lane 0 = t0..t3 sum, lane 1 = t4..t7 sum
    vrshrn.u16  d0, q0, #2
    vdup.8      d1, d0[1]
    vdup.8      d0, d0[0]
    vtrn.32     d0, d1                             // both = left dc x4, right dc x4

    add         r2, r0, r1, lsl #2                 // r2 = row 4
.rept 4
    vst1.8      {d0}, [r0,:64], r1                 // rows 0-3
    vst1.8      {d1}, [r2,:64], r1                 // rows 4-7
.endr
    add         r2, r2, r1, lsl #2                 // r2 = row 12
    add         r0, r0, r1, lsl #2                 // r0 = row 8
.rept 4
    vst1.8      {d0}, [r0,:64], r1                 // rows 8-11
    vst1.8      {d1}, [r2,:64], r1                 // rows 12-15
.endr
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_8x16c_h_neon( uint8_t *src )
// 8x16 chroma horizontal prediction: each of the sixteen rows is filled with
// the byte immediately to its left. Two rows are handled per iteration.
//   r0 = src (advanced by 16 rows on exit); r1, ip, d0 and d2 are scratch.
function predict_8x16c_h_neon
    sub         r1, r0, #1
    mov         ip, #FDEC_STRIDE
.rept 8
    vld1.8      {d0[]}, [r1], ip                   // load and splat the left byte
    vld1.8      {d2[]}, [r1], ip
    vst1.64     {d0}, [r0,:64], ip
    vst1.64     {d2}, [r0,:64], ip
.endr
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_8x16c_p_neon( uint8_t *src )
// 8x16 chroma plane prediction. With t[] the row above, l[] the column to
// the left and lt the top-left pixel (t[-1] = l[-1] = lt):
//   H = sum_{i=1..4} i * (t[3+i] - t[3-i])     V = sum_{i=1..8} i * (l[7+i] - l[7-i])
//   b = (17*H + 16) >> 5                       c = (5*V + 32) >> 6
//   pix(x,y) = clip_u8( (16*(t7 + l15 + 1) - 3*b - 7*c + b*x + c*y) >> 5 )
//   r0 = src (advanced by 16 rows on exit)
//   r1-r3, q0-q3 and q8 are scratch.
function predict_8x16c_p_neon
    sub         r3, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    add         r2, r3, #4
    sub         r3, r3, #1                         @ r3 -> top-left pixel
    vld1.32     {d0[0]}, [r3]                      @ lt t0 t1 t2
    vld1.32     {d2[0]}, [r2,:32], r1              @ t4 t5 t6 t7
    ldcol.8     d1, r3, r1                         @ lt l0 .. l6
    add         r3, r3, r1                         @ skip l7
    ldcol.8     d3, r3, r1                         @ l8 .. l15
    vrev64.32   d16, d3                            @ l12..l15 | l8..l11
    vaddl.u8    q8, d2, d16                        @ lane 3 = t7 + l15
    vrev32.8    d0, d0                             @ t2 t1 t0 lt
    vsubl.u8    q2, d2, d0                         @ d4 = top differences
    vrev64.8    d1, d1                             @ l6 .. l0 lt
    vsubl.u8    q3, d3, d1                         @ q3 = left differences
    movrel      r3, p16weight
    vld1.16     {q0}, [r3,:128]                    @ 1..8
    vmul.s16    d4, d4, d0
    vmul.s16    q3, q3, q0
    vpadd.i16   d4, d4, d5
    vpadd.i16   d6, d6, d7
    vpaddl.s16  d4, d4                             @ d4[0] = H
    vpaddl.s16  d6, d6
    vpadd.s32   d6, d6, d6                         @ d6[0] = V
    vshl.i32    d5, d4, #4
    vadd.s32    d4, d4, d5                         @ d4[0] = 17*H
    vshl.i32    d7, d6, #2
    vrshrn.s32  d4, q2, #5                         @ d4[0] = b
    vadd.s32    d6, d6, d7                         @ d6[0] = 5*V
    vrshrn.s32  d6, q3, #6                         @ d6[0] = c
    mov         r3, #0
    vshl.i16    d3, d4, #2
    vsub.i16    d3, d3, d4                         @ d3[0] = 3 * b
    vshl.i16    d2, d6, #3                         @ d2[0] = 8 * c
    vadd.i16    d3, d3, d2                         @ d3[0] = 3 * b + 8 * c
    vsub.i16    d3, d3, d6                         @ d3[0] = 3 * b + 7 * c
    vrev64.16   d16, d16                           @ d16[0] = t7 + l15
    vadd.i16    d16, d16, d0                       @ d16[0] = t7 + l15 + 1
    vshl.i16    d2, d16, #4                        @ d2[0] = a + 16
    vsub.i16    d2, d2, d3                         @ d2[0] = i00
    vext.16     q0, q0, q0, #7
    vmov.16     d0[0], r3                          @ q0 = 0, 1, .., 7
    vmul.i16    q0, q0, d4[0]                      @ b * x
    vdup.16     q1, d2[0]
    vdup.16     q3, d6[0]                          @ per-row increment c
    vadd.i16    q1, q1, q0                         @ row 0 accumulators
    mov         r3, #16
1:
    vqshrun.s16 d0, q1, #5                         @ >> 5 with saturation to u8
    vadd.i16    q1, q1, q3
    vst1.8      {d0}, [r0,:64], r1
    subs        r3, r3, #1
    bne         1b
    bx          lr
endfunc
|
||||||
|
|
||||||
|
|
||||||
|
// void predict_16x16_dc_top_neon( uint8_t *src )
// 16x16 DC prediction from the top edge only: dc = (sum of the 16 bytes
// above + 8) >> 4. Tail-branches into pred16x16_dc_end, which stores q0 to
// sixteen rows using r0 = src and r1 = FDEC_STRIDE.
//   r0 = src; r1, r2 and q0 are scratch.
function predict_16x16_dc_top_neon
    sub         r2, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    vld1.8      {q0}, [r2,:128]
    add16x8     q0, d0, d1, d0, d1                 // d0 lanes = sum of the 16 top bytes
    vrshrn.u16  d0, q0, #4
    vdup.8      q0, d0[0]
    b           pred16x16_dc_end
endfunc
|
||||||
|
|
||||||
|
// void predict_16x16_dc_left_neon( uint8_t *src )
// 16x16 DC prediction from the left edge only: dc = (sum of the 16 bytes in
// the column to the left + 8) >> 4. Tail-branches into pred16x16_dc_end,
// which stores q0 to sixteen rows using r0 = src and r1 = FDEC_STRIDE.
//   r0 = src; r1, r2 and q0 are scratch.
function predict_16x16_dc_left_neon
    mov         r1, #FDEC_STRIDE
    sub         r2, r0, #1
    ldcol.8     d0, r2, r1                         // left pixels of rows 0-7
    ldcol.8     d1, r2, r1                         // left pixels of rows 8-15
    add16x8     q0, d0, d1, d0, d1
    vrshrn.u16  d0, q0, #4
    vdup.8      q0, d0[0]
    b           pred16x16_dc_end
endfunc
|
||||||
|
|
||||||
|
// void predict_16x16_dc_neon( uint8_t *src )
// 16x16 DC prediction using both edges: dc = (sum of the 16 bytes above +
// sum of the 16 bytes to the left + 16) >> 5. The top row is summed with
// NEON while the left column is loaded and accumulated with interleaved
// integer instructions. The shared tail pred16x16_dc_end (also entered from
// the dc_top and dc_left variants) stores q0 to sixteen rows and expects
// r0 = src and r1 = FDEC_STRIDE.
//   r0 = src (advanced by 16 rows on exit); r1-r3, ip and q0 are scratch.
function predict_16x16_dc_neon
    sub         r3, r0, #FDEC_STRIDE
    sub         r0, r0, #1                         // r0 walks down the left column
    vld1.64     {d0-d1}, [r3,:128]                 // 16 top bytes
    ldrb        ip, [r0], #FDEC_STRIDE             // ip accumulates the left sum
    vaddl.u8    q0, d0, d1
    ldrb        r1, [r0], #FDEC_STRIDE
    vadd.u16    d0, d0, d1
    vpadd.u16   d0, d0, d0
    vpadd.u16   d0, d0, d0                         // d0 lanes = top sum
.rept 4
    ldrb        r2, [r0], #FDEC_STRIDE
    add         ip, ip, r1
    ldrb        r3, [r0], #FDEC_STRIDE
    add         ip, ip, r2
    ldrb        r1, [r0], #FDEC_STRIDE
    add         ip, ip, r3
.endr
    ldrb        r2, [r0], #FDEC_STRIDE
    add         ip, ip, r1
    ldrb        r3, [r0], #FDEC_STRIDE
    add         ip, ip, r2

    sub         r0, r0, #FDEC_STRIDE*16            // rewind the 16 row steps
    add         ip, ip, r3
    vdup.16     d1, ip
    vadd.u16    d0, d0, d1                         // top sum + left sum
    mov         r1, #FDEC_STRIDE
    add         r0, r0, #1                         // r0 = src again
    vrshr.u16   d0, d0, #5
    vdup.8      q0, d0[0]
pred16x16_dc_end:
.rept 16
    vst1.64     {d0-d1}, [r0,:128], r1
.endr
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_16x16_h_neon( uint8_t *src )
// 16x16 horizontal prediction: each of the sixteen rows is filled with the
// byte immediately to its left. Two rows are handled per iteration.
//   r0 = src (advanced by 16 rows on exit); r1, ip, q0 and q1 are scratch.
function predict_16x16_h_neon
    sub         r1, r0, #1
    mov         ip, #FDEC_STRIDE
.rept 8
    vld1.8      {d0[]}, [r1], ip                   // load and splat the left byte
    vmov        d1, d0                             // widen the splat to 16 bytes
    vld1.8      {d2[]}, [r1], ip
    vmov        d3, d2
    vst1.64     {d0-d1}, [r0,:128], ip
    vst1.64     {d2-d3}, [r0,:128], ip
.endr
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_16x16_v_neon( uint8_t *src )
// 16x16 vertical prediction: the sixteen bytes of the row above the block
// are copied into each of the sixteen rows.
//   r0 = src (advanced by 16 rows on exit); ip and q0 are scratch.
function predict_16x16_v_neon
    sub         r0, r0, #FDEC_STRIDE
    mov         ip, #FDEC_STRIDE
    vld1.64     {d0-d1}, [r0,:128], ip             // row above; r0 is back on row 0
.rept 16
    vst1.64     {d0-d1}, [r0,:128], ip
.endr
    bx          lr
endfunc
|
||||||
|
|
||||||
|
// void predict_16x16_p_neon( uint8_t *src )
// 16x16 plane prediction. With t[] the row above, l[] the column to the left
// and lt the top-left pixel (t[-1] = l[-1] = lt):
//   H = sum_{i=1..8} i * (t[7+i] - t[7-i])     V = same on l[]
//   b = (5*H + 32) >> 6                        c = (5*V + 32) >> 6
//   pix(x,y) = clip_u8( (16*(t15 + l15 + 1) - 7*(b + c) + b*x + c*y) >> 5 )
// Each row is produced as two 8-pixel halves: the accumulators advance by
// 8*b between the halves and by c - 8*b to reach the start of the next row.
//   r0 = src (advanced by 16 rows on exit)
//   r1-r3, q0-q3 and q8 are scratch.
function predict_16x16_p_neon
    sub         r3, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    add         r2, r3, #8
    sub         r3, r3, #1                         // r3 -> top-left pixel
    vld1.8      {d0}, [r3]                         // lt t0 .. t6
    vld1.8      {d2}, [r2,:64], r1                 // t8 .. t15
    ldcol.8     d1, r3, r1                         // lt l0 .. l6
    add         r3, r3, r1                         // skip l7
    ldcol.8     d3, r3, r1                         // l8 .. l15
    vrev64.8    q0, q0                             // t6..t0 lt | l6..l0 lt
    vaddl.u8    q8, d2, d3                         // lane 7 = t15 + l15
    vsubl.u8    q2, d2, d0                         // top differences
    vsubl.u8    q3, d3, d1                         // left differences
    movrel      r3, p16weight
    vld1.8      {q0}, [r3,:128]                    // 1..8 (as 16-bit lanes)
    vmul.s16    q2, q2, q0
    vmul.s16    q3, q3, q0
    vadd.i16    d4, d4, d5
    vadd.i16    d5, d6, d7
    vpadd.i16   d4, d4, d5
    vpadd.i16   d4, d4, d4                         // H, V, H, V
    vshll.s16   q3, d4, #2
    vaddw.s16   q2, q3, d4                         // 5*H, 5*V as s32
    vrshrn.s32  d4, q2, #6                         // d4[0] = b, d4[1] = c
    mov         r3, #0
    vtrn.16     d4, d5                             // d4[0] = b, d5[0] = c
    vadd.i16    d2, d4, d5                         // b + c
    vshl.i16    d3, d2, #3
    vrev64.16   d16, d17                           // d16[0] = t15 + l15
    vsub.i16    d3, d3, d2                         // 7 * (b + c)
    vadd.i16    d16, d16, d0                       // d16[0] = t15 + l15 + 1
    vshl.i16    d2, d16, #4                        // 16 * (t15 + l15 + 1)
    vsub.i16    d2, d2, d3                         // d2[0] = value at (0,0) before the shift
    vshl.i16    d3, d4, #4                         // 16 * b
    vext.16     q0, q0, q0, #7
    vsub.i16    d6, d5, d3                         // c - 16*b
    vmov.16     d0[0], r3                          // q0 = 0, 1, .., 7
    vmul.i16    q0, q0, d4[0]                      // b * x
    vdup.16     q1, d2[0]
    vdup.16     q2, d4[0]
    vdup.16     q3, d6[0]
    vshl.i16    q2, q2, #3                         // q2 = 8*b (step to the right half)
    vadd.i16    q1, q1, q0                         // row 0, left-half accumulators
    vadd.i16    q3, q3, q2                         // q3 = c - 8*b (step to the next row)
    mov         r3, #16
1:
    vqshrun.s16 d0, q1, #5                         // left 8 pixels
    vadd.i16    q1, q1, q2
    vqshrun.s16 d1, q1, #5                         // right 8 pixels
    vadd.i16    q1, q1, q3
    vst1.8      {q0}, [r0,:128], r1
    subs        r3, r3, #1
    bne         1b
    bx          lr
endfunc
|
||||||
108
common/arm/predict-c.c
Normal file
108
common/arm/predict-c.c
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* predict.c: arm intra prediction
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common/common.h"
|
||||||
|
#include "predict.h"
|
||||||
|
#include "pixel.h"
|
||||||
|
|
||||||
|
/* Install the ARM 4x4 intra predictors into pf[] according to the CPU flags.
 * Nothing is changed without ARMv6; the NEON entries are added only when
 * NEON is reported as well. Entries not assigned here are left untouched,
 * and high-bit-depth builds assign nothing. */
void x264_predict_4x4_init_arm( uint32_t cpu, x264_predict_t pf[12] )
{
    if( cpu & X264_CPU_ARMV6 )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_4x4_H]   = x264_predict_4x4_h_armv6;
        pf[I_PRED_4x4_V]   = x264_predict_4x4_v_armv6;
        pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_armv6;
        pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;

        if( cpu & X264_CPU_NEON )
        {
            pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon;
            pf[I_PRED_4x4_DDL]    = x264_predict_4x4_ddl_neon;
        }
#endif // !HIGH_BIT_DEPTH
    }
}
|
||||||
|
|
||||||
|
/* Install the NEON 8x8 chroma intra predictors into pf[] when the CPU
 * reports NEON. High-bit-depth builds assign nothing. */
void x264_predict_8x8c_init_arm( uint32_t cpu, x264_predict_t pf[7] )
{
    if( cpu & X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
        pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
        pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
        pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_neon;
        pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_neon;
        pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
|
||||||
|
|
||||||
|
/* Install the NEON 8x16 chroma intra predictors into pf[] when the CPU
 * reports NEON. Only three modes are overridden; high-bit-depth builds
 * assign nothing. */
void x264_predict_8x16c_init_arm( uint32_t cpu, x264_predict_t pf[7] )
{
    if( cpu & X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        /* The other functions weren't faster than C (gcc 4.7.3) on Cortex A8 and A9. */
        pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_neon;
        pf[I_PRED_CHROMA_H]      = x264_predict_8x16c_h_neon;
        pf[I_PRED_CHROMA_P]      = x264_predict_8x16c_p_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
|
||||||
|
|
||||||
|
/* Install the NEON 8x8 luma intra predictors into pf[] when the CPU reports
 * NEON. predict_filter is accepted for interface compatibility and is not
 * modified here. High-bit-depth builds assign nothing. */
void x264_predict_8x8_init_arm( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
{
    if( cpu & X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon;
        pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
        pf[I_PRED_8x8_VL]  = x264_predict_8x8_vl_neon;
        pf[I_PRED_8x8_VR]  = x264_predict_8x8_vr_neon;
        pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
        pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
        pf[I_PRED_8x8_HD]  = x264_predict_8x8_hd_neon;
        pf[I_PRED_8x8_HU]  = x264_predict_8x8_hu_neon;
        pf[I_PRED_8x8_V]   = x264_predict_8x8_v_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
|
||||||
|
|
||||||
|
/* Install the NEON 16x16 intra predictors into pf[] when the CPU reports
 * NEON. High-bit-depth builds assign nothing. */
void x264_predict_16x16_init_arm( uint32_t cpu, x264_predict_t pf[7] )
{
    if( cpu & X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_16x16_DC ]     = x264_predict_16x16_dc_neon;
        pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_neon;
        pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_neon;
        pf[I_PRED_16x16_H ]      = x264_predict_16x16_h_neon;
        pf[I_PRED_16x16_V ]      = x264_predict_16x16_v_neon;
        pf[I_PRED_16x16_P ]      = x264_predict_16x16_p_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
|
||||||
105
common/arm/predict.h
Normal file
105
common/arm/predict.h
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* predict.h: arm intra prediction
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_ARM_PREDICT_H
|
||||||
|
#define X264_ARM_PREDICT_H
|
||||||
|
|
||||||
|
#define x264_predict_4x4_dc_armv6 x264_template(predict_4x4_dc_armv6)
|
||||||
|
void x264_predict_4x4_dc_armv6( uint8_t *src );
|
||||||
|
#define x264_predict_4x4_dc_top_neon x264_template(predict_4x4_dc_top_neon)
|
||||||
|
void x264_predict_4x4_dc_top_neon( uint8_t *src );
|
||||||
|
#define x264_predict_4x4_v_armv6 x264_template(predict_4x4_v_armv6)
|
||||||
|
void x264_predict_4x4_v_armv6( uint8_t *src );
|
||||||
|
#define x264_predict_4x4_h_armv6 x264_template(predict_4x4_h_armv6)
|
||||||
|
void x264_predict_4x4_h_armv6( uint8_t *src );
|
||||||
|
#define x264_predict_4x4_ddr_armv6 x264_template(predict_4x4_ddr_armv6)
|
||||||
|
void x264_predict_4x4_ddr_armv6( uint8_t *src );
|
||||||
|
#define x264_predict_4x4_ddl_neon x264_template(predict_4x4_ddl_neon)
|
||||||
|
void x264_predict_4x4_ddl_neon( uint8_t *src );
|
||||||
|
|
||||||
|
#define x264_predict_8x8c_dc_neon x264_template(predict_8x8c_dc_neon)
|
||||||
|
void x264_predict_8x8c_dc_neon( uint8_t *src );
|
||||||
|
#define x264_predict_8x8c_dc_top_neon x264_template(predict_8x8c_dc_top_neon)
|
||||||
|
void x264_predict_8x8c_dc_top_neon( uint8_t *src );
|
||||||
|
#define x264_predict_8x8c_dc_left_neon x264_template(predict_8x8c_dc_left_neon)
|
||||||
|
void x264_predict_8x8c_dc_left_neon( uint8_t *src );
|
||||||
|
#define x264_predict_8x8c_h_neon x264_template(predict_8x8c_h_neon)
|
||||||
|
void x264_predict_8x8c_h_neon( uint8_t *src );
|
||||||
|
#define x264_predict_8x8c_v_neon x264_template(predict_8x8c_v_neon)
|
||||||
|
void x264_predict_8x8c_v_neon( uint8_t *src );
|
||||||
|
#define x264_predict_8x8c_p_neon x264_template(predict_8x8c_p_neon)
|
||||||
|
void x264_predict_8x8c_p_neon( uint8_t *src );
|
||||||
|
|
||||||
|
#define x264_predict_8x16c_h_neon x264_template(predict_8x16c_h_neon)
|
||||||
|
void x264_predict_8x16c_h_neon( uint8_t *src );
|
||||||
|
#define x264_predict_8x16c_dc_top_neon x264_template(predict_8x16c_dc_top_neon)
|
||||||
|
void x264_predict_8x16c_dc_top_neon( uint8_t *src );
|
||||||
|
#define x264_predict_8x16c_p_neon x264_template(predict_8x16c_p_neon)
|
||||||
|
void x264_predict_8x16c_p_neon( uint8_t *src );
|
||||||
|
|
||||||
|
#define x264_predict_8x8_dc_neon x264_template(predict_8x8_dc_neon)
|
||||||
|
void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
|
||||||
|
#define x264_predict_8x8_ddl_neon x264_template(predict_8x8_ddl_neon)
|
||||||
|
void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
|
||||||
|
#define x264_predict_8x8_ddr_neon x264_template(predict_8x8_ddr_neon)
|
||||||
|
void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
|
||||||
|
#define x264_predict_8x8_vl_neon x264_template(predict_8x8_vl_neon)
|
||||||
|
void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
|
||||||
|
#define x264_predict_8x8_vr_neon x264_template(predict_8x8_vr_neon)
|
||||||
|
void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
|
||||||
|
#define x264_predict_8x8_v_neon x264_template(predict_8x8_v_neon)
|
||||||
|
void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
|
||||||
|
#define x264_predict_8x8_h_neon x264_template(predict_8x8_h_neon)
|
||||||
|
void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
|
||||||
|
#define x264_predict_8x8_hd_neon x264_template(predict_8x8_hd_neon)
|
||||||
|
void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
|
||||||
|
#define x264_predict_8x8_hu_neon x264_template(predict_8x8_hu_neon)
|
||||||
|
void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
|
||||||
|
|
||||||
|
#define x264_predict_16x16_dc_neon x264_template(predict_16x16_dc_neon)
|
||||||
|
void x264_predict_16x16_dc_neon( uint8_t *src );
|
||||||
|
#define x264_predict_16x16_dc_top_neon x264_template(predict_16x16_dc_top_neon)
|
||||||
|
void x264_predict_16x16_dc_top_neon( uint8_t *src );
|
||||||
|
#define x264_predict_16x16_dc_left_neon x264_template(predict_16x16_dc_left_neon)
|
||||||
|
void x264_predict_16x16_dc_left_neon( uint8_t *src );
|
||||||
|
#define x264_predict_16x16_h_neon x264_template(predict_16x16_h_neon)
|
||||||
|
void x264_predict_16x16_h_neon( uint8_t *src );
|
||||||
|
#define x264_predict_16x16_v_neon x264_template(predict_16x16_v_neon)
|
||||||
|
void x264_predict_16x16_v_neon( uint8_t *src );
|
||||||
|
#define x264_predict_16x16_p_neon x264_template(predict_16x16_p_neon)
|
||||||
|
void x264_predict_16x16_p_neon( uint8_t *src );
|
||||||
|
|
||||||
|
#define x264_predict_4x4_init_arm x264_template(predict_4x4_init_arm)
|
||||||
|
void x264_predict_4x4_init_arm( uint32_t cpu, x264_predict_t pf[12] );
|
||||||
|
#define x264_predict_8x8_init_arm x264_template(predict_8x8_init_arm)
|
||||||
|
void x264_predict_8x8_init_arm( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
|
||||||
|
#define x264_predict_8x8c_init_arm x264_template(predict_8x8c_init_arm)
|
||||||
|
void x264_predict_8x8c_init_arm( uint32_t cpu, x264_predict_t pf[7] );
|
||||||
|
#define x264_predict_8x16c_init_arm x264_template(predict_8x16c_init_arm)
|
||||||
|
void x264_predict_8x16c_init_arm( uint32_t cpu, x264_predict_t pf[7] );
|
||||||
|
#define x264_predict_16x16_init_arm x264_template(predict_16x16_init_arm)
|
||||||
|
void x264_predict_16x16_init_arm( uint32_t cpu, x264_predict_t pf[7] );
|
||||||
|
|
||||||
|
#endif
|
||||||
574
common/arm/quant-a.S
Normal file
574
common/arm/quant-a.S
Normal file
@@ -0,0 +1,574 @@
|
|||||||
|
/****************************************************************************
|
||||||
|
* quant-a.S: arm quantization and level-run
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2009-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
* Janne Grunau <janne-x264@jannau.net>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "asm.S"
|
||||||
|
|
||||||
|
const pmovmskb_byte, align=4
|
||||||
|
.byte 1,2,4,8,16,32,64,128
|
||||||
|
.byte 1,2,4,8,16,32,64,128
|
||||||
|
endconst
|
||||||
|
|
||||||
|
const mask_2bit, align=4
|
||||||
|
.byte 3,12,48,192,3,12,48,192
|
||||||
|
.byte 3,12,48,192,3,12,48,192
|
||||||
|
endconst
|
||||||
|
|
||||||
|
const mask_1bit, align=4
|
||||||
|
.byte 128,64,32,16,8,4,2,1
|
||||||
|
.byte 128,64,32,16,8,4,2,1
|
||||||
|
endconst
|
||||||
|
|
||||||
|
.text
|
||||||
|
|
||||||
|
// QUANT_TWO: quantize 16 coefficients held in q8/q9 and store them at [r0]!.
// Expected on entry (see the callers in this file): q8/q9 = absolute values
// of the coefficients, q14/q15 = the original signed coefficients.
// Per lane: result = sign(coef) * (((abs(coef) + bias) * mf) >> 16); the bias
// add is a 16-bit add and the multiply is widened to 32 bits.
// The mask argument receives q8 | q9 of the results, so it is non-zero when
// any of the 16 results is non-zero.
// With load_mf=yes the mf0-mf3 d-registers are loaded from [r1]! after the
// bias has been added. Overwrites q8-q15; r0 is advanced by 32 bytes.
.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
|
||||||
|
vadd.u16 q8, q8, \bias0
|
||||||
|
vadd.u16 q9, q9, \bias1
|
||||||
|
// Optional reload of the multipliers; done after the bias adds so the mf
// registers may overlap the bias registers.
.ifc \load_mf, yes
|
||||||
|
vld1.64 {\mf0-\mf3}, [r1,:128]!
|
||||||
|
.endif
|
||||||
|
vmull.u16 q10, d16, \mf0
|
||||||
|
vmull.u16 q11, d17, \mf1
|
||||||
|
vmull.u16 q12, d18, \mf2
|
||||||
|
vmull.u16 q13, d19, \mf3
|
||||||
|
vshr.s16 q14, q14, #15 // sign mask: all ones for negative inputs, else 0
|
||||||
|
vshr.s16 q15, q15, #15
|
||||||
|
vshrn.u32 d16, q10, #16 // keep the upper 16 bits of each 32-bit product
|
||||||
|
vshrn.u32 d17, q11, #16
|
||||||
|
vshrn.u32 d18, q12, #16
|
||||||
|
vshrn.u32 d19, q13, #16
|
||||||
|
veor q8, q8, q14 // (x ^ s) - s negates the lanes where s is all ones
|
||||||
|
veor q9, q9, q15
|
||||||
|
vsub.s16 q8, q8, q14
|
||||||
|
vsub.s16 q9, q9, q15
|
||||||
|
vorr \mask, q8, q9
|
||||||
|
vst1.64 {d16-d19}, [r0,:128]!
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// QUANT_END: function epilogue shared by the quant routines.
// Returns r0 = 1 if any bit of the 64-bit register d is set, otherwise r0 = 0
// (the result of the orrs itself). Overwrites r2 and r3, then returns via lr.
.macro QUANT_END d
|
||||||
|
vmov r2, r3, \d
|
||||||
|
orrs r0, r2, r3
|
||||||
|
movne r0, #1
|
||||||
|
bx lr
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// quant_2x2_dc( int16_t dct[4], int mf, int bias )
|
||||||
|
// In: r0 = four 16-bit coefficients (64-bit aligned), r1 = mf, r2 = bias.
// Each coefficient is replaced in place by
//   sign(c) * (((abs(c) + bias) * mf) >> 16)
// using the low 16 bits of mf and bias.
// Out: r0 = 1 if any quantized coefficient is non-zero, else 0.
function quant_2x2_dc_neon
|
||||||
|
vld1.64 {d0}, [r0,:64]
|
||||||
|
vabs.s16 d3, d0
|
||||||
|
vdup.16 d2, r2 // bias in every lane
|
||||||
|
vdup.16 d1, r1 // mf in every lane
|
||||||
|
vadd.u16 d3, d3, d2
|
||||||
|
vmull.u16 q3, d3, d1
|
||||||
|
vshr.s16 d0, d0, #15 // sign mask of the original coefficients
|
||||||
|
vshrn.u32 d3, q3, #16
|
||||||
|
veor d3, d3, d0 // restore the sign: (x ^ s) - s
|
||||||
|
vsub.s16 d3, d3, d0
|
||||||
|
vst1.64 {d3}, [r0,:64]
|
||||||
|
QUANT_END d3
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// quant_4x4_dc( int16_t dct[16], int mf, int bias )
|
||||||
|
// In: r0 = sixteen 16-bit coefficients (128-bit aligned), r1 = mf, r2 = bias.
// Quantizes all 16 coefficients in place with one scalar mf and bias, which
// are broadcast to every lane before QUANT_TWO is applied.
// Out: r0 = 1 if any quantized coefficient is non-zero, else 0.
function quant_4x4_dc_neon
|
||||||
|
vld1.64 {d28-d31}, [r0,:128]
|
||||||
|
vabs.s16 q8, q14
|
||||||
|
vabs.s16 q9, q15
|
||||||
|
vdup.16 q0, r2
|
||||||
|
vdup.16 q2, r1
|
||||||
|
// q0 is both the bias and the mask output: the bias is consumed before the
// mask is written.
QUANT_TWO q0, q0, d4, d5, d4, d5, q0
|
||||||
|
vorr d0, d0, d1
|
||||||
|
QUANT_END d0
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
|
||||||
|
// In: r0 = sixteen 16-bit coefficients, r1 = sixteen 16-bit mf values,
//     r2 = sixteen 16-bit bias values (all accessed as 128-bit aligned).
// Quantizes the coefficients in place with per-coefficient mf and bias.
// Out: r0 = 1 if any quantized coefficient is non-zero, else 0.
function quant_4x4_neon
|
||||||
|
vld1.64 {d28-d31}, [r0,:128]
|
||||||
|
vabs.s16 q8, q14
|
||||||
|
vabs.s16 q9, q15
|
||||||
|
vld1.64 {d0-d3}, [r2,:128]
|
||||||
|
vld1.64 {d4-d7}, [r1,:128]
|
||||||
|
QUANT_TWO q0, q1, d4, d5, d6, d7, q0
|
||||||
|
vorr d0, d0, d1
|
||||||
|
QUANT_END d0
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
|
||||||
|
// In: r0 = four consecutive blocks of sixteen 16-bit coefficients,
//     r1 = sixteen mf values, r2 = sixteen bias values (shared by all blocks).
// Quantizes the four blocks in place.
// Out: r0 = bit mask, bit n (n = 0..3) set when block n has a non-zero
//      quantized coefficient.
// q4-q7 (d8-d15) hold the per-block masks, so they are saved and restored.
function quant_4x4x4_neon
|
||||||
|
vpush {d8-d15}
|
||||||
|
vld1.64 {d28-d31}, [r0,:128]
|
||||||
|
vabs.s16 q8, q14
|
||||||
|
vabs.s16 q9, q15
|
||||||
|
vld1.64 {d0-d3}, [r2,:128]
|
||||||
|
vld1.64 {d4-d7}, [r1,:128]
|
||||||
|
QUANT_TWO q0, q1, d4, d5, d6, d7, q4
|
||||||
|
// No writeback on the loads below: the store inside QUANT_TWO has already
// advanced r0 to the next block.
vld1.64 {d28-d31}, [r0,:128]
|
||||||
|
vabs.s16 q8, q14
|
||||||
|
vabs.s16 q9, q15
|
||||||
|
QUANT_TWO q0, q1, d4, d5, d6, d7, q5
|
||||||
|
vld1.64 {d28-d31}, [r0,:128]
|
||||||
|
vabs.s16 q8, q14
|
||||||
|
vabs.s16 q9, q15
|
||||||
|
QUANT_TWO q0, q1, d4, d5, d6, d7, q6
|
||||||
|
vld1.64 {d28-d31}, [r0,:128]
|
||||||
|
vabs.s16 q8, q14
|
||||||
|
vabs.s16 q9, q15
|
||||||
|
QUANT_TWO q0, q1, d4, d5, d6, d7, q7
|
||||||
|
// Fold each 128-bit block mask to 64 bits.
vorr d8, d8, d9
|
||||||
|
vorr d10, d10, d11
|
||||||
|
vorr d12, d12, d13
|
||||||
|
vorr d14, d14, d15
|
||||||
|
vmov r0, r1, d8
|
||||||
|
vmov r2, r3, d10
|
||||||
|
orrs r0, r1
|
||||||
|
movne r0, #1 // block 0; r0 is already 0 otherwise
|
||||||
|
orrs r2, r3
|
||||||
|
orrne r0, #2 // block 1
|
||||||
|
vmov r1, r2, d12
|
||||||
|
vmov r3, ip, d14
|
||||||
|
orrs r1, r2
|
||||||
|
orrne r0, #4 // block 2
|
||||||
|
orrs r3, ip
|
||||||
|
orrne r0, #8 // block 3
|
||||||
|
vpop {d8-d15}
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
|
||||||
|
// In: r0 = 64 16-bit coefficients, r1 = 64 mf values, r2 = 64 bias values.
// Quantizes the coefficients in place, 16 at a time (one explicit pass plus
// three repeated passes).
// Out: r0 = 1 if any quantized coefficient is non-zero, else 0.
function quant_8x8_neon
|
||||||
|
vld1.64 {d28-d31}, [r0,:128]
|
||||||
|
vabs.s16 q8, q14
|
||||||
|
vabs.s16 q9, q15
|
||||||
|
vld1.64 {d0-d3}, [r2,:128]!
|
||||||
|
vld1.64 {d4-d7}, [r1,:128]!
|
||||||
|
QUANT_TWO q0, q1, d4, d5, d6, d7, q0
|
||||||
|
.rept 3
|
||||||
|
vld1.64 {d28-d31}, [r0,:128]
|
||||||
|
vabs.s16 q8, q14
|
||||||
|
vabs.s16 q9, q15
|
||||||
|
// q0 carries the accumulated mask, so the bias goes to q1/q2 (d2-d5).
vld1.64 {d2-d5}, [r2,:128]!
|
||||||
|
// load_mf=yes: the mf values are loaded into d4-d7 inside the macro, after
// the bias held in q2 (d4-d5) has been added.
QUANT_TWO q1, q2, d4, d5, d6, d7, q1, yes
|
||||||
|
vorr q0, q0, q1
|
||||||
|
.endr
|
||||||
|
vorr d0, d0, d1
|
||||||
|
QUANT_END d0
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// DEQUANT_START: common prologue of the dequant functions.
// In:  r1 = dequant_mf table, r2 = i_qp.
// Out: r3 = i_qp / 6 - offset, with the condition flags set by that final
//      subtraction (the callers branch on them with blt);
//      r2 = i_qp % 6;
//      dc=no:  r1 = r1 + ((i_qp % 6) << mf_size), the address of the table row;
//      dc=yes: r1 = 32-bit word loaded from that address.
// The division by 6 is done as (i_qp * 0x2b) >> 8.
// NOTE(review): that multiply-and-shift is presumably exact only for a limited
// range of i_qp - confirm against the callers.
// Overwrites ip.
.macro DEQUANT_START mf_size offset dc=no
|
||||||
|
mov r3, #0x2b
|
||||||
|
mul r3, r3, r2
|
||||||
|
lsr r3, r3, #8 // i_qbits = i_qp / 6
|
||||||
|
add ip, r3, r3, lsl #1
|
||||||
|
sub r2, r2, ip, lsl #1 // i_mf = i_qp % 6
|
||||||
|
.ifc \dc,no
|
||||||
|
add r1, r1, r2, lsl #\mf_size // dequant_mf[i_mf]
|
||||||
|
.else
|
||||||
|
ldr r1, [r1, r2, lsl #\mf_size] // dequant_mf[i_mf][0][0]
|
||||||
|
.endif
|
||||||
|
subs r3, r3, #\offset // 6 for 8x8
|
||||||
|
.endm
|
||||||
|
|
||||||
|
// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
|
||||||
|
// DEQUANT size, bits: emits the function dequant_<size>_neon.
// In: r0 = 16-bit coefficients (rewritten in place), r1 = dequant_mf table of
//     32-bit entries, r2 = i_qp.
// After DEQUANT_START, r3 = i_qp / 6 - bits and r1 points at the table row.
//   r3 >= 0: dct[i] = (dct[i] * mf[i]) << r3          (16-bit arithmetic)
//   r3 <  0: dct[i] = (dct[i] * mf[i] + (1 << (-r3 - 1))) >> -r3
//            (32-bit arithmetic, then narrowed back to 16 bits)
// Each pass handles 16 coefficients; the 8x8 variant runs four passes with r2
// as the pass counter, the 4x4 variant runs a single pass.
.macro DEQUANT size bits
|
||||||
|
function dequant_\size\()_neon
|
||||||
|
DEQUANT_START \bits+2, \bits
|
||||||
|
.ifc \size, 8x8
|
||||||
|
mov r2, #4
|
||||||
|
.endif
|
||||||
|
// Branches on the flags left by the subs at the end of DEQUANT_START; the
// mov above does not update them.
blt dequant_\size\()_rshift
|
||||||
|
|
||||||
|
vdup.16 q15, r3 // left-shift amount in every lane
|
||||||
|
dequant_\size\()_lshift_loop:
|
||||||
|
.ifc \size, 8x8
|
||||||
|
subs r2, r2, #1
|
||||||
|
.endif
|
||||||
|
// Load sixteen 32-bit mf entries and narrow them to 16 bits in q2/q3.
vld1.32 {d16-d17}, [r1,:128]!
|
||||||
|
vld1.32 {d18-d19}, [r1,:128]!
|
||||||
|
vmovn.s32 d4, q8
|
||||||
|
vld1.32 {d20-d21}, [r1,:128]!
|
||||||
|
vmovn.s32 d5, q9
|
||||||
|
vld1.32 {d22-d23}, [r1,:128]!
|
||||||
|
vmovn.s32 d6, q10
|
||||||
|
vld1.16 {d0-d3}, [r0,:128]
|
||||||
|
vmovn.s32 d7, q11
|
||||||
|
vmul.s16 q0, q0, q2
|
||||||
|
vmul.s16 q1, q1, q3
|
||||||
|
vshl.s16 q0, q0, q15
|
||||||
|
vshl.s16 q1, q1, q15
|
||||||
|
vst1.16 {d0-d3}, [r0,:128]!
|
||||||
|
.ifc \size, 8x8
|
||||||
|
// Uses the flags of the subs at the top of the loop.
bgt dequant_\size\()_lshift_loop
|
||||||
|
.endif
|
||||||
|
bx lr
|
||||||
|
|
||||||
|
dequant_\size\()_rshift:
|
||||||
|
vdup.32 q15, r3 // negative count: vshl then shifts right
|
||||||
|
rsb r3, r3, #0
|
||||||
|
mov ip, #1
|
||||||
|
sub r3, r3, #1
|
||||||
|
lsl ip, ip, r3 // ip = rounding term 1 << (shift - 1)
|
||||||
|
|
||||||
|
.ifc \size, 8x8
|
||||||
|
dequant_\size\()_rshift_loop:
|
||||||
|
subs r2, r2, #1
|
||||||
|
.endif
|
||||||
|
// q10-q13 are preloaded with the rounding term and used as accumulators.
vdup.32 q10, ip
|
||||||
|
vld1.32 {d16-d17}, [r1,:128]!
|
||||||
|
vdup.32 q11, ip
|
||||||
|
vld1.32 {d18-d19}, [r1,:128]!
|
||||||
|
vmovn.s32 d4, q8
|
||||||
|
vld1.32 {d16-d17}, [r1,:128]!
|
||||||
|
vmovn.s32 d5, q9
|
||||||
|
vld1.32 {d18-d19}, [r1,:128]!
|
||||||
|
vmovn.s32 d6, q8
|
||||||
|
vld1.16 {d0-d3}, [r0,:128]
|
||||||
|
vmovn.s32 d7, q9
|
||||||
|
vdup.32 q12, ip
|
||||||
|
vdup.32 q13, ip
|
||||||
|
|
||||||
|
vmlal.s16 q10, d0, d4 // rounding term + dct * mf, widened to 32 bits
|
||||||
|
vmlal.s16 q11, d1, d5
|
||||||
|
vmlal.s16 q12, d2, d6
|
||||||
|
vmlal.s16 q13, d3, d7
|
||||||
|
vshl.s32 q10, q10, q15
|
||||||
|
vshl.s32 q11, q11, q15
|
||||||
|
vshl.s32 q12, q12, q15
|
||||||
|
vshl.s32 q13, q13, q15
|
||||||
|
|
||||||
|
vmovn.s32 d0, q10
|
||||||
|
vmovn.s32 d1, q11
|
||||||
|
vmovn.s32 d2, q12
|
||||||
|
vmovn.s32 d3, q13
|
||||||
|
vst1.16 {d0-d3}, [r0,:128]!
|
||||||
|
.ifc \size, 8x8
|
||||||
|
bgt dequant_\size\()_rshift_loop
|
||||||
|
.endif
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
.endm
|
||||||
|
|
||||||
|
DEQUANT 4x4, 4
|
||||||
|
DEQUANT 8x8, 6
|
||||||
|
|
||||||
|
// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
|
||||||
|
// In: r0 = sixteen 16-bit DC coefficients (128-bit aligned, rewritten in
//     place), r1 = dequant_mf table, r2 = i_qp.
// DEQUANT_START loads the single scale dmf (first entry of the table row for
// i_qp % 6) into r1 and leaves r3 = i_qp / 6 - 6 with the flags set.
//   r3 >= 0: dct[i] = dct[i] * (dmf << r3)            (16-bit arithmetic)
//   r3 <  0: dct[i] = (dct[i] * dmf + (1 << (-r3 - 1))) >> -r3
//            (32-bit arithmetic, then narrowed back to 16 bits)
// Only the low 16 bits of the scale are used by the multiplies.
function dequant_4x4_dc_neon
|
||||||
|
DEQUANT_START 6, 6, yes
|
||||||
|
blt dequant_4x4_dc_rshift
|
||||||
|
|
||||||
|
lsl r1, r1, r3 // fold the left shift into the scale
|
||||||
|
vdup.16 q2, r1
|
||||||
|
vld1.16 {d0-d3}, [r0,:128]
|
||||||
|
|
||||||
|
vmul.s16 q0, q0, q2
|
||||||
|
vmul.s16 q1, q1, q2
|
||||||
|
vst1.16 {d0-d3}, [r0,:128]
|
||||||
|
bx lr
|
||||||
|
|
||||||
|
dequant_4x4_dc_rshift:
|
||||||
|
vdup.16 d4, r1
|
||||||
|
vdup.32 q15, r3 // negative count: vshl then shifts right
|
||||||
|
rsb r3, r3, #0
|
||||||
|
mov ip, #1
|
||||||
|
sub r3, r3, #1
|
||||||
|
lsl ip, ip, r3 // ip = rounding term 1 << (shift - 1)
|
||||||
|
|
||||||
|
// q10-q13 are preloaded with the rounding term and used as accumulators.
vdup.32 q10, ip
|
||||||
|
vdup.32 q11, ip
|
||||||
|
vld1.16 {d0-d3}, [r0,:128]
|
||||||
|
vdup.32 q12, ip
|
||||||
|
vdup.32 q13, ip
|
||||||
|
|
||||||
|
vmlal.s16 q10, d0, d4
|
||||||
|
vmlal.s16 q11, d1, d4
|
||||||
|
vmlal.s16 q12, d2, d4
|
||||||
|
vmlal.s16 q13, d3, d4
|
||||||
|
vshl.s32 q10, q10, q15
|
||||||
|
vshl.s32 q11, q11, q15
|
||||||
|
vshl.s32 q12, q12, q15
|
||||||
|
vshl.s32 q13, q13, q15
|
||||||
|
|
||||||
|
vmovn.s32 d0, q10
|
||||||
|
vmovn.s32 d1, q11
|
||||||
|
vmovn.s32 d2, q12
|
||||||
|
vmovn.s32 d3, q13
|
||||||
|
vst1.16 {d0-d3}, [r0,:128]
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// decimate_score_1x size: emits decimate_score<size>_neon for size 15 or 16.
// In:  r0 = sixteen 16-bit coefficients (128-bit aligned).
// Out: r0 = 9 if any coefficient has a magnitude above 1; otherwise the sum of
//      decimate_table4[run] taken over the non-zero coefficients, where run is
//      the number of zero coefficients directly before each of them in load
//      order (0 if every coefficient is zero).
// For size 15 the first loaded coefficient is left out of the scan.
// Overwrites r1-r3, r12, q0-q3 and q8.
.macro decimate_score_1x size
|
||||||
|
function decimate_score\size\()_neon
|
||||||
|
vld1.16 {q0, q1}, [r0, :128]
|
||||||
|
movrel r3, mask_2bit
|
||||||
|
vmov.s8 q3, #0x01
|
||||||
|
vqmovn.s16 d0, q0 // saturate to 8 bits: q0 = 16 signed bytes
|
||||||
|
vqmovn.s16 d1, q1
|
||||||
|
vqabs.s8 q2, q0
|
||||||
|
vld1.8 {q8}, [r3, :128]
|
||||||
|
vceq.s8 q1, q0, #0 // all ones where the coefficient is zero
|
||||||
|
vcgt.s8 q2, q2, q3 // all ones where the magnitude is above 1
|
||||||
|
vand.u8 q1, q1, q8 // keep two distinct bits per coefficient
|
||||||
|
vshrn.u16 d4, q2, #4
|
||||||
|
vpadd.u8 d2, d2, d3
|
||||||
|
vpadd.u8 d4, d4, d4
|
||||||
|
vpadd.u8 d2, d2, d2
|
||||||
|
vmov.32 r2, d4[0] // folded magnitude-above-1 flags
|
||||||
|
vmov.32 r1, d2[0] // zero mask, 2 bits per coefficient
|
||||||
|
cmp r2, #0
|
||||||
|
beq 0f
|
||||||
|
mov r0, #9
|
||||||
|
bx lr
|
||||||
|
0:
|
||||||
|
mvns r1, r1 // now 2 bits set per non-zero coefficient
|
||||||
|
mov r0, #0
|
||||||
|
bxeq lr
|
||||||
|
.ifc \size, 15
|
||||||
|
lsr r1, r1, #2 // drop the bit pair of the first coefficient
|
||||||
|
.endif
|
||||||
|
rbit r1, r1 // first coefficient now in the top bits, for clz
|
||||||
|
// NOTE(review): movrelx comes from asm.S; it presumably loads the address
// of the table into r3 with r2 as scratch - confirm.
movrelx r3, X264(decimate_table4), r2
|
||||||
|
1:
|
||||||
|
clz r2, r1 // two leading zero bits per zero coefficient
|
||||||
|
lsl r1, r1, r2
|
||||||
|
lsr r12, r2, #1 // run length
|
||||||
|
ldrb r2, [r3, r12]
|
||||||
|
lsls r1, r1, #2 // consume this coefficient; Z set when none left
|
||||||
|
add r0, r0, r2
|
||||||
|
bne 1b
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
.endm
|
||||||
|
|
||||||
|
decimate_score_1x 15
|
||||||
|
decimate_score_1x 16
|
||||||
|
|
||||||
|
// In:  r0 = 64 16-bit coefficients (128-bit aligned).
// Out: r0 = 9 if any coefficient has a magnitude above 1; otherwise the sum of
//      decimate_table8[run] over the non-zero coefficients, where run is the
//      number of zero mask bits before each of them in scan order (0 if every
//      coefficient is zero).
// A 64-bit mask with one bit per coefficient is built with NEON, then scanned
// with clz as two 32-bit words (r1 first, then r12).
// NOTE(review): the swapped destination halves of the vqmovn instructions and
// the MSB-first mask_1bit table presumably order the mask so that clz visits
// the coefficients in the intended direction - confirm against the C version.
// lr is used as a scratch counter, hence the push/pop.
function decimate_score64_neon
|
||||||
|
push {lr}
|
||||||
|
vld1.16 {q8, q9}, [r0, :128]!
|
||||||
|
vld1.16 {q10, q11}, [r0, :128]!
|
||||||
|
vld1.16 {q12, q13}, [r0, :128]!
|
||||||
|
vld1.16 {q14, q15}, [r0, :128]
|
||||||
|
movrel r3, mask_1bit
|
||||||
|
vmov.s8 q3, #0x01
|
||||||
|
// Saturate all 64 coefficients to signed bytes, packed into q8-q11.
vqmovn.s16 d17, q8
|
||||||
|
vqmovn.s16 d16, q9
|
||||||
|
vqmovn.s16 d19, q10
|
||||||
|
vqmovn.s16 d18, q11
|
||||||
|
vqmovn.s16 d21, q12
|
||||||
|
vqmovn.s16 d20, q13
|
||||||
|
vqmovn.s16 d23, q14
|
||||||
|
vqmovn.s16 d22, q15
|
||||||
|
vqabs.s8 q12, q8
|
||||||
|
vqabs.s8 q13, q9
|
||||||
|
vqabs.s8 q14, q10
|
||||||
|
vqabs.s8 q15, q11
|
||||||
|
vld1.8 {q2}, [r3, :128]
|
||||||
|
vceq.s8 q8, q8, #0 // all ones where the coefficient is zero
|
||||||
|
vceq.s8 q9, q9, #0
|
||||||
|
vceq.s8 q10, q10, #0
|
||||||
|
vceq.s8 q11, q11, #0
|
||||||
|
vmax.s8 q12, q12, q13 // reduce the magnitudes to a lane-wise maximum
|
||||||
|
vmax.s8 q14, q14, q15
|
||||||
|
vand.u8 q8, q8, q2 // one distinct bit per coefficient
|
||||||
|
vand.u8 q9, q9, q2
|
||||||
|
vand.u8 q10, q10, q2
|
||||||
|
vand.u8 q11, q11, q2
|
||||||
|
vmax.s8 q12, q12, q14
|
||||||
|
vpadd.u8 d18, d18, d19
|
||||||
|
vpadd.u8 d19, d16, d17
|
||||||
|
vcgt.s8 q12, q12, q3 // all ones where some magnitude is above 1
|
||||||
|
vpadd.u8 d22, d22, d23
|
||||||
|
vpadd.u8 d23, d20, d21
|
||||||
|
vshrn.u16 d24, q12, #4
|
||||||
|
vpadd.u8 d16, d22, d23
|
||||||
|
vpadd.u8 d17, d18, d19
|
||||||
|
vpadd.u8 d24, d24, d24
|
||||||
|
vpadd.u8 d16, d16, d17
|
||||||
|
vmov.32 r2, d24[0] // folded magnitude-above-1 flags
|
||||||
|
vmov r12, r1, d16 // 64-bit zero mask: r12 = low word, r1 = high word
|
||||||
|
cmp r2, #0
|
||||||
|
beq 0f
|
||||||
|
mov r0, #9
|
||||||
|
pop {pc}
|
||||||
|
0:
|
||||||
|
mvns r1, r1 // bits now mark non-zero coefficients; Z if none here
|
||||||
|
mvn r12, r12
|
||||||
|
mov r0, #0
|
||||||
|
mov lr, #32 // mask bits of r1 not yet consumed
|
||||||
|
// NOTE(review): movrelx comes from asm.S; the beq below relies on it leaving
// the flags of the mvns above intact - confirm.
movrelx r3, X264(decimate_table8), r2
|
||||||
|
beq 2f
|
||||||
|
// First word: each pass consumes one run of zeros plus one non-zero bit.
1:
|
||||||
|
clz r2, r1
|
||||||
|
lsl r1, r1, r2
|
||||||
|
sub lr, lr, r2
|
||||||
|
ldrb r2, [r3, r2]
|
||||||
|
lsls r1, r1, #1
|
||||||
|
sub lr, lr, #1
|
||||||
|
add r0, r0, r2
|
||||||
|
bne 1b
|
||||||
|
// lr now holds the zero bits left over at the end of the first word.
2:
|
||||||
|
cmp r12, #0
|
||||||
|
popeq {pc}
|
||||||
|
|
||||||
|
clz r2, r12
|
||||||
|
lsl r1, r12, r2
|
||||||
|
add r2, r2, lr // run continues across the word boundary
|
||||||
|
ldrb r2, [r3, r2]
|
||||||
|
lsls r1, r1, #1
|
||||||
|
add r0, r0, r2
|
||||||
|
popeq {pc}
|
||||||
|
// Rest of the second word: same scan, no carry-over needed.
3:
|
||||||
|
clz r2, r1
|
||||||
|
lsl r1, r1, r2
|
||||||
|
ldrb r2, [r3, r2]
|
||||||
|
lsls r1, r1, #1
|
||||||
|
add r0, r0, r2
|
||||||
|
bne 3b
|
||||||
|
pop {pc}
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// int coeff_last( int16_t *l )
|
||||||
|
// In:  r0 = four 16-bit coefficients.
// Out: r0 = index (0..3) of the last non-zero coefficient, 0 if all are zero.
// Works on the two 32-bit words that hold the coefficient pairs.
// Assumes little-endian halfword order inside each word - TODO confirm.
function coeff_last4_arm
|
||||||
|
ldrd r2, r3, [r0] // r2 = coefficients 0-1, r3 = coefficients 2-3
|
||||||
|
subs r0, r3, #0 // r0 = r3, so r0 is already 0 when the upper pair is zero
|
||||||
|
movne r0, #2
|
||||||
|
movne r2, r3 // examine the upper pair if it is non-zero
|
||||||
|
lsrs r2, r2, #16 // non-zero when the second coefficient of the pair is set
|
||||||
|
addne r0, r0, #1
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// In:  r0 = eight 16-bit coefficients.
// Out: r0 = index (0..7) of the last non-zero coefficient, 0 if all are zero.
// Picks the upper or lower group of four first, then narrows down to a pair
// and a single coefficient.
// Assumes little-endian halfword order inside each word - TODO confirm.
function coeff_last8_arm
|
||||||
|
ldrd r2, r3, [r0, #8] // coefficients 4-7
|
||||||
|
orrs ip, r2, r3
|
||||||
|
movne r0, #4
|
||||||
|
ldrdeq r2, r3, [r0] // upper group is zero: r0 is still the pointer here
|
||||||
|
moveq r0, #0
|
||||||
|
tst r3, r3
|
||||||
|
addne r0, #2
|
||||||
|
movne r2, r3 // examine the upper pair of the group if non-zero
|
||||||
|
lsrs r2, r2, #16 // non-zero when the second coefficient of the pair is set
|
||||||
|
addne r0, r0, #1
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// COEFF_LAST_1x size: emits coeff_last<size>_neon for size 15 or 16.
// In:  r0 = pointer to the 16-bit coefficients.
// Out: r0 = index of the last non-zero coefficient, 0 if there is none.
// Sixteen values are always loaded. For size 15 the pointer is first moved
// back by one coefficient and the index constants are lowered by one to
// compensate; a non-zero value in that extra leading slot gives -1, which is
// clamped to 0.
// Overwrites r1, r3, ip, q0 and q1.
.macro COEFF_LAST_1x size
|
||||||
|
function coeff_last\size\()_neon
|
||||||
|
.if \size == 15
|
||||||
|
sub r0, r0, #2
|
||||||
|
.endif
|
||||||
|
vld1.64 {d0-d3}, [r0,:128]
|
||||||
|
vtst.16 q0, q0 // all ones in each non-zero lane
|
||||||
|
vtst.16 q1, q1
|
||||||
|
vshrn.u16 d0, q0, #8 // 8 bits per coefficient
|
||||||
|
vshrn.u16 d1, q1, #8
|
||||||
|
vshrn.u16 d0, q0, #4 // 4 bits per coefficient, 64 bits in total
|
||||||
|
vclz.i32 d0, d0 // leading zeros of each 32-bit half
|
||||||
|
mov ip, #7
|
||||||
|
mov r3, #\size - 9
|
||||||
|
vmov r0, r1, d0 // r0 = clz of the first 8 slots, r1 = of the last 8
|
||||||
|
|
||||||
|
subs r1, ip, r1, lsr #2 // position within the last 8 slots, -1 if none
|
||||||
|
addge r0, r1, #\size - 8
|
||||||
|
subslt r0, r3, r0, lsr #2 // otherwise position within the first 8 slots
|
||||||
|
movlt r0, #0
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
.endm
|
||||||
|
|
||||||
|
COEFF_LAST_1x 15
|
||||||
|
COEFF_LAST_1x 16
|
||||||
|
|
||||||
|
// In:  r0 = 64 16-bit coefficients (128-bit aligned).
// Out: r0 = index (0..63) of the last non-zero coefficient, 0 if there is none.
// Builds a 64-bit mask with one bit per coefficient (bit n for coefficient n)
// and locates the highest set bit with clz on each 32-bit half.
function coeff_last64_neon
|
||||||
|
// Unsigned saturating narrow: a non-zero 16-bit value stays non-zero.
vld1.64 {d16-d19}, [r0,:128]!
|
||||||
|
vqmovn.u16 d16, q8
|
||||||
|
vqmovn.u16 d17, q9
|
||||||
|
vld1.64 {d20-d23}, [r0,:128]!
|
||||||
|
vqmovn.u16 d18, q10
|
||||||
|
vqmovn.u16 d19, q11
|
||||||
|
vld1.64 {d24-d27}, [r0,:128]!
|
||||||
|
vqmovn.u16 d20, q12
|
||||||
|
vqmovn.u16 d21, q13
|
||||||
|
vld1.64 {d28-d31}, [r0,:128]!
|
||||||
|
vqmovn.u16 d22, q14
|
||||||
|
vqmovn.u16 d23, q15
|
||||||
|
|
||||||
|
movrel r1, pmovmskb_byte
|
||||||
|
vld1.64 {d0-d1}, [r1,:128]
|
||||||
|
|
||||||
|
vtst.8 q8, q8 // all ones in each non-zero byte
|
||||||
|
vtst.8 q9, q9
|
||||||
|
vtst.8 q10, q10
|
||||||
|
vtst.8 q11, q11
|
||||||
|
|
||||||
|
vand q8, q8, q0 // one distinct bit per byte within each group of 8
|
||||||
|
vand q9, q9, q0
|
||||||
|
vand q10, q10, q0
|
||||||
|
vand q11, q11, q0
|
||||||
|
|
||||||
|
// Three rounds of pairwise adds fold 64 bytes into one 64-bit mask.
vpadd.u8 d0, d16, d17
|
||||||
|
vpadd.u8 d1, d18, d19
|
||||||
|
vpadd.u8 d2, d20, d21
|
||||||
|
vpadd.u8 d3, d22, d23
|
||||||
|
vpadd.u8 d0, d0, d1
|
||||||
|
vpadd.u8 d1, d2, d3
|
||||||
|
vpadd.u8 d0, d0, d1
|
||||||
|
vclz.i32 d0, d0
|
||||||
|
mov ip, #31
|
||||||
|
vmov r0, r1, d0 // r0 = clz of coefficients 0-31, r1 = of 32-63
|
||||||
|
|
||||||
|
subs r1, ip, r1 // highest set bit of the upper word, -1 if none
|
||||||
|
addge r0, r1, #32
|
||||||
|
subslt r0, ip, r0 // otherwise highest set bit of the lower word
|
||||||
|
movlt r0, #0
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
// In: r0 = signed 16-bit coefficients (rewritten in place),
//     r1 = 32-bit accumulators, one per coefficient (updated in place),
//     r2 = unsigned 16-bit offsets, one per coefficient,
//     r3 = number of coefficients.
// For every coefficient c:
//     sum[i] += abs(c)
//     dct[i]  = sign(c) * max(abs(c) - offset[i], 0)
// Sixteen coefficients are processed per iteration and the loop runs while the
// remaining count is positive.
// NOTE(review): the count is presumably always a multiple of 16 - confirm
// against the callers.
function denoise_dct_neon
|
||||||
|
1: subs r3, r3, #16
|
||||||
|
vld1.16 {q0, q1}, [r0]
|
||||||
|
vld1.32 {q12, q13}, [r1]!
|
||||||
|
vld1.32 {q14, q15}, [r1]
|
||||||
|
sub r1, #32 // rewind to the start of this group of accumulators
|
||||||
|
vabs.s16 q8, q0
|
||||||
|
vabs.s16 q9, q1
|
||||||
|
vld1.16 {q2, q3}, [r2]!
|
||||||
|
vclt.s16 q10, q0, #0 // all ones where the coefficient is negative
|
||||||
|
vclt.s16 q11, q1, #0
|
||||||
|
vaddw.u16 q12, q12, d16 // accumulate the magnitudes into the 32-bit sums
|
||||||
|
vaddw.u16 q13, q13, d17
|
||||||
|
vqsub.u16 q0, q8, q2 // saturating subtract: never goes below zero
|
||||||
|
vqsub.u16 q1, q9, q3
|
||||||
|
vaddw.u16 q14, q14, d18
|
||||||
|
vaddw.u16 q15, q15, d19
|
||||||
|
vneg.s16 q8, q0
|
||||||
|
vneg.s16 q9, q1
|
||||||
|
vbsl q10, q8, q0 // negated value where the input was negative
|
||||||
|
vbsl q11, q9, q1
|
||||||
|
vst1.32 {q12, q13}, [r1]!
|
||||||
|
vst1.32 {q14, q15}, [r1]!
|
||||||
|
vst1.16 {q10, q11}, [r0]!
|
||||||
|
bgt 1b
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
71
common/arm/quant.h
Normal file
71
common/arm/quant.h
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* quant.h: arm quantization and level-run
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2005-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: David Conrad <lessen42@gmail.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_ARM_QUANT_H
|
||||||
|
#define X264_ARM_QUANT_H
|
||||||
|
|
||||||
|
#define x264_quant_2x2_dc_armv6 x264_template(quant_2x2_dc_armv6)
|
||||||
|
int x264_quant_2x2_dc_armv6( int16_t dct[4], int mf, int bias );
|
||||||
|
|
||||||
|
#define x264_quant_2x2_dc_neon x264_template(quant_2x2_dc_neon)
|
||||||
|
int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
|
||||||
|
#define x264_quant_4x4_dc_neon x264_template(quant_4x4_dc_neon)
|
||||||
|
int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
|
||||||
|
#define x264_quant_4x4_neon x264_template(quant_4x4_neon)
|
||||||
|
int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
|
||||||
|
#define x264_quant_4x4x4_neon x264_template(quant_4x4x4_neon)
|
||||||
|
int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
|
||||||
|
#define x264_quant_8x8_neon x264_template(quant_8x8_neon)
|
||||||
|
int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
|
||||||
|
|
||||||
|
#define x264_dequant_4x4_dc_neon x264_template(dequant_4x4_dc_neon)
|
||||||
|
void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
|
||||||
|
#define x264_dequant_4x4_neon x264_template(dequant_4x4_neon)
|
||||||
|
void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
|
||||||
|
#define x264_dequant_8x8_neon x264_template(dequant_8x8_neon)
|
||||||
|
void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
|
||||||
|
|
||||||
|
#define x264_decimate_score15_neon x264_template(decimate_score15_neon)
|
||||||
|
int x264_decimate_score15_neon( int16_t * );
|
||||||
|
#define x264_decimate_score16_neon x264_template(decimate_score16_neon)
|
||||||
|
int x264_decimate_score16_neon( int16_t * );
|
||||||
|
#define x264_decimate_score64_neon x264_template(decimate_score64_neon)
|
||||||
|
int x264_decimate_score64_neon( int16_t * );
|
||||||
|
|
||||||
|
#define x264_coeff_last4_arm x264_template(coeff_last4_arm)
|
||||||
|
int x264_coeff_last4_arm( int16_t * );
|
||||||
|
#define x264_coeff_last8_arm x264_template(coeff_last8_arm)
|
||||||
|
int x264_coeff_last8_arm( int16_t * );
|
||||||
|
#define x264_coeff_last15_neon x264_template(coeff_last15_neon)
|
||||||
|
int x264_coeff_last15_neon( int16_t * );
|
||||||
|
#define x264_coeff_last16_neon x264_template(coeff_last16_neon)
|
||||||
|
int x264_coeff_last16_neon( int16_t * );
|
||||||
|
#define x264_coeff_last64_neon x264_template(coeff_last64_neon)
|
||||||
|
int x264_coeff_last64_neon( int16_t * );
|
||||||
|
|
||||||
|
#define x264_denoise_dct_neon x264_template(denoise_dct_neon)
|
||||||
|
void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
|
||||||
|
|
||||||
|
#endif
|
||||||
1567
common/base.c
Normal file
1567
common/base.c
Normal file
File diff suppressed because it is too large
Load Diff
339
common/base.h
Normal file
339
common/base.h
Normal file
@@ -0,0 +1,339 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* base.h: misc common functions (bit depth independent)
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2003-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
|
||||||
|
* Loren Merritt <lorenm@u.washington.edu>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_BASE_H
|
||||||
|
#define X264_BASE_H
|
||||||
|
|
||||||
|
/****************************************************************************
|
||||||
|
* Macros (can be used in osdep.h)
|
||||||
|
****************************************************************************/
|
||||||
|
#define X264_MIN(a,b) ( (a)<(b) ? (a) : (b) )
|
||||||
|
#define X264_MAX(a,b) ( (a)>(b) ? (a) : (b) )
|
||||||
|
#define X264_MIN3(a,b,c) X264_MIN((a),X264_MIN((b),(c)))
|
||||||
|
#define X264_MAX3(a,b,c) X264_MAX((a),X264_MAX((b),(c)))
|
||||||
|
#define X264_MIN4(a,b,c,d) X264_MIN((a),X264_MIN3((b),(c),(d)))
|
||||||
|
#define X264_MAX4(a,b,c,d) X264_MAX((a),X264_MAX3((b),(c),(d)))
|
||||||
|
|
||||||
|
/****************************************************************************
|
||||||
|
* System includes
|
||||||
|
****************************************************************************/
|
||||||
|
#include "osdep.h"
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <limits.h>
|
||||||
|
|
||||||
|
/****************************************************************************
|
||||||
|
* Macros
|
||||||
|
****************************************************************************/
|
||||||
|
#define XCHG(type,a,b) do { type t = a; a = b; b = t; } while( 0 )
|
||||||
|
#define FIX8(f) ((int)(f*(1<<8)+.5))
|
||||||
|
#define ARRAY_ELEMS(a) ((int)((sizeof(a))/(sizeof(a[0]))))
|
||||||
|
#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
|
||||||
|
#define IS_DISPOSABLE(type) ( type == X264_TYPE_B )
|
||||||
|
|
||||||
|
/* Unions for type-punning.
|
||||||
|
* Mn: load or store n bits, aligned, native-endian
|
||||||
|
* CPn: copy n bits, aligned, native-endian
|
||||||
|
* we don't use memcpy for CPn because memcpy's args aren't assumed to be aligned */
|
||||||
|
typedef union { uint16_t i; uint8_t b[2]; } MAY_ALIAS x264_union16_t;
|
||||||
|
typedef union { uint32_t i; uint16_t w[2]; uint8_t b[4]; } MAY_ALIAS x264_union32_t;
|
||||||
|
typedef union { uint64_t i; uint32_t d[2]; uint16_t w[4]; uint8_t b[8]; } MAY_ALIAS x264_union64_t;
|
||||||
|
typedef struct { uint64_t i[2]; } x264_uint128_t;
|
||||||
|
typedef union { x264_uint128_t i; uint64_t q[2]; uint32_t d[4]; uint16_t w[8]; uint8_t b[16]; } MAY_ALIAS x264_union128_t;
|
||||||
|
#define M16(src) (((x264_union16_t*)(src))->i)
|
||||||
|
#define M32(src) (((x264_union32_t*)(src))->i)
|
||||||
|
#define M64(src) (((x264_union64_t*)(src))->i)
|
||||||
|
#define M128(src) (((x264_union128_t*)(src))->i)
|
||||||
|
#define M128_ZERO ((x264_uint128_t){{0,0}})
|
||||||
|
#define CP16(dst,src) M16(dst) = M16(src)
|
||||||
|
#define CP32(dst,src) M32(dst) = M32(src)
|
||||||
|
#define CP64(dst,src) M64(dst) = M64(src)
|
||||||
|
#define CP128(dst,src) M128(dst) = M128(src)
|
||||||
|
|
||||||
|
/* Macros for memory constraints of inline asm */
|
||||||
|
#if defined(__GNUC__) && __GNUC__ >= 8 && !defined(__clang__) && !defined(__INTEL_COMPILER)
|
||||||
|
#define MEM_FIX(x, t, s) (*(t (*)[s])(x))
|
||||||
|
#define MEM_DYN(x, t) (*(t (*)[])(x))
|
||||||
|
#else
|
||||||
|
//older versions of gcc prefer casting to structure instead of array
|
||||||
|
#define MEM_FIX(x, t, s) (*(struct { t a[s]; } MAY_ALIAS (*))(x))
|
||||||
|
//let's set an arbitrary large constant size
|
||||||
|
#define MEM_DYN(x, t) MEM_FIX(x, t, 4096)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/****************************************************************************
|
||||||
|
* Constants
|
||||||
|
****************************************************************************/
|
||||||
|
enum profile_e
|
||||||
|
{
|
||||||
|
PROFILE_BASELINE = 66,
|
||||||
|
PROFILE_MAIN = 77,
|
||||||
|
PROFILE_HIGH = 100,
|
||||||
|
PROFILE_HIGH10 = 110,
|
||||||
|
PROFILE_HIGH422 = 122,
|
||||||
|
PROFILE_HIGH444_PREDICTIVE = 244,
|
||||||
|
};
|
||||||
|
|
||||||
|
enum chroma_format_e
|
||||||
|
{
|
||||||
|
CHROMA_400 = 0,
|
||||||
|
CHROMA_420 = 1,
|
||||||
|
CHROMA_422 = 2,
|
||||||
|
CHROMA_444 = 3,
|
||||||
|
};
|
||||||
|
|
||||||
|
enum slice_type_e
|
||||||
|
{
|
||||||
|
SLICE_TYPE_P = 0,
|
||||||
|
SLICE_TYPE_B = 1,
|
||||||
|
SLICE_TYPE_I = 2,
|
||||||
|
};
|
||||||
|
|
||||||
|
static const char slice_type_to_char[] = { 'P', 'B', 'I' };
|
||||||
|
|
||||||
|
enum sei_payload_type_e
|
||||||
|
{
|
||||||
|
SEI_BUFFERING_PERIOD = 0,
|
||||||
|
SEI_PIC_TIMING = 1,
|
||||||
|
SEI_PAN_SCAN_RECT = 2,
|
||||||
|
SEI_FILLER = 3,
|
||||||
|
SEI_USER_DATA_REGISTERED = 4,
|
||||||
|
SEI_USER_DATA_UNREGISTERED = 5,
|
||||||
|
SEI_RECOVERY_POINT = 6,
|
||||||
|
SEI_DEC_REF_PIC_MARKING = 7,
|
||||||
|
SEI_FRAME_PACKING = 45,
|
||||||
|
SEI_MASTERING_DISPLAY = 137,
|
||||||
|
SEI_CONTENT_LIGHT_LEVEL = 144,
|
||||||
|
SEI_ALTERNATIVE_TRANSFER = 147,
|
||||||
|
};
|
||||||
|
|
||||||
|
#define X264_BFRAME_MAX 16
|
||||||
|
#define X264_REF_MAX 16
|
||||||
|
#define X264_THREAD_MAX 128
|
||||||
|
#define X264_LOOKAHEAD_THREAD_MAX 16
|
||||||
|
#define X264_LOOKAHEAD_MAX 250
|
||||||
|
|
||||||
|
// number of pixel rows (per thread) in progress at any given time.
|
||||||
|
// 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
|
||||||
|
#define X264_THREAD_HEIGHT 24
|
||||||
|
|
||||||
|
/* WEIGHTP_FAKE is set when mb_tree & psy are enabled, but normal weightp is disabled
|
||||||
|
* (such as in baseline). It checks for fades in lookahead and adjusts qp accordingly
|
||||||
|
* to increase quality. Defined as (-1) so that if(i_weighted_pred > 0) is true only when
|
||||||
|
* real weights are being used. */
|
||||||
|
|
||||||
|
#define X264_WEIGHTP_FAKE (-1)
|
||||||
|
|
||||||
|
#define X264_SCAN8_LUMA_SIZE (5*8)
|
||||||
|
#define X264_SCAN8_SIZE (X264_SCAN8_LUMA_SIZE*3)
|
||||||
|
#define X264_SCAN8_0 (4+1*8)
|
||||||
|
|
||||||
|
/* Scan8 organization:
|
||||||
|
* 0 1 2 3 4 5 6 7
|
||||||
|
* 0 DY y y y y y
|
||||||
|
* 1 y Y Y Y Y
|
||||||
|
* 2 y Y Y Y Y
|
||||||
|
* 3 y Y Y Y Y
|
||||||
|
* 4 y Y Y Y Y
|
||||||
|
* 5 DU u u u u u
|
||||||
|
* 6 u U U U U
|
||||||
|
* 7 u U U U U
|
||||||
|
* 8 u U U U U
|
||||||
|
* 9 u U U U U
|
||||||
|
* 10 DV v v v v v
|
||||||
|
* 11 v V V V V
|
||||||
|
* 12 v V V V V
|
||||||
|
* 13 v V V V V
|
||||||
|
* 14 v V V V V
|
||||||
|
* DY/DU/DV are for luma/chroma DC.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define LUMA_DC 48
|
||||||
|
#define CHROMA_DC 49
|
||||||
|
|
||||||
|
static const uint8_t x264_scan8[16*3 + 3] =
|
||||||
|
{
|
||||||
|
4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8,
|
||||||
|
6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8,
|
||||||
|
4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8,
|
||||||
|
6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8,
|
||||||
|
4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8,
|
||||||
|
6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8,
|
||||||
|
4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8,
|
||||||
|
6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8,
|
||||||
|
4+11*8, 5+11*8, 4+12*8, 5+12*8,
|
||||||
|
6+11*8, 7+11*8, 6+12*8, 7+12*8,
|
||||||
|
4+13*8, 5+13*8, 4+14*8, 5+14*8,
|
||||||
|
6+13*8, 7+13*8, 6+14*8, 7+14*8,
|
||||||
|
0+ 0*8, 0+ 5*8, 0+10*8
|
||||||
|
};
|
||||||
|
|
||||||
|
/****************************************************************************
|
||||||
|
* Includes
|
||||||
|
****************************************************************************/
|
||||||
|
#include "cpu.h"
|
||||||
|
#include "tables.h"
|
||||||
|
|
||||||
|
/****************************************************************************
|
||||||
|
* Inline functions
|
||||||
|
****************************************************************************/
|
||||||
|
/* Clamp v to the closed range [i_min, i_max].
 * The lower bound is tested first, so i_min wins if v is below it. */
static ALWAYS_INLINE int x264_clip3( int v, int i_min, int i_max )
{
    if( v < i_min )
        return i_min;
    if( v > i_max )
        return i_max;
    return v;
}
|
||||||
|
|
||||||
|
/* Floating-point counterpart of x264_clip3: clamp v to [f_min, f_max],
 * testing the lower bound first. */
static ALWAYS_INLINE double x264_clip3f( double v, double f_min, double f_max )
{
    if( v < f_min )
        return f_min;
    if( v > f_max )
        return f_max;
    return v;
}
|
||||||
|
|
||||||
|
/* Not a general-purpose function; multiplies input by -1/6 to convert
|
||||||
|
* qp to qscale. */
|
||||||
|
static ALWAYS_INLINE int x264_exp2fix8( float x )
|
||||||
|
{
|
||||||
|
int i = x*(-64.f/6.f) + 512.5f;
|
||||||
|
if( i < 0 ) return 0;
|
||||||
|
if( i > 1023 ) return 0xffff;
|
||||||
|
return (x264_exp2_lut[i&63]+256) << (i>>6) >> 8;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Table-driven log2 approximation.
 * x is shifted left by its leading-zero count; bits 24..30 of the result
 * index x264_log2_lut, and the leading-zero count indexes x264_log2_lz_lut. */
static ALWAYS_INLINE float x264_log2( uint32_t x )
{
    int lz = x264_clz( x );
    uint32_t normalized = x << lz;
    int idx = (normalized >> 24) & 0x7f;
    return x264_log2_lut[idx] + x264_log2_lz_lut[lz];
}
|
||||||
|
|
||||||
|
/* Branchless median of three ints.
 * Each "d & (d >> 31)" term evaluates to min(d, 0); this relies on >>31 of a
 * negative int producing an all-ones mask. */
static ALWAYS_INLINE int x264_median( int a, int b, int c )
{
    int d = a - b;
    int t = d & (d >> 31);  /* a-b if a < b, else 0 */
    a -= t;                 /* a = max of the original a, b */
    b += t;                 /* b = min of the original a, b */

    d = b - c;
    b -= d & (d >> 31);     /* b = max( b, c ) */

    d = a - b;
    b += d & (d >> 31);     /* b = min( a, b ) */
    return b;
}
|
||||||
|
|
||||||
|
/* Component-wise median of three two-element vectors a, b, c, stored to dst.
 * Element 0 is written before element 1 is computed. */
static ALWAYS_INLINE void x264_median_mv( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
{
    for( int i = 0; i < 2; i++ )
        dst[i] = x264_median( a[i], b[i], c[i] );
}
|
||||||
|
|
||||||
|
/* Sum of absolute differences (both components) between each pair of
 * consecutive entries in mvc[0..i_mvc-1]. Returns 0 when i_mvc < 2. */
static ALWAYS_INLINE int x264_predictor_difference( int16_t (*mvc)[2], intptr_t i_mvc )
{
    int total = 0;
    for( intptr_t n = 1; n < i_mvc; n++ )
    {
        int d0 = mvc[n-1][0] - mvc[n][0];
        int d1 = mvc[n-1][1] - mvc[n][1];
        total += abs( d0 ) + abs( d1 );
    }
    return total;
}
|
||||||
|
|
||||||
|
/* For each of the two components, add the left and top values and classify
 * the sum as 0 (<= 2), 1 (3..32) or 2 (> 32).
 * Component 0's class is returned in the low byte, component 1's in the high byte. */
static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvdtop )
{
    int sum0 = mvdleft[0] + mvdtop[0];
    int sum1 = mvdleft[1] + mvdtop[1];
    int class0 = (sum0 > 2) + (sum0 > 32);
    int class1 = (sum1 > 2) + (sum1 > 32);
    return class0 + (class1<<8);
}
|
||||||
|
|
||||||
|
/****************************************************************************
|
||||||
|
* General functions
|
||||||
|
****************************************************************************/
|
||||||
|
X264_API void x264_reduce_fraction( uint32_t *n, uint32_t *d );
|
||||||
|
X264_API void x264_reduce_fraction64( uint64_t *n, uint64_t *d );
|
||||||
|
|
||||||
|
X264_API void x264_log_default( void *p_unused, int i_level, const char *psz_fmt, va_list arg );
|
||||||
|
X264_API void x264_log_internal( int i_level, const char *psz_fmt, ... );
|
||||||
|
|
||||||
|
/* x264_malloc: will do or emulate a memalign
|
||||||
|
* you have to use x264_free for buffers allocated with x264_malloc */
|
||||||
|
X264_API void *x264_malloc( int64_t );
|
||||||
|
X264_API void x264_free( void * );
|
||||||
|
|
||||||
|
/* x264_slurp_file: malloc space for the whole file and read it */
|
||||||
|
X264_API char *x264_slurp_file( const char *filename );
|
||||||
|
|
||||||
|
/* x264_param_strdup: will do strdup and save returned pointer inside
|
||||||
|
* x264_param_t for later freeing during x264_param_cleanup */
|
||||||
|
char *x264_param_strdup( x264_param_t *param, const char *src );
|
||||||
|
|
||||||
|
/* x264_param2string: return a (malloced) string containing most of
|
||||||
|
* the encoding options */
|
||||||
|
X264_API char *x264_param2string( x264_param_t *p, int b_res );
|
||||||
|
|
||||||
|
/****************************************************************************
|
||||||
|
* Macros
|
||||||
|
****************************************************************************/
|
||||||
|
#define CHECKED_MALLOC( var, size )\
|
||||||
|
do {\
|
||||||
|
var = x264_malloc( size );\
|
||||||
|
if( !var )\
|
||||||
|
goto fail;\
|
||||||
|
} while( 0 )
|
||||||
|
#define CHECKED_MALLOCZERO( var, size )\
|
||||||
|
do {\
|
||||||
|
CHECKED_MALLOC( var, size );\
|
||||||
|
memset( var, 0, size );\
|
||||||
|
} while( 0 )
|
||||||
|
#define CHECKED_PARAM_STRDUP( var, param, src )\
|
||||||
|
do {\
|
||||||
|
var = x264_param_strdup( param, src );\
|
||||||
|
if( !var )\
|
||||||
|
goto fail;\
|
||||||
|
} while( 0 )
|
||||||
|
|
||||||
|
/* Macros for merging multiple allocations into a single large malloc, for improved
|
||||||
|
* use with huge pages. */
|
||||||
|
|
||||||
|
/* Needs to be enough to contain any set of buffers that use combined allocations */
|
||||||
|
#define PREALLOC_BUF_SIZE 1024
|
||||||
|
|
||||||
|
#define PREALLOC_INIT\
|
||||||
|
int prealloc_idx = 0;\
|
||||||
|
int64_t prealloc_size = 0;\
|
||||||
|
uint8_t **preallocs[PREALLOC_BUF_SIZE];
|
||||||
|
|
||||||
|
#define PREALLOC( var, size )\
|
||||||
|
do {\
|
||||||
|
var = (void*)(intptr_t)prealloc_size;\
|
||||||
|
preallocs[prealloc_idx++] = (uint8_t**)&var;\
|
||||||
|
prealloc_size += ALIGN((int64_t)(size), NATIVE_ALIGN);\
|
||||||
|
} while( 0 )
|
||||||
|
|
||||||
|
#define PREALLOC_END( ptr )\
|
||||||
|
do {\
|
||||||
|
CHECKED_MALLOC( ptr, prealloc_size );\
|
||||||
|
while( prealloc_idx-- )\
|
||||||
|
*preallocs[prealloc_idx] = (uint8_t*)((intptr_t)(*preallocs[prealloc_idx]) + (intptr_t)ptr);\
|
||||||
|
} while( 0 )
|
||||||
|
|
||||||
|
#endif
|
||||||
166
common/bitstream.c
Normal file
166
common/bitstream.c
Normal file
@@ -0,0 +1,166 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* bitstream.c: bitstream writing
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2003-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
|
||||||
|
* Fiona Glaser <fiona@x264.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/* Copy [src, end) to dst, inserting a 0x03 byte before any source byte <= 0x03
 * that follows two zero bytes in the output. Returns one past the last byte
 * written. The first two bytes are copied without inspection. */
static uint8_t *nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
{
    for( int i = 0; i < 2 && src < end; i++ )
        *dst++ = *src++;

    for( ; src < end; src++ )
    {
        /* dst[-2] and dst[-1] are the two most recently emitted bytes */
        if( *src <= 0x03 && !dst[-2] && !dst[-1] )
            *dst++ = 0x03;
        *dst++ = *src;
    }
    return dst;
}
|
||||||
|
|
||||||
|
#if HAVE_MMX
|
||||||
|
#include "x86/bitstream.h"
|
||||||
|
#endif
|
||||||
|
#if HAVE_ARMV6
|
||||||
|
#include "arm/bitstream.h"
|
||||||
|
#endif
|
||||||
|
#if HAVE_AARCH64
|
||||||
|
#include "aarch64/bitstream.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/****************************************************************************
|
||||||
|
* x264_nal_encode:
|
||||||
|
****************************************************************************/
|
||||||
|
/* Serialize one NAL unit into dst: start code (Annex B) or a 4-byte size
 * prefix, the one-byte NAL header, then the escaped payload.
 * On return nal->p_payload / nal->i_payload describe the bytes written to dst,
 * and nal->i_padding is updated when an AVC-Intra class is set. */
void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal )
{
    uint8_t *const out_start = dst;
    uint8_t *payload     = nal->p_payload;
    uint8_t *payload_end = payload + nal->i_payload;

    if( !h->param.b_annexb )
        dst += 4; /* save room for size later */
    else
    {
        /* 00 00 01, with one extra leading zero byte for the long start code */
        int zeros = nal->b_long_startcode ? 3 : 2;
        while( zeros-- )
            *dst++ = 0x00;
        *dst++ = 0x01;
    }

    /* nal header: top bit zero, then ref_idc, then type */
    *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;

    dst = h->bsf.nal_escape( dst, payload, payload_end );
    int size = dst - out_start;

    /* Apply AVC-Intra padding */
    if( h->param.i_avcintra_class )
    {
        int padding = nal->i_payload + nal->i_padding + NALU_OVERHEAD - size;
        if( padding > 0 )
        {
            memset( dst, 0, padding );
            size += padding;
        }
        else
            padding = 0;
        nal->i_padding = padding;
    }

    /* Write the size header for mp4/etc */
    if( !h->param.b_annexb )
    {
        /* Size doesn't include the size of the header we're writing now. */
        int chunk_size = size - 4;
        for( int i = 0; i < 4; i++ )
            out_start[i] = (uint8_t)(chunk_size >> (24 - 8*i)); /* big-endian */
    }

    nal->i_payload = size;
    nal->p_payload = out_start;
    x264_emms();
}
|
||||||
|
|
||||||
|
/* Fill the bitstream function table for the given cpu flag set.
 * Assignments are ordered so that later (more specific) ones override
 * earlier ones. */
void x264_bitstream_init( uint32_t cpu, x264_bitstream_function_t *pf )
{
    /* Everything starts out NULL; only nal_escape gets a C fallback here. */
    memset( pf, 0, sizeof(*pf) );
    pf->nal_escape = nal_escape_c;

#if HAVE_MMX
#if ARCH_X86_64
    /* Installed unconditionally on x86-64 builds, before any cpu flag test. */
    pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2;
    pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2;
    pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2;
#endif

    if( cpu&X264_CPU_MMX2 )
        pf->nal_escape = x264_nal_escape_mmx2;
    if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
        pf->nal_escape = x264_nal_escape_sse2;

#if ARCH_X86_64
    if( cpu&X264_CPU_LZCNT )
    {
        pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_lzcnt;
        pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_lzcnt;
        pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_lzcnt;
    }

    if( cpu&X264_CPU_SSSE3 )
    {
        if( cpu&X264_CPU_LZCNT )
        {
            pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3_lzcnt;
            pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt;
        }
        else
        {
            pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3;
            pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3;
        }
    }

    if( cpu&X264_CPU_AVX2 )
    {
        pf->nal_escape = x264_nal_escape_avx2;
        pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2;
    }

    if( cpu&X264_CPU_AVX512 )
    {
        pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx512;
        pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_avx512;
        pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_avx512;
    }
#endif
#endif
#if HAVE_ARMV6 || HAVE_AARCH64
    if( cpu&X264_CPU_NEON )
        pf->nal_escape = x264_nal_escape_neon;
#endif
}
|
||||||
309
common/bitstream.h
Normal file
309
common/bitstream.h
Normal file
@@ -0,0 +1,309 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* bitstream.h: bitstream writing
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2003-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||||
|
* Fiona Glaser <fiona@x264.com>
|
||||||
|
* Laurent Aimar <fenrir@via.ecp.fr>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_BS_H
|
||||||
|
#define X264_BS_H
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
uint16_t i_bits;
|
||||||
|
uint8_t i_size;
|
||||||
|
/* Next level table to use */
|
||||||
|
uint8_t i_next;
|
||||||
|
} vlc_large_t;
|
||||||
|
|
||||||
|
typedef struct bs_s
|
||||||
|
{
|
||||||
|
uint8_t *p_start;
|
||||||
|
uint8_t *p;
|
||||||
|
uint8_t *p_end;
|
||||||
|
|
||||||
|
uintptr_t cur_bits;
|
||||||
|
int i_left; /* i_count number of available bits */
|
||||||
|
int i_bits_encoded; /* RD only */
|
||||||
|
} bs_t;
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
int32_t last;
|
||||||
|
int32_t mask;
|
||||||
|
ALIGNED_16( dctcoef level[18] );
|
||||||
|
} x264_run_level_t;
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
uint8_t *(*nal_escape)( uint8_t *dst, uint8_t *src, uint8_t *end );
|
||||||
|
void (*cabac_block_residual_internal)( dctcoef *l, int b_interlaced,
|
||||||
|
intptr_t ctx_block_cat, x264_cabac_t *cb );
|
||||||
|
void (*cabac_block_residual_rd_internal)( dctcoef *l, int b_interlaced,
|
||||||
|
intptr_t ctx_block_cat, x264_cabac_t *cb );
|
||||||
|
void (*cabac_block_residual_8x8_rd_internal)( dctcoef *l, int b_interlaced,
|
||||||
|
intptr_t ctx_block_cat, x264_cabac_t *cb );
|
||||||
|
} x264_bitstream_function_t;
|
||||||
|
|
||||||
|
#define x264_bitstream_init x264_template(bitstream_init)
|
||||||
|
void x264_bitstream_init( uint32_t cpu, x264_bitstream_function_t *pf );
|
||||||
|
|
||||||
|
/* A larger level table size theoretically could help a bit at extremely
|
||||||
|
* high bitrates, but the cost in cache is usually too high for it to be
|
||||||
|
* useful.
|
||||||
|
* This size appears to be optimal for QP18 encoding on a Nehalem CPU.
|
||||||
|
* FIXME: Do further testing? */
|
||||||
|
#define LEVEL_TABLE_SIZE 128
|
||||||
|
#define x264_level_token x264_template(level_token)
|
||||||
|
extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
|
||||||
|
|
||||||
|
/* The longest possible set of zero run codes sums to 25 bits. This leaves
|
||||||
|
* plenty of room for both the code (25 bits) and size (5 bits) in a uint32_t. */
|
||||||
|
|
||||||
|
#define x264_run_before x264_template(run_before)
|
||||||
|
extern uint32_t x264_run_before[1<<16];
|
||||||
|
|
||||||
|
/* Attach the writer to the buffer p_data[0..i_data).
 * The write pointer is rounded down to a 4-byte boundary; when p_data is not
 * aligned, the bytes already stored between that boundary and p_data are
 * loaded into cur_bits so they are kept when the word is written back. */
static inline void bs_init( bs_t *s, void *p_data, int i_data )
{
    uint8_t *data = (uint8_t*)p_data;
    int misalign = ((intptr_t)p_data & 3);

    s->p_start  = data - misalign;
    s->p        = s->p_start;
    s->p_end    = data + i_data;
    s->i_left   = (WORD_SIZE - misalign)*8;
    s->cur_bits = 0;
    if( misalign )
    {
        s->cur_bits = endian_fix32( M32(s->p) );
        s->cur_bits >>= (4-misalign)*8;
    }
}
|
||||||
|
/* Current write position in bits, measured from p_start:
 * bits held in cur_bits plus 8 bits per byte already stored. */
static inline int bs_pos( bs_t *s )
{
    return( (WORD_SIZE*8) - s->i_left + 8 * (s->p - s->p_start) );
}
|
||||||
|
|
||||||
|
/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
|
||||||
|
static inline void bs_flush( bs_t *s )
|
||||||
|
{
|
||||||
|
M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
|
||||||
|
s->p += WORD_SIZE - (s->i_left >> 3);
|
||||||
|
s->i_left = WORD_SIZE*8;
|
||||||
|
}
|
||||||
|
/* The inverse of bs_flush: prepare the bitstream to be written to again. */
|
||||||
|
static inline void bs_realign( bs_t *s )
|
||||||
|
{
|
||||||
|
int offset = ((intptr_t)s->p & 3);
|
||||||
|
if( offset )
|
||||||
|
{
|
||||||
|
s->p = (uint8_t*)s->p - offset;
|
||||||
|
s->i_left = (WORD_SIZE - offset)*8;
|
||||||
|
s->cur_bits = endian_fix32( M32(s->p) );
|
||||||
|
s->cur_bits >>= (4-offset)*8;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Append the i_count-bit value i_bits to the stream.
 * With an 8-byte word, bits accumulate in cur_bits and a 32-bit word is stored
 * whenever 32 or fewer free bits remain. Otherwise cur_bits holds at most
 * 32 bits and is stored as soon as it fills. */
static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
{
    if( WORD_SIZE != 8 )
    {
        if( i_count >= s->i_left )
        {
            /* fill the current word with the top bits, store it, keep the rest */
            i_count -= s->i_left;
            s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
            M32( s->p ) = endian_fix( s->cur_bits );
            s->p += 4;
            s->cur_bits = i_bits;
            s->i_left = 32 - i_count;
        }
        else
        {
            s->cur_bits = (s->cur_bits << i_count) | i_bits;
            s->i_left -= i_count;
        }
        return;
    }

    s->cur_bits = (s->cur_bits << i_count) | i_bits;
    s->i_left -= i_count;
    if( s->i_left > 32 )
        return;

#if WORDS_BIGENDIAN
    M32( s->p ) = s->cur_bits >> (32 - s->i_left);
#else
    M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
#endif
    s->i_left += 32;
    s->p += 4;
}
|
||||||
|
|
||||||
|
/* Special case to eliminate branch in normal bs_write. */
|
||||||
|
/* Golomb never writes an even-size code, so this is only used in slice headers. */
|
||||||
|
static inline void bs_write32( bs_t *s, uint32_t i_bits )
|
||||||
|
{
|
||||||
|
bs_write( s, 16, i_bits >> 16 );
|
||||||
|
bs_write( s, 16, i_bits );
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Append a single bit; a 32-bit word is stored once 32 bits have accumulated. */
static inline void bs_write1( bs_t *s, uint32_t i_bit )
{
    s->cur_bits = (s->cur_bits << 1) | i_bit;
    if( --s->i_left != WORD_SIZE*8-32 )
        return;

    M32( s->p ) = endian_fix32( s->cur_bits );
    s->p += 4;
    s->i_left = WORD_SIZE*8;
}
|
||||||
|
|
||||||
|
/* Pad with zero bits up to the next byte boundary, then flush. */
static inline void bs_align_0( bs_t *s )
{
    int pad = s->i_left & 7;
    bs_write( s, pad, 0 );
    bs_flush( s );
}
|
||||||
|
/* Pad with one bits up to the next byte boundary, then flush. */
static inline void bs_align_1( bs_t *s )
{
    int pad = s->i_left & 7;
    bs_write( s, pad, (1 << pad) - 1 );
    bs_flush( s );
}
|
||||||
|
/* Pad to the next byte boundary with a single one bit followed by zeros
 * (nothing is written when already aligned), then flush. */
static inline void bs_align_10( bs_t *s )
{
    int pad = s->i_left & 7;
    if( pad )
        bs_write( s, pad, 1 << (pad - 1) );
    bs_flush( s );
}
|
||||||
|
|
||||||
|
/* golomb functions */
|
||||||
|
|
||||||
|
static const uint8_t x264_ue_size_tab[256] =
|
||||||
|
{
|
||||||
|
1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
|
||||||
|
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||||
|
11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
|
||||||
|
11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
|
||||||
|
13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
|
||||||
|
13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
|
||||||
|
13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
|
||||||
|
13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
|
||||||
|
15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
|
||||||
|
15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
|
||||||
|
15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
|
||||||
|
15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
|
||||||
|
15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
|
||||||
|
15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
|
||||||
|
15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
|
||||||
|
15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Write val as an unsigned Exp-Golomb code, for values too large for
 * bs_write_ue(). The code is (size>>1) zero bits followed by val+1 in
 * (size>>1)+1 bits, where size is the total code length.
 *
 * tmp is unsigned: with a signed int, val+1 >= 2^31 became negative, skipped
 * both range reductions below and indexed x264_ue_size_tab out of bounds. */
static inline void bs_write_ue_big( bs_t *s, unsigned int val )
{
    int size = 0;
    unsigned int tmp = ++val;

    /* reduce tmp to a single byte, accounting for the bits shifted out */
    if( tmp >= 0x10000 )
    {
        size = 32;
        tmp >>= 16;
    }
    if( tmp >= 0x100 )
    {
        size += 16;
        tmp >>= 8;
    }
    size += x264_ue_size_tab[tmp];

    bs_write( s, size>>1, 0 );
    bs_write( s, (size>>1)+1, val );
}
|
||||||
|
|
||||||
|
/* Only works on values under 255. */
|
||||||
|
static inline void bs_write_ue( bs_t *s, int val )
|
||||||
|
{
|
||||||
|
bs_write( s, x264_ue_size_tab[val+1], val+1 );
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Write val as a signed Exp-Golomb code.
 * val <= 0 maps to 1-2*val and val > 0 maps to 2*val; the mapped value is
 * then written in the length given by x264_ue_size_tab. */
static inline void bs_write_se( bs_t *s, int val )
{
    /* Faster than (val <= 0 ? -val*2+1 : val*2) */
    /* 4 instructions on x86, 3 on ARM */
    int code = 1 - val*2;
    if( code < 0 )
        code = val*2;

    int size = code >= 0x100 ? 16 + x264_ue_size_tab[code>>8]
                             : x264_ue_size_tab[code];
    bs_write( s, size, code );
}
|
||||||
|
|
||||||
|
/* Write val as a single inverted bit when x == 1, otherwise as an unsigned
 * Exp-Golomb code via bs_write_ue(). */
static inline void bs_write_te( bs_t *s, int x, int val )
{
    if( x != 1 ) /* the x > 1 case */
    {
        bs_write_ue( s, val );
        return;
    }
    bs_write1( s, 1^val );
}
|
||||||
|
|
||||||
|
/* Write a one bit, then zero bits up to the next byte boundary. */
static inline void bs_rbsp_trailing( bs_t *s )
{
    bs_write1( s, 1 );
    int pad = s->i_left & 7; /* must be read after the stop bit has been written */
    bs_write( s, pad, 0 );
}
|
||||||
|
|
||||||
|
static ALWAYS_INLINE int bs_size_ue( unsigned int val )
|
||||||
|
{
|
||||||
|
return x264_ue_size_tab[val+1];
|
||||||
|
}
|
||||||
|
|
||||||
|
static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
|
||||||
|
{
|
||||||
|
if( val < 255 )
|
||||||
|
return x264_ue_size_tab[val+1];
|
||||||
|
else
|
||||||
|
return x264_ue_size_tab[(val+1)>>8] + 16;
|
||||||
|
}
|
||||||
|
|
||||||
|
static ALWAYS_INLINE int bs_size_se( int val )
|
||||||
|
{
|
||||||
|
int tmp = 1 - val*2;
|
||||||
|
if( tmp < 0 ) tmp = val*2;
|
||||||
|
if( tmp < 256 )
|
||||||
|
return x264_ue_size_tab[tmp];
|
||||||
|
else
|
||||||
|
return x264_ue_size_tab[tmp>>8]+16;
|
||||||
|
}
|
||||||
|
|
||||||
|
static ALWAYS_INLINE int bs_size_te( int x, int val )
|
||||||
|
{
|
||||||
|
if( x == 1 )
|
||||||
|
return 1;
|
||||||
|
else //if( x > 1 )
|
||||||
|
return x264_ue_size_tab[val+1];
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
184
common/cabac.c
Normal file
184
common/cabac.c
Normal file
@@ -0,0 +1,184 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* cabac.c: arithmetic coder
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2003-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
|
||||||
|
* Loren Merritt <lorenm@u.washington.edu>
|
||||||
|
* Fiona Glaser <fiona@x264.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
static uint8_t cabac_contexts[4][QP_MAX_SPEC+1][1024];
|
||||||
|
|
||||||
|
/* Precompute cabac_contexts[table][qp][ctx] for every qp in 0..QP_MAX_SPEC.
 * Table 0 is built from x264_cabac_context_init_I, tables 1..3 from
 * x264_cabac_context_init_PB[0..2]. Only the first 460 contexts are filled
 * unless CHROMA444 is set. */
void x264_cabac_init( x264_t *h )
{
    const int n_ctx = CHROMA444 ? 1024 : 460;

    for( int table = 0; table < 4; table++ )
    {
        const int8_t (*init)[1024][2];
        if( table == 0 )
            init = &x264_cabac_context_init_I;
        else
            init = &x264_cabac_context_init_PB[table-1];

        for( int qp = 0; qp <= QP_MAX_SPEC; qp++ )
            for( int ctx = 0; ctx < n_ctx; ctx++ )
            {
                int m = (*init)[ctx][0];
                int n = (*init)[ctx][1];
                int state = x264_clip3( ((m * qp) >> 4) + n, 1, 126 );
                /* stored form: min(state, 127-state) in the upper bits,
                 * bit 6 of state in the lowest bit */
                cabac_contexts[table][qp][ctx] = (X264_MIN( state, 127-state ) << 1) | (state >> 6);
            }
    }
}
|
||||||
|
|
||||||
|
/* Load cb->state from the precomputed table for this slice type, qp and model:
 * table 0 for I slices, table i_model+1 otherwise. */
void x264_cabac_context_init( x264_t *h, x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model )
{
    int table = i_slice_type == SLICE_TYPE_I ? 0 : i_model + 1;
    int n_ctx = CHROMA444 ? 1024 : 460;

    memcpy( cb->state, cabac_contexts[table][i_qp], n_ctx );
}
|
||||||
|
|
||||||
|
/* Reset the arithmetic coder state; the output pointers are left untouched. */
void x264_cabac_encode_init_core( x264_cabac_t *cb )
{
    cb->i_bytes_outstanding = 0;
    cb->i_queue = -9; // the first bit will be shifted away and not written
    cb->i_range = 0x01FE;
    cb->i_low   = 0;
}
|
||||||
|
|
||||||
|
/* Reset the coder state and point its output at the buffer [p_data, p_end). */
void x264_cabac_encode_init( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end )
{
    x264_cabac_encode_init_core( cb );
    cb->p_end   = p_end;
    cb->p_start = p_data;
    cb->p       = p_data;
}
|
||||||
|
|
||||||
|
/* Once i_queue is non-negative, move the finished bits out of i_low.
 * A 0xff byte is only counted (i_bytes_outstanding), because a later carry
 * could still change it. Any other byte resolves the carry: it is added to the
 * previous output byte, the counted bytes are written, then the new byte. */
static inline void cabac_putbyte( x264_cabac_t *cb )
{
    if( cb->i_queue < 0 )
        return;

    int out = cb->i_low >> (cb->i_queue+10);
    cb->i_low &= (0x400<<cb->i_queue)-1;
    cb->i_queue -= 8;

    if( (out & 0xff) == 0xff )
    {
        cb->i_bytes_outstanding++;
        return;
    }

    int carry = out >> 8;
    // notes carried over from the original author:
    // the carry cannot propagate to before the beginning of the stream,
    // because that would correspond to a probability > 1.
    // p[-1] is still touched even for the first byte, which is ok
    // because a slice header always comes before cabac data.
    // the carry can't go beyond the one byte, because any 0xff bytes
    // are in bytes_outstanding and thus not written yet.
    cb->p[-1] += carry;
    for( int pending = cb->i_bytes_outstanding; pending > 0; pending-- )
        *(cb->p++) = (uint8_t)(carry-1); // 0xff without a carry, 0x00 with one
    *(cb->p++) = (uint8_t)out;
    cb->i_bytes_outstanding = 0;
}
|
||||||
|
|
||||||
|
/* Renormalize: shift i_range and i_low left by the amount looked up in
 * x264_cabac_renorm_shift, account for the new bits in i_queue, and emit a
 * byte if one is ready. */
static inline void cabac_encode_renorm( x264_cabac_t *cb )
{
    int n = x264_cabac_renorm_shift[cb->i_range>>3];

    cb->i_low   <<= n;
    cb->i_range <<= n;
    cb->i_queue  += n;
    cabac_putbyte( cb );
}
|
||||||
|
|
||||||
|
/* Making custom versions of this function, even in asm, for the cases where
|
||||||
|
* b is known to be 0 or 1, proved to be somewhat useful on x86_32 with GCC 3.4
|
||||||
|
* but nearly useless with GCC 4.3 and worse than useless on x86_64. */
|
||||||
|
void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b )
|
||||||
|
{
|
||||||
|
int i_state = cb->state[i_ctx];
|
||||||
|
int i_range_lps = x264_cabac_range_lps[i_state>>1][(cb->i_range>>6)-4];
|
||||||
|
cb->i_range -= i_range_lps;
|
||||||
|
if( b != (i_state & 1) )
|
||||||
|
{
|
||||||
|
cb->i_low += cb->i_range;
|
||||||
|
cb->i_range = i_range_lps;
|
||||||
|
}
|
||||||
|
cb->state[i_ctx] = x264_cabac_transition[i_state][b];
|
||||||
|
cabac_encode_renorm( cb );
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Note: b is negated for this function */
|
||||||
|
void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b )
|
||||||
|
{
|
||||||
|
cb->i_low <<= 1;
|
||||||
|
cb->i_low += b & cb->i_range;
|
||||||
|
cb->i_queue += 1;
|
||||||
|
cabac_putbyte( cb );
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Prefix table used by x264_cabac_encode_ue_bypass(), indexed by
 * (k - exp_bits).  Entry n equals ((2<<n) - 3) << n. */
static const int bypass_lut[16] =
{
    -1,
    0x2,
    0x14,
    0x68,
    0x1d0,
    0x7a0,
    0x1f40,
    0x7e80,
    0x1fd00,
    0x7fa00,
    0x1ff400,
    0x7fe800,
    0x1ffd000,
    0x7ffa000,
    0x1fff4000,
    0x7ffe8000
};
|
||||||
|
|
||||||
|
/* Encode `val` in bypass mode as an exp-Golomb style code of order
 * `exp_bits`.
 *
 * The whole code word is first assembled in `code` (bypass_lut[] supplies
 * the prefix part), then fed to the coder most-significant bits first: the
 * first chunk carries ((len-1)&7)+1 bits, every later chunk carries 8. */
void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val )
{
    const uint32_t biased = val + (1<<exp_bits);
    const int msb = 31 - x264_clz( biased );
    const uint32_t code = ((uint32_t)bypass_lut[msb-exp_bits]<<exp_bits) + biased;
    int remaining = 2*msb+1-exp_bits;   /* bits still to be written */
    int chunk = ((remaining-1)&7)+1;    /* size of the first chunk */

    for( ;; )
    {
        remaining -= chunk;
        cb->i_low <<= chunk;
        cb->i_low += ((code>>remaining)&0xff) * cb->i_range;
        cb->i_queue += chunk;
        cabac_putbyte( cb );
        chunk = 8;
        if( remaining <= 0 )
            break;
    }
}
|
||||||
|
|
||||||
|
/* Terminal-bin step: take 2 off the range and renormalise.  i_low is not
 * modified here. */
void x264_cabac_encode_terminal_c( x264_cabac_t *cb )
{
    cb->i_range = cb->i_range - 2;
    cabac_encode_renorm( cb );
}
|
||||||
|
|
||||||
|
/* Finish the arithmetic code word and write out everything still pending.
 *
 * low is moved to the top of the final interval (range - 2), its lowest
 * bit is set, and 9 more bits are pushed through the queue with two
 * cabac_putbyte() calls.  The remaining bits are then aligned; bit 10 of
 * low is taken from a fixed 32-bit pattern indexed by the frame number
 * modulo 32 before the last cabac_putbyte().  Finally every byte still
 * counted in i_bytes_outstanding is emitted as 0xff. */
void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb )
{
    cb->i_low = ((cb->i_low + (cb->i_range - 2)) | 1) << 9;
    cb->i_queue += 9;
    cabac_putbyte( cb );
    cabac_putbyte( cb );

    cb->i_low <<= -cb->i_queue;
    {
        const int frame_bit = (0x35a4e4f5 >> (h->i_frame & 31)) & 1;
        cb->i_low |= frame_bit << 10;
    }
    cb->i_queue = 0;
    cabac_putbyte( cb );

    for( ; cb->i_bytes_outstanding > 0; cb->i_bytes_outstanding-- )
        *(cb->p++) = 0xff;
}
|
||||||
|
|
||||||
126
common/cabac.h
Normal file
126
common/cabac.h
Normal file
@@ -0,0 +1,126 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* cabac.h: arithmetic coder
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2003-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||||
|
* Laurent Aimar <fenrir@via.ecp.fr>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_CABAC_H
|
||||||
|
#define X264_CABAC_H
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
/* state */
|
||||||
|
int i_low;
|
||||||
|
int i_range;
|
||||||
|
|
||||||
|
/* bit stream */
|
||||||
|
int i_queue; //stored with an offset of -8 for faster asm
|
||||||
|
int i_bytes_outstanding;
|
||||||
|
|
||||||
|
uint8_t *p_start;
|
||||||
|
uint8_t *p;
|
||||||
|
uint8_t *p_end;
|
||||||
|
|
||||||
|
/* aligned for memcpy_aligned starting here */
|
||||||
|
ALIGNED_64( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
|
||||||
|
|
||||||
|
/* context */
|
||||||
|
uint8_t state[1024];
|
||||||
|
|
||||||
|
/* for 16-byte alignment */
|
||||||
|
uint8_t padding[12];
|
||||||
|
} x264_cabac_t;
|
||||||
|
|
||||||
|
/* init the contexts given i_slice_type, the quantif and the model */
|
||||||
|
#define x264_cabac_context_init x264_template(cabac_context_init)
|
||||||
|
void x264_cabac_context_init( x264_t *h, x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model );
|
||||||
|
|
||||||
|
#define x264_cabac_encode_init_core x264_template(cabac_encode_init_core)
|
||||||
|
void x264_cabac_encode_init_core( x264_cabac_t *cb );
|
||||||
|
#define x264_cabac_encode_init x264_template(cabac_encode_init)
|
||||||
|
void x264_cabac_encode_init( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end );
|
||||||
|
#define x264_cabac_encode_decision_c x264_template(cabac_encode_decision_c)
|
||||||
|
void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b );
|
||||||
|
#define x264_cabac_encode_decision_asm x264_template(cabac_encode_decision_asm)
|
||||||
|
void x264_cabac_encode_decision_asm( x264_cabac_t *cb, int i_ctx, int b );
|
||||||
|
#define x264_cabac_encode_bypass_c x264_template(cabac_encode_bypass_c)
|
||||||
|
void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b );
|
||||||
|
#define x264_cabac_encode_bypass_asm x264_template(cabac_encode_bypass_asm)
|
||||||
|
void x264_cabac_encode_bypass_asm( x264_cabac_t *cb, int b );
|
||||||
|
#define x264_cabac_encode_terminal_c x264_template(cabac_encode_terminal_c)
|
||||||
|
void x264_cabac_encode_terminal_c( x264_cabac_t *cb );
|
||||||
|
#define x264_cabac_encode_terminal_asm x264_template(cabac_encode_terminal_asm)
|
||||||
|
void x264_cabac_encode_terminal_asm( x264_cabac_t *cb );
|
||||||
|
#define x264_cabac_encode_ue_bypass x264_template(cabac_encode_ue_bypass)
|
||||||
|
void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val );
|
||||||
|
#define x264_cabac_encode_flush x264_template(cabac_encode_flush)
|
||||||
|
void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb );
|
||||||
|
|
||||||
|
/* Pick the implementation behind the generic encode names: the asm entry
 * points when either HAVE_MMX or HAVE_AARCH64 is set, the C ones
 * otherwise. */
#if HAVE_MMX || HAVE_AARCH64
#   define x264_cabac_encode_decision x264_cabac_encode_decision_asm
#   define x264_cabac_encode_bypass   x264_cabac_encode_bypass_asm
#   define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm
#else
#   define x264_cabac_encode_decision x264_cabac_encode_decision_c
#   define x264_cabac_encode_bypass   x264_cabac_encode_bypass_c
#   define x264_cabac_encode_terminal x264_cabac_encode_terminal_c
#endif
/* The "noup" name is an alias of the regular decision encoder here. */
#define x264_cabac_encode_decision_noup x264_cabac_encode_decision
|
||||||
|
|
||||||
|
/* Current position in bits: bytes written since p_start plus the bytes
 * still outstanding, times 8, plus the value of i_queue. */
static ALWAYS_INLINE int x264_cabac_pos( x264_cabac_t *cb )
{
    return cb->i_queue + 8 * (cb->i_bytes_outstanding + (cb->p - cb->p_start));
}
|
||||||
|
|
||||||
|
/* internal only. these don't write the bitstream, just calculate bit cost: */
|
||||||
|
|
||||||
|
/* Cost-only counterpart of a decision encode: adds the
 * x264_cabac_entropy[] cost of coding `b` in context `i_ctx` to
 * f8_bits_encoded and advances that context's state.  No bitstream output
 * is produced. */
static ALWAYS_INLINE void x264_cabac_size_decision( x264_cabac_t *cb, long i_ctx, long b )
{
    uint8_t *ctx = &cb->state[i_ctx];
    const int prev = *ctx;

    cb->f8_bits_encoded += x264_cabac_entropy[prev^b];
    *ctx = x264_cabac_transition[prev][b];
}
|
||||||
|
|
||||||
|
static ALWAYS_INLINE int x264_cabac_size_decision2( uint8_t *state, long b )
|
||||||
|
{
|
||||||
|
int i_state = *state;
|
||||||
|
*state = x264_cabac_transition[i_state][b];
|
||||||
|
return x264_cabac_entropy[i_state^b];
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Adds the cost of coding `b` in context `i_ctx` to f8_bits_encoded
 * without updating the context state. */
static ALWAYS_INLINE void x264_cabac_size_decision_noup( x264_cabac_t *cb, long i_ctx, long b )
{
    const int cur = cb->state[i_ctx];

    cb->f8_bits_encoded += x264_cabac_entropy[cur^b];
}
|
||||||
|
|
||||||
|
static ALWAYS_INLINE int x264_cabac_size_decision_noup2( uint8_t *state, long b )
|
||||||
|
{
|
||||||
|
return x264_cabac_entropy[*state^b];
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
44
common/common.c
Normal file
44
common/common.c
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* common.c: misc common functions
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2003-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||||
|
* Laurent Aimar <fenrir@via.ecp.fr>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/****************************************************************************
|
||||||
|
* x264_log:
|
||||||
|
****************************************************************************/
|
||||||
|
void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... )
|
||||||
|
{
|
||||||
|
if( !h || i_level <= h->param.i_log_level )
|
||||||
|
{
|
||||||
|
va_list arg;
|
||||||
|
va_start( arg, psz_fmt );
|
||||||
|
if( !h )
|
||||||
|
x264_log_default( NULL, i_level, psz_fmt, arg );
|
||||||
|
else
|
||||||
|
h->param.pf_log( h->param.p_log_private, i_level, psz_fmt, arg );
|
||||||
|
va_end( arg );
|
||||||
|
}
|
||||||
|
}
|
||||||
813
common/common.h
Normal file
813
common/common.h
Normal file
@@ -0,0 +1,813 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* common.h: misc common functions
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2003-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
|
||||||
|
* Loren Merritt <lorenm@u.washington.edu>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_COMMON_H
|
||||||
|
#define X264_COMMON_H
|
||||||
|
|
||||||
|
#include "base.h"
|
||||||
|
|
||||||
|
/* Macros for templating function calls according to bit depth:
 * x264_template(w) glues "x264", BIT_DEPTH and w together via x264_glue3. */
#define x264_template(w) x264_glue3(x264, BIT_DEPTH, w)

/****************************************************************************
 * API Templates
 ****************************************************************************/
#define x264_nal_encode                     x264_template(nal_encode)
#define x264_encoder_reconfig               x264_template(encoder_reconfig)
#define x264_encoder_parameters             x264_template(encoder_parameters)
#define x264_encoder_headers                x264_template(encoder_headers)
#define x264_encoder_encode                 x264_template(encoder_encode)
#define x264_encoder_close                  x264_template(encoder_close)
#define x264_encoder_delayed_frames         x264_template(encoder_delayed_frames)
#define x264_encoder_maximum_delayed_frames x264_template(encoder_maximum_delayed_frames)
#define x264_encoder_intra_refresh          x264_template(encoder_intra_refresh)
#define x264_encoder_invalidate_reference   x264_template(encoder_invalidate_reference)

/* This undef allows to rename the external symbol and force link failure in
 * case of incompatible libraries. Then the define enables templating as
 * above. */
#undef  x264_encoder_open
#define x264_encoder_open                   x264_template(encoder_open)
|
||||||
|
|
||||||
|
/****************************************************************************
 * Macros
 ****************************************************************************/
#define X264_PCM_COST (FRAME_SIZE(256*BIT_DEPTH)+16)
/* QP limits grow by 6 for every bit of depth above 8. */
#define QP_BD_OFFSET  (6*(BIT_DEPTH-8))
#define QP_MAX_SPEC   (51+QP_BD_OFFSET)
#define QP_MAX        (QP_MAX_SPEC+18)
/* Largest sample value representable at the configured bit depth. */
#define PIXEL_MAX     ((1 << BIT_DEPTH)-1)
// arbitrary, but low because SATD scores are 1/4 normal
#define X264_LOOKAHEAD_QP (12+QP_BD_OFFSET)
/* Clamp a QP to QP_MAX_SPEC. */
#define SPEC_QP(x) X264_MIN((x), QP_MAX_SPEC)

#define NALU_OVERHEAD   5 // startcode + NAL type costs 5 bytes per frame
#define FILLER_OVERHEAD (NALU_OVERHEAD+1)
/* One byte less than NALU_OVERHEAD when Annex B output is on, no AVC-Intra
 * class is set and (h->out.i_nal-1) is non-zero.  Expects `h` in scope. */
#define SEI_OVERHEAD    (NALU_OVERHEAD - (h->param.b_annexb && !h->param.i_avcintra_class && (h->out.i_nal-1)))
|
||||||
|
|
||||||
|
/* Interlacing queries.  With HAVE_INTERLACED they read encoder state
 * (an `h` must be in scope); without it they are the constant 0. */
#if HAVE_INTERLACED
#   define MB_INTERLACED    h->mb.b_interlaced
#   define SLICE_MBAFF      h->sh.b_mbaff
#   define PARAM_INTERLACED h->param.b_interlaced
#else
#   define MB_INTERLACED    0
#   define SLICE_MBAFF      0
#   define PARAM_INTERLACED 0
#endif
|
||||||
|
|
||||||
|
/* Chroma subsampling.  When CHROMA_FORMAT is predefined the shifts are
 * derived from it; otherwise format and shifts are read from `h`. */
#if defined(CHROMA_FORMAT)
#   define CHROMA_H_SHIFT (CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422)
#   define CHROMA_V_SHIFT (CHROMA_FORMAT == CHROMA_420)
#else
#   define CHROMA_FORMAT  h->sps->i_chroma_format_idc
#   define CHROMA_H_SHIFT h->mb.chroma_h_shift
#   define CHROMA_V_SHIFT h->mb.chroma_v_shift
#endif

/* Size of one chroma plane for a luma size s; 0 when CHROMA_FORMAT is 0. */
#define CHROMA_SIZE(s) (CHROMA_FORMAT ? (s)>>(CHROMA_H_SHIFT+CHROMA_V_SHIFT) : 0)
/* Luma size plus two chroma planes. */
#define FRAME_SIZE(s)  ((s)+2*CHROMA_SIZE(s))
#define CHROMA444      (CHROMA_FORMAT == CHROMA_444)
|
||||||
|
|
||||||
|
/* Sample and coefficient types.  With HIGH_BIT_DEPTH they are twice as
 * wide as in the default build. */
#if HIGH_BIT_DEPTH
    typedef uint16_t pixel;
    typedef uint64_t pixel4;    /* four pixels packed in one word */
    typedef int32_t  dctcoef;
    typedef uint32_t udctcoef;

#   define PIXEL_SPLAT_X4(x) ((x)*0x0001000100010001ULL)
#   define MPIXEL_X4(src)    M64(src)
#else
    typedef uint8_t  pixel;
    typedef uint32_t pixel4;    /* four pixels packed in one word */
    typedef int16_t  dctcoef;
    typedef uint16_t udctcoef;

#   define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
#   define MPIXEL_X4(src)    M32(src)
#endif

#define SIZEOF_PIXEL ((int)sizeof(pixel))

/* Copy four pixels from src to dst as a single pixel4-sized access. */
#define CPPIXEL_X4(dst,src) MPIXEL_X4(dst) = MPIXEL_X4(src)
|
||||||
|
|
||||||
|
/****************************************************************************
|
||||||
|
* Includes
|
||||||
|
****************************************************************************/
|
||||||
|
#if HAVE_OPENCL
|
||||||
|
#include "opencl.h"
|
||||||
|
#endif
|
||||||
|
#include "cabac.h"
|
||||||
|
#include "bitstream.h"
|
||||||
|
#include "set.h"
|
||||||
|
#include "predict.h"
|
||||||
|
#include "pixel.h"
|
||||||
|
#include "mc.h"
|
||||||
|
#include "frame.h"
|
||||||
|
#include "dct.h"
|
||||||
|
#include "quant.h"
|
||||||
|
#include "threadpool.h"
|
||||||
|
|
||||||
|
/****************************************************************************
|
||||||
|
* General functions
|
||||||
|
****************************************************************************/
|
||||||
|
|
||||||
|
/* log */
|
||||||
|
#define x264_log x264_template(log)
|
||||||
|
void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
|
||||||
|
|
||||||
|
#define x264_cavlc_init x264_template(cavlc_init)
|
||||||
|
void x264_cavlc_init( x264_t *h );
|
||||||
|
#define x264_cabac_init x264_template(cabac_init)
|
||||||
|
void x264_cabac_init( x264_t *h );
|
||||||
|
|
||||||
|
/* Clamp x to [0, PIXEL_MAX].
 * Values with no bits outside PIXEL_MAX pass through unchanged.  For the
 * others, (-x)>>31 is expected to give 0 when x is negative and all ones
 * when x is too large (this relies on an arithmetic right shift of
 * negative ints), which the mask turns into 0 or PIXEL_MAX. */
static ALWAYS_INLINE pixel x264_clip_pixel( int x )
{
    if( !(x & ~PIXEL_MAX) )
        return x;
    return (-x)>>31 & PIXEL_MAX;
}
|
||||||
|
|
||||||
|
/****************************************************************************
|
||||||
|
*
|
||||||
|
****************************************************************************/
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
x264_sps_t *sps;
|
||||||
|
x264_pps_t *pps;
|
||||||
|
|
||||||
|
int i_type;
|
||||||
|
int i_first_mb;
|
||||||
|
int i_last_mb;
|
||||||
|
|
||||||
|
int i_pps_id;
|
||||||
|
|
||||||
|
int i_frame_num;
|
||||||
|
|
||||||
|
int b_mbaff;
|
||||||
|
int b_field_pic;
|
||||||
|
int b_bottom_field;
|
||||||
|
|
||||||
|
int i_idr_pic_id; /* -1 if nal_type != 5 */
|
||||||
|
|
||||||
|
int i_poc;
|
||||||
|
int i_delta_poc_bottom;
|
||||||
|
|
||||||
|
int i_delta_poc[2];
|
||||||
|
int i_redundant_pic_cnt;
|
||||||
|
|
||||||
|
int b_direct_spatial_mv_pred;
|
||||||
|
|
||||||
|
int b_num_ref_idx_override;
|
||||||
|
int i_num_ref_idx_l0_active;
|
||||||
|
int i_num_ref_idx_l1_active;
|
||||||
|
|
||||||
|
int b_ref_pic_list_reordering[2];
|
||||||
|
struct
|
||||||
|
{
|
||||||
|
int idc;
|
||||||
|
int arg;
|
||||||
|
} ref_pic_list_order[2][X264_REF_MAX];
|
||||||
|
|
||||||
|
/* P-frame weighting */
|
||||||
|
int b_weighted_pred;
|
||||||
|
x264_weight_t weight[X264_REF_MAX*2][3];
|
||||||
|
|
||||||
|
int i_mmco_remove_from_end;
|
||||||
|
int i_mmco_command_count;
|
||||||
|
struct /* struct for future expansion */
|
||||||
|
{
|
||||||
|
int i_difference_of_pic_nums;
|
||||||
|
int i_poc;
|
||||||
|
} mmco[X264_REF_MAX];
|
||||||
|
|
||||||
|
int i_cabac_init_idc;
|
||||||
|
|
||||||
|
int i_qp;
|
||||||
|
int i_qp_delta;
|
||||||
|
int b_sp_for_swidth;
|
||||||
|
int i_qs_delta;
|
||||||
|
|
||||||
|
/* deblocking filter */
|
||||||
|
int i_disable_deblocking_filter_idc;
|
||||||
|
int i_alpha_c0_offset;
|
||||||
|
int i_beta_offset;
|
||||||
|
|
||||||
|
} x264_slice_header_t;
|
||||||
|
|
||||||
|
typedef struct x264_lookahead_t
|
||||||
|
{
|
||||||
|
volatile uint8_t b_exit_thread;
|
||||||
|
uint8_t b_thread_active;
|
||||||
|
uint8_t b_analyse_keyframe;
|
||||||
|
int i_last_keyframe;
|
||||||
|
int i_slicetype_length;
|
||||||
|
x264_frame_t *last_nonb;
|
||||||
|
x264_pthread_t thread_handle;
|
||||||
|
x264_sync_frame_list_t ifbuf;
|
||||||
|
x264_sync_frame_list_t next;
|
||||||
|
x264_sync_frame_list_t ofbuf;
|
||||||
|
} x264_lookahead_t;
|
||||||
|
|
||||||
|
typedef struct x264_ratecontrol_t x264_ratecontrol_t;
|
||||||
|
|
||||||
|
/* Table of byte indices for the left neighbour, four entries per kind of
 * data (see mb.left_index_table in x264_t). */
typedef struct x264_left_table_t
{
    uint8_t intra[4];
    uint8_t nnz[4];
    uint8_t nnz_chroma[4];
    uint8_t mv[4];
    uint8_t ref[4];
} x264_left_table_t;
|
||||||
|
|
||||||
|
/* Current frame stats */
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
/* MV bits (MV+Ref+Block Type) */
|
||||||
|
int i_mv_bits;
|
||||||
|
/* Texture bits (DCT coefs) */
|
||||||
|
int i_tex_bits;
|
||||||
|
/* ? */
|
||||||
|
int i_misc_bits;
|
||||||
|
/* MB type counts */
|
||||||
|
int i_mb_count[19];
|
||||||
|
int i_mb_count_i;
|
||||||
|
int i_mb_count_p;
|
||||||
|
int i_mb_count_skip;
|
||||||
|
int i_mb_count_8x8dct[2];
|
||||||
|
int i_mb_count_ref[2][X264_REF_MAX*2];
|
||||||
|
int i_mb_partition[17];
|
||||||
|
int i_mb_cbp[6];
|
||||||
|
int i_mb_pred_mode[4][13];
|
||||||
|
int i_mb_field[3];
|
||||||
|
/* Adaptive direct mv pred */
|
||||||
|
int i_direct_score[2];
|
||||||
|
/* Metrics */
|
||||||
|
int64_t i_ssd[3];
|
||||||
|
double f_ssim;
|
||||||
|
int i_ssim_cnt;
|
||||||
|
} x264_frame_stat_t;
|
||||||
|
|
||||||
|
struct x264_t
|
||||||
|
{
|
||||||
|
/* encoder parameters */
|
||||||
|
x264_param_t param;
|
||||||
|
/* opaque pointer to bit depth independent interface */
|
||||||
|
void *api;
|
||||||
|
|
||||||
|
x264_t *thread[X264_THREAD_MAX+1];
|
||||||
|
x264_t *lookahead_thread[X264_LOOKAHEAD_THREAD_MAX];
|
||||||
|
int b_thread_active;
|
||||||
|
int i_thread_phase; /* which thread to use for the next frame */
|
||||||
|
int i_thread_idx; /* which thread this is */
|
||||||
|
int i_threadslice_start; /* first row in this thread slice */
|
||||||
|
int i_threadslice_end; /* row after the end of this thread slice */
|
||||||
|
int i_threadslice_pass; /* which pass of encoding we are on */
|
||||||
|
x264_threadpool_t *threadpool;
|
||||||
|
x264_threadpool_t *lookaheadpool;
|
||||||
|
x264_pthread_mutex_t mutex;
|
||||||
|
x264_pthread_cond_t cv;
|
||||||
|
|
||||||
|
/* bitstream output */
|
||||||
|
struct
|
||||||
|
{
|
||||||
|
int i_nal;
|
||||||
|
int i_nals_allocated;
|
||||||
|
x264_nal_t *nal;
|
||||||
|
int i_bitstream; /* size of p_bitstream */
|
||||||
|
uint8_t *p_bitstream; /* will hold data for all nal */
|
||||||
|
bs_t bs;
|
||||||
|
} out;
|
||||||
|
|
||||||
|
uint8_t *nal_buffer;
|
||||||
|
int nal_buffer_size;
|
||||||
|
|
||||||
|
x264_t *reconfig_h;
|
||||||
|
int reconfig;
|
||||||
|
|
||||||
|
/**** thread synchronization starts here ****/
|
||||||
|
|
||||||
|
/* frame number/poc */
|
||||||
|
int i_frame;
|
||||||
|
int i_frame_num;
|
||||||
|
|
||||||
|
int i_thread_frames; /* Number of different frames being encoded by threads;
|
||||||
|
* 1 when sliced-threads is on. */
|
||||||
|
int i_nal_type;
|
||||||
|
int i_nal_ref_idc;
|
||||||
|
|
||||||
|
int64_t i_disp_fields; /* Number of displayed fields (both coded and implied via pic_struct) */
|
||||||
|
int i_disp_fields_last_frame;
|
||||||
|
int64_t i_prev_duration; /* Duration of previous frame */
|
||||||
|
int64_t i_coded_fields; /* Number of coded fields (both coded and implied via pic_struct) */
|
||||||
|
int64_t i_cpb_delay; /* Equal to number of fields preceding this field
|
||||||
|
* since last buffering_period SEI */
|
||||||
|
int64_t i_coded_fields_lookahead; /* Use separate counters for lookahead */
|
||||||
|
int64_t i_cpb_delay_lookahead;
|
||||||
|
|
||||||
|
int64_t i_cpb_delay_pir_offset;
|
||||||
|
int64_t i_cpb_delay_pir_offset_next;
|
||||||
|
|
||||||
|
int b_queued_intra_refresh;
|
||||||
|
int64_t i_last_idr_pts;
|
||||||
|
|
||||||
|
int i_idr_pic_id;
|
||||||
|
|
||||||
|
/* quantization matrix for decoding, [cqm][qp%6][coef] */
|
||||||
|
int (*dequant4_mf[4])[16]; /* [4][6][16] */
|
||||||
|
int (*dequant8_mf[4])[64]; /* [4][6][64] */
|
||||||
|
/* quantization matrix for trellis, [cqm][qp][coef] */
|
||||||
|
int (*unquant4_mf[4])[16]; /* [4][QP_MAX_SPEC+1][16] */
|
||||||
|
int (*unquant8_mf[4])[64]; /* [4][QP_MAX_SPEC+1][64] */
|
||||||
|
/* quantization matrix for deadzone */
|
||||||
|
udctcoef (*quant4_mf[4])[16]; /* [4][QP_MAX_SPEC+1][16] */
|
||||||
|
udctcoef (*quant8_mf[4])[64]; /* [4][QP_MAX_SPEC+1][64] */
|
||||||
|
udctcoef (*quant4_bias[4])[16]; /* [4][QP_MAX_SPEC+1][16] */
|
||||||
|
udctcoef (*quant8_bias[4])[64]; /* [4][QP_MAX_SPEC+1][64] */
|
||||||
|
udctcoef (*quant4_bias0[4])[16]; /* [4][QP_MAX_SPEC+1][16] */
|
||||||
|
udctcoef (*quant8_bias0[4])[64]; /* [4][QP_MAX_SPEC+1][64] */
|
||||||
|
udctcoef (*nr_offset_emergency)[4][64];
|
||||||
|
|
||||||
|
/* mv/ref/mode cost arrays. */
|
||||||
|
uint16_t *cost_mv[QP_MAX+1];
|
||||||
|
uint16_t *cost_mv_fpel[QP_MAX+1][4];
|
||||||
|
struct
|
||||||
|
{
|
||||||
|
uint16_t ref[QP_MAX+1][3][33];
|
||||||
|
uint16_t i4x4_mode[QP_MAX+1][17];
|
||||||
|
} *cost_table;
|
||||||
|
|
||||||
|
const uint8_t *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */
|
||||||
|
|
||||||
|
/* Slice header */
|
||||||
|
x264_slice_header_t sh;
|
||||||
|
|
||||||
|
/* SPS / PPS */
|
||||||
|
x264_sps_t sps[1];
|
||||||
|
x264_pps_t pps[1];
|
||||||
|
|
||||||
|
/* Slice header backup, for SEI_DEC_REF_PIC_MARKING */
|
||||||
|
int b_sh_backup;
|
||||||
|
x264_slice_header_t sh_backup;
|
||||||
|
|
||||||
|
/* cabac context */
|
||||||
|
x264_cabac_t cabac;
|
||||||
|
|
||||||
|
struct
|
||||||
|
{
|
||||||
|
/* Frames to be encoded (whose types have been decided) */
|
||||||
|
x264_frame_t **current;
|
||||||
|
/* Unused frames: 0 = fenc, 1 = fdec */
|
||||||
|
x264_frame_t **unused[2];
|
||||||
|
|
||||||
|
/* Unused blank frames (for duplicates) */
|
||||||
|
x264_frame_t **blank_unused;
|
||||||
|
|
||||||
|
/* frames used for reference + sentinels */
|
||||||
|
x264_frame_t *reference[X264_REF_MAX+2];
|
||||||
|
|
||||||
|
int i_last_keyframe; /* Frame number of the last keyframe */
|
||||||
|
int i_last_idr; /* Frame number of the last IDR (not RP)*/
|
||||||
|
int i_poc_last_open_gop; /* Poc of the I frame of the last open-gop. The value
|
||||||
|
* is only assigned during the period between that
|
||||||
|
* I frame and the next P or I frame, else -1 */
|
||||||
|
|
||||||
|
int i_input; /* Number of input frames already accepted */
|
||||||
|
|
||||||
|
int i_max_dpb; /* Number of frames allocated in the decoded picture buffer */
|
||||||
|
int i_max_ref0;
|
||||||
|
int i_max_ref1;
|
||||||
|
int i_delay; /* Number of frames buffered for B reordering */
|
||||||
|
int i_bframe_delay;
|
||||||
|
int64_t i_bframe_delay_time;
|
||||||
|
int64_t i_first_pts;
|
||||||
|
int64_t i_prev_reordered_pts[2];
|
||||||
|
int64_t i_largest_pts;
|
||||||
|
int64_t i_second_largest_pts;
|
||||||
|
int b_have_lowres; /* Whether 1/2 resolution luma planes are being used */
|
||||||
|
int b_have_sub8x8_esa;
|
||||||
|
} frames;
|
||||||
|
|
||||||
|
/* current frame being encoded */
|
||||||
|
x264_frame_t *fenc;
|
||||||
|
|
||||||
|
/* frame being reconstructed */
|
||||||
|
x264_frame_t *fdec;
|
||||||
|
|
||||||
|
/* references lists */
|
||||||
|
int i_ref[2];
|
||||||
|
x264_frame_t *fref[2][X264_REF_MAX+3];
|
||||||
|
x264_frame_t *fref_nearest[2];
|
||||||
|
int b_ref_reorder[2];
|
||||||
|
|
||||||
|
/* hrd */
|
||||||
|
int initial_cpb_removal_delay;
|
||||||
|
int initial_cpb_removal_delay_offset;
|
||||||
|
int64_t i_reordered_pts_delay;
|
||||||
|
|
||||||
|
/* Current MB DCT coeffs */
|
||||||
|
struct
|
||||||
|
{
|
||||||
|
ALIGNED_64( dctcoef luma16x16_dc[3][16] );
|
||||||
|
ALIGNED_16( dctcoef chroma_dc[2][8] );
|
||||||
|
// FIXME share memory?
|
||||||
|
ALIGNED_64( dctcoef luma8x8[12][64] );
|
||||||
|
ALIGNED_64( dctcoef luma4x4[16*3][16] );
|
||||||
|
} dct;
|
||||||
|
|
||||||
|
/* MB table and cache for current frame/mb */
|
||||||
|
struct
|
||||||
|
{
|
||||||
|
int i_mb_width;
|
||||||
|
int i_mb_height;
|
||||||
|
int i_mb_count; /* number of mbs in a frame */
|
||||||
|
|
||||||
|
/* Chroma subsampling */
|
||||||
|
int chroma_h_shift;
|
||||||
|
int chroma_v_shift;
|
||||||
|
|
||||||
|
/* Strides */
|
||||||
|
int i_mb_stride;
|
||||||
|
int i_b8_stride;
|
||||||
|
int i_b4_stride;
|
||||||
|
int left_b8[2];
|
||||||
|
int left_b4[2];
|
||||||
|
|
||||||
|
/* Current index */
|
||||||
|
int i_mb_x;
|
||||||
|
int i_mb_y;
|
||||||
|
int i_mb_xy;
|
||||||
|
int i_b8_xy;
|
||||||
|
int i_b4_xy;
|
||||||
|
|
||||||
|
/* Search parameters */
|
||||||
|
int i_me_method;
|
||||||
|
int i_subpel_refine;
|
||||||
|
int b_chroma_me;
|
||||||
|
int b_trellis;
|
||||||
|
int b_noise_reduction;
|
||||||
|
int b_dct_decimate;
|
||||||
|
int i_psy_rd; /* Psy RD strength--fixed point value*/
|
||||||
|
int i_psy_trellis; /* Psy trellis strength--fixed point value*/
|
||||||
|
|
||||||
|
int b_interlaced;
|
||||||
|
int b_adaptive_mbaff; /* MBAFF+subme 0 requires non-adaptive MBAFF i.e. all field mbs */
|
||||||
|
|
||||||
|
/* Allowed qpel MV range to stay within the picture + emulated edge pixels */
|
||||||
|
int mv_min[2];
|
||||||
|
int mv_max[2];
|
||||||
|
int mv_miny_row[3]; /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
|
||||||
|
int mv_maxy_row[3];
|
||||||
|
/* Subpel MV range for motion search.
|
||||||
|
* same mv_min/max but includes levels' i_mv_range. */
|
||||||
|
int mv_min_spel[2];
|
||||||
|
int mv_max_spel[2];
|
||||||
|
int mv_miny_spel_row[3];
|
||||||
|
int mv_maxy_spel_row[3];
|
||||||
|
/* Fullpel MV range for motion search */
|
||||||
|
ALIGNED_8( int16_t mv_limit_fpel[2][2] ); /* min_x, min_y, max_x, max_y */
|
||||||
|
int mv_miny_fpel_row[3];
|
||||||
|
int mv_maxy_fpel_row[3];
|
||||||
|
|
||||||
|
/* neighboring MBs */
|
||||||
|
unsigned int i_neighbour;
|
||||||
|
unsigned int i_neighbour8[4]; /* neighbours of each 8x8 or 4x4 block that are available */
|
||||||
|
unsigned int i_neighbour4[16]; /* at the time the block is coded */
|
||||||
|
unsigned int i_neighbour_intra; /* for constrained intra pred */
|
||||||
|
unsigned int i_neighbour_frame; /* ignoring slice boundaries */
|
||||||
|
int i_mb_type_top;
|
||||||
|
int i_mb_type_left[2];
|
||||||
|
int i_mb_type_topleft;
|
||||||
|
int i_mb_type_topright;
|
||||||
|
int i_mb_prev_xy;
|
||||||
|
int i_mb_left_xy[2];
|
||||||
|
int i_mb_top_xy;
|
||||||
|
int i_mb_topleft_xy;
|
||||||
|
int i_mb_topright_xy;
|
||||||
|
int i_mb_top_y;
|
||||||
|
int i_mb_topleft_y;
|
||||||
|
int i_mb_topright_y;
|
||||||
|
const x264_left_table_t *left_index_table;
|
||||||
|
int i_mb_top_mbpair_xy;
|
||||||
|
int topleft_partition;
|
||||||
|
int b_allow_skip;
|
||||||
|
int field_decoding_flag;
|
||||||
|
|
||||||
|
/**** thread synchronization ends here ****/
|
||||||
|
/* subsequent variables are either thread-local or constant,
|
||||||
|
* and won't be copied from one thread to another */
|
||||||
|
|
||||||
|
/* mb table */
|
||||||
|
uint8_t *base; /* base pointer for all malloced data in this mb */
|
||||||
|
int8_t *type; /* mb type */
|
||||||
|
uint8_t *partition; /* mb partition */
|
||||||
|
int8_t *qp; /* mb qp */
|
||||||
|
int16_t *cbp; /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x200 and 0x400: chroma dc, 0x1000 PCM (all set for PCM) */
|
||||||
|
int8_t (*intra4x4_pred_mode)[8]; /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
|
||||||
|
/* actually has only 7 entries; set to 8 for write-combining optimizations */
|
||||||
|
uint8_t (*non_zero_count)[16*3]; /* nzc. for I_PCM set to 16 */
|
||||||
|
int8_t *chroma_pred_mode; /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */
|
||||||
|
int16_t (*mv[2])[2]; /* mb mv. set to 0 for intra mb */
|
||||||
|
uint8_t (*mvd[2])[8][2]; /* absolute value of mb mv difference with predict, clipped to [0,33]. set to 0 if intra. cabac only */
|
||||||
|
int8_t *ref[2]; /* mb ref. set to -1 if non used (intra or Lx only) */
|
||||||
|
int16_t (*mvr[2][X264_REF_MAX*2])[2];/* 16x16 mv for each possible ref */
|
||||||
|
int8_t *skipbp; /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
|
||||||
|
int8_t *mb_transform_size; /* transform_size_8x8_flag of each mb */
|
||||||
|
int32_t *slice_table; /* sh->first_mb of the slice that the indexed mb is part of */
|
||||||
|
uint8_t *field;
|
||||||
|
|
||||||
|
/* buffer for weighted versions of the reference frames */
|
||||||
|
pixel *p_weight_buf[X264_REF_MAX];
|
||||||
|
|
||||||
|
/* current value */
|
||||||
|
int i_type;
|
||||||
|
int i_partition;
|
||||||
|
ALIGNED_4( uint8_t i_sub_partition[4] );
|
||||||
|
int b_transform_8x8;
|
||||||
|
|
||||||
|
int i_cbp_luma;
|
||||||
|
int i_cbp_chroma;
|
||||||
|
|
||||||
|
int i_intra16x16_pred_mode;
|
||||||
|
int i_chroma_pred_mode;
|
||||||
|
|
||||||
|
/* skip flags for i4x4 and i8x8
|
||||||
|
* 0 = encode as normal.
|
||||||
|
* 1 (non-RD only) = the DCT is still in h->dct, restore fdec and skip reconstruction.
|
||||||
|
* 2 (RD only) = the DCT has since been overwritten by RD; restore that too. */
|
||||||
|
int i_skip_intra;
|
||||||
|
/* skip flag for motion compensation */
|
||||||
|
/* if we've already done MC, we don't need to do it again */
|
||||||
|
int b_skip_mc;
|
||||||
|
/* set to true if we are re-encoding a macroblock. */
|
||||||
|
int b_reencode_mb;
|
||||||
|
int ip_offset; /* Used by PIR to offset the quantizer of intra-refresh blocks. */
|
||||||
|
int b_deblock_rdo;
|
||||||
|
int b_overflow; /* If CAVLC had a level code overflow during bitstream writing. */
|
||||||
|
|
||||||
|
struct
|
||||||
|
{
|
||||||
|
/* space for p_fenc and p_fdec */
|
||||||
|
#define FENC_STRIDE 16
|
||||||
|
#define FDEC_STRIDE 32
|
||||||
|
ALIGNED_64( pixel fenc_buf[48*FENC_STRIDE] );
|
||||||
|
ALIGNED_64( pixel fdec_buf[54*FDEC_STRIDE] );
|
||||||
|
|
||||||
|
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
|
||||||
|
ALIGNED_32( pixel i4x4_fdec_buf[16*16] );
|
||||||
|
ALIGNED_32( pixel i8x8_fdec_buf[16*16] );
|
||||||
|
ALIGNED_64( dctcoef i8x8_dct_buf[3][64] );
|
||||||
|
ALIGNED_64( dctcoef i4x4_dct_buf[15][16] );
|
||||||
|
uint32_t i4x4_nnz_buf[4];
|
||||||
|
uint32_t i8x8_nnz_buf[4];
|
||||||
|
|
||||||
|
/* Psy trellis DCT data */
|
||||||
|
ALIGNED_64( dctcoef fenc_dct8[4][64] );
|
||||||
|
ALIGNED_64( dctcoef fenc_dct4[16][16] );
|
||||||
|
|
||||||
|
/* Psy RD SATD/SA8D scores cache */
|
||||||
|
ALIGNED_64( uint32_t fenc_satd_cache[32] );
|
||||||
|
ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
|
||||||
|
|
||||||
|
int i4x4_cbp;
|
||||||
|
int i8x8_cbp;
|
||||||
|
|
||||||
|
/* pointer over mb of the frame to be compressed */
|
||||||
|
pixel *p_fenc[3]; /* y,u,v */
|
||||||
|
/* pointer to the actual source frame, not a block copy */
|
||||||
|
pixel *p_fenc_plane[3];
|
||||||
|
|
||||||
|
/* pointer over mb of the frame to be reconstructed */
|
||||||
|
pixel *p_fdec[3];
|
||||||
|
|
||||||
|
/* pointer over mb of the references */
|
||||||
|
int i_fref[2];
|
||||||
|
/* [12]: yN, yH, yV, yHV, (NV12 ? uv : I444 ? (uN, uH, uV, uHV, vN, ...)) */
|
||||||
|
pixel *p_fref[2][X264_REF_MAX*2][12];
|
||||||
|
pixel *p_fref_w[X264_REF_MAX*2]; /* weighted fullpel luma */
|
||||||
|
uint16_t *p_integral[2][X264_REF_MAX];
|
||||||
|
|
||||||
|
/* fref stride */
|
||||||
|
int i_stride[3];
|
||||||
|
} pic;
|
||||||
|
|
||||||
|
/* cache */
|
||||||
|
struct
|
||||||
|
{
|
||||||
|
/* real intra4x4_pred_mode if I_4X4 or I_8X8, I_PRED_4x4_DC if mb available, -1 if not */
|
||||||
|
ALIGNED_16( int8_t intra4x4_pred_mode[X264_SCAN8_LUMA_SIZE] );
|
||||||
|
|
||||||
|
/* i_non_zero_count if available else 0x80. intentionally misaligned by 8 for asm */
|
||||||
|
ALIGNED_8( uint8_t non_zero_count[X264_SCAN8_SIZE] );
|
||||||
|
|
||||||
|
/* -1 if unused, -2 if unavailable */
|
||||||
|
ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
|
||||||
|
|
||||||
|
/* 0 if not available */
|
||||||
|
ALIGNED_16( int16_t mv[2][X264_SCAN8_LUMA_SIZE][2] );
|
||||||
|
ALIGNED_8( uint8_t mvd[2][X264_SCAN8_LUMA_SIZE][2] );
|
||||||
|
|
||||||
|
/* 1 if SKIP or DIRECT. set only for B-frames + CABAC */
|
||||||
|
ALIGNED_4( int8_t skip[X264_SCAN8_LUMA_SIZE] );
|
||||||
|
|
||||||
|
ALIGNED_4( int16_t direct_mv[2][4][2] );
|
||||||
|
ALIGNED_4( int8_t direct_ref[2][4] );
|
||||||
|
int direct_partition;
|
||||||
|
ALIGNED_4( int16_t pskip_mv[2] );
|
||||||
|
|
||||||
|
/* number of neighbors (top and left) that used 8x8 dct */
|
||||||
|
int i_neighbour_transform_size;
|
||||||
|
int i_neighbour_skip;
|
||||||
|
|
||||||
|
/* neighbor CBPs */
|
||||||
|
int i_cbp_top;
|
||||||
|
int i_cbp_left;
|
||||||
|
|
||||||
|
/* extra data required for mbaff in mv prediction */
|
||||||
|
int16_t topright_mv[2][3][2];
|
||||||
|
int8_t topright_ref[2][3];
|
||||||
|
|
||||||
|
/* current mb deblock strength */
|
||||||
|
uint8_t (*deblock_strength)[8][4];
|
||||||
|
} cache;
|
||||||
|
|
||||||
|
/* */
|
||||||
|
int i_qp; /* current qp */
|
||||||
|
int i_chroma_qp;
|
||||||
|
int i_last_qp; /* last qp */
|
||||||
|
int i_last_dqp; /* last delta qp */
|
||||||
|
int b_variable_qp; /* whether qp is allowed to vary per macroblock */
|
||||||
|
int b_lossless;
|
||||||
|
int b_direct_auto_read; /* take stats for --direct auto from the 2pass log */
|
||||||
|
int b_direct_auto_write; /* analyse direct modes, to use and/or save */
|
||||||
|
|
||||||
|
/* lambda values */
|
||||||
|
int i_trellis_lambda2[2][2]; /* [luma,chroma][inter,intra] */
|
||||||
|
int i_psy_rd_lambda;
|
||||||
|
int i_chroma_lambda2_offset;
|
||||||
|
|
||||||
|
/* B_direct and weighted prediction */
|
||||||
|
int16_t dist_scale_factor_buf[2][2][X264_REF_MAX*2][4];
|
||||||
|
int16_t (*dist_scale_factor)[4];
|
||||||
|
int8_t bipred_weight_buf[2][2][X264_REF_MAX*2][4];
|
||||||
|
int8_t (*bipred_weight)[4];
|
||||||
|
/* maps fref1[0]'s ref indices into the current list0 */
|
||||||
|
#define map_col_to_list0(col) h->mb.map_col_to_list0[(col)+2]
|
||||||
|
int8_t map_col_to_list0[X264_REF_MAX+2];
|
||||||
|
int ref_blind_dupe; /* The index of the blind reference frame duplicate. */
|
||||||
|
int8_t deblock_ref_table[X264_REF_MAX*2+2];
|
||||||
|
#define deblock_ref_table(x) h->mb.deblock_ref_table[(x)+2]
|
||||||
|
} mb;
|
||||||
|
|
||||||
|
/* rate control encoding only */
|
||||||
|
x264_ratecontrol_t *rc;
|
||||||
|
|
||||||
|
/* stats */
|
||||||
|
struct
|
||||||
|
{
|
||||||
|
/* Cumulated stats */
|
||||||
|
|
||||||
|
/* per slice info */
|
||||||
|
int i_frame_count[3];
|
||||||
|
int64_t i_frame_size[3];
|
||||||
|
double f_frame_qp[3];
|
||||||
|
int i_consecutive_bframes[X264_BFRAME_MAX+1];
|
||||||
|
/* */
|
||||||
|
double f_ssd_global[3];
|
||||||
|
double f_psnr_average[3];
|
||||||
|
double f_psnr_mean_y[3];
|
||||||
|
double f_psnr_mean_u[3];
|
||||||
|
double f_psnr_mean_v[3];
|
||||||
|
double f_ssim_mean_y[3];
|
||||||
|
double f_frame_duration[3];
|
||||||
|
/* */
|
||||||
|
int64_t i_mb_count[3][19];
|
||||||
|
int64_t i_mb_partition[2][17];
|
||||||
|
int64_t i_mb_count_8x8dct[2];
|
||||||
|
int64_t i_mb_count_ref[2][2][X264_REF_MAX*2];
|
||||||
|
int64_t i_mb_cbp[6];
|
||||||
|
int64_t i_mb_pred_mode[4][13];
|
||||||
|
int64_t i_mb_field[3];
|
||||||
|
/* */
|
||||||
|
int i_direct_score[2];
|
||||||
|
int i_direct_frames[2];
|
||||||
|
/* num p-frames weighted */
|
||||||
|
int i_wpred[2];
|
||||||
|
|
||||||
|
/* Current frame stats */
|
||||||
|
x264_frame_stat_t frame;
|
||||||
|
} stat;
|
||||||
|
|
||||||
|
/* 0 = luma 4x4, 1 = luma 8x8, 2 = chroma 4x4, 3 = chroma 8x8 */
|
||||||
|
udctcoef (*nr_offset)[64];
|
||||||
|
uint32_t (*nr_residual_sum)[64];
|
||||||
|
uint32_t *nr_count;
|
||||||
|
|
||||||
|
ALIGNED_32( udctcoef nr_offset_denoise[4][64] );
|
||||||
|
ALIGNED_32( uint32_t nr_residual_sum_buf[2][4][64] );
|
||||||
|
uint32_t nr_count_buf[2][4];
|
||||||
|
|
||||||
|
uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */
|
||||||
|
|
||||||
|
/* Buffers that are allocated per-thread even in sliced threads. */
|
||||||
|
void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
|
||||||
|
void *scratch_buffer2; /* if the first one's already in use */
|
||||||
|
pixel *intra_border_backup[5][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
|
||||||
|
/* Deblock strength values are stored for each 4x4 partition. In MBAFF
|
||||||
|
* there are four extra values that need to be stored, located in [4][i]. */
|
||||||
|
uint8_t (*deblock_strength[2])[2][8][4];
|
||||||
|
|
||||||
|
/* CPU functions dependents */
|
||||||
|
x264_predict_t predict_16x16[4+3];
|
||||||
|
x264_predict8x8_t predict_8x8[9+3];
|
||||||
|
x264_predict_t predict_4x4[9+3];
|
||||||
|
x264_predict_t predict_chroma[4+3];
|
||||||
|
x264_predict_t predict_8x8c[4+3];
|
||||||
|
x264_predict_t predict_8x16c[4+3];
|
||||||
|
x264_predict_8x8_filter_t predict_8x8_filter;
|
||||||
|
|
||||||
|
x264_pixel_function_t pixf;
|
||||||
|
x264_mc_functions_t mc;
|
||||||
|
x264_dct_function_t dctf;
|
||||||
|
x264_zigzag_function_t zigzagf;
|
||||||
|
x264_zigzag_function_t zigzagf_interlaced;
|
||||||
|
x264_zigzag_function_t zigzagf_progressive;
|
||||||
|
x264_quant_function_t quantf;
|
||||||
|
x264_deblock_function_t loopf;
|
||||||
|
x264_bitstream_function_t bsf;
|
||||||
|
|
||||||
|
x264_lookahead_t *lookahead;
|
||||||
|
|
||||||
|
#if HAVE_OPENCL
|
||||||
|
x264_opencl_t opencl;
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Pairs an integer cost (field name: sad) with a two-component 16-bit motion vector. */
typedef struct
{
    int     sad;    /* cost value */
    int16_t mv[2];  /* vector components, stored as [0], [1] */
} mvsad_t;
|
||||||
|
|
||||||
|
// included at the end because it needs x264_t
|
||||||
|
#include "macroblock.h"
|
||||||
|
|
||||||
|
/* Filter and clip a list of motion vector candidates, with rounding.
 *
 * Each component of mvc[i] is reduced with (v + 2) >> 2. Candidates whose
 * packed, reduced value is zero or equal to pmv are dropped. The survivors
 * are clipped to [mv_limit[0], mv_limit[1]] per component and stored
 * contiguously in dst.
 *
 * Returns the number of vectors written to dst. */
static ALWAYS_INLINE int x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
{
    int n_kept = 0;
    for( int idx = 0; idx < i_mvc; idx++ )
    {
        int x = (mvc[idx][0] + 2) >> 2;
        int y = (mvc[idx][1] + 2) >> 2;
        uint32_t packed = pack16to32_mask( x, y );
        if( packed && packed != pmv )
        {
            dst[n_kept][0] = x264_clip3( x, mv_limit[0][0], mv_limit[1][0] );
            dst[n_kept][1] = x264_clip3( y, mv_limit[0][1], mv_limit[1][1] );
            n_kept++;
        }
    }
    return n_kept;
}
|
||||||
|
|
||||||
|
/* Filter and clip a list of motion vector candidates, without rounding.
 *
 * Candidates whose 32-bit value is zero or equal to pmv are dropped. The
 * survivors are clipped per component to mv_limit scaled by 4 and stored
 * contiguously in dst.
 *
 * Returns the number of vectors written to dst. */
static ALWAYS_INLINE int x264_predictor_clip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
{
    int cnt = 0;
    /* Scale by multiplication rather than "<< 2": mv_limit is signed, and
     * left-shifting a negative value is undefined behaviour in C. */
    int qpel_limit[4] = {mv_limit[0][0] * 4, mv_limit[0][1] * 4, mv_limit[1][0] * 4, mv_limit[1][1] * 4};
    for( int i = 0; i < i_mvc; i++ )
    {
        uint32_t mv = M32( mvc[i] );
        int mx = mvc[i][0];
        int my = mvc[i][1];
        if( !mv || mv == pmv ) continue;
        dst[cnt][0] = x264_clip3( mx, qpel_limit[0], qpel_limit[2] );
        dst[cnt][1] = x264_clip3( my, qpel_limit[1], qpel_limit[3] );
        cnt++;
    }
    return cnt;
}
|
||||||
|
|
||||||
|
#if ARCH_X86 || ARCH_X86_64
|
||||||
|
#include "x86/util.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "rectangle.h"
|
||||||
|
|
||||||
|
#endif
|
||||||
679
common/cpu.c
Normal file
679
common/cpu.c
Normal file
@@ -0,0 +1,679 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* cpu.c: cpu detection
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2003-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||||
|
* Laurent Aimar <fenrir@via.ecp.fr>
|
||||||
|
* Fiona Glaser <fiona@x264.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "base.h"
|
||||||
|
|
||||||
|
#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
|
||||||
|
#include <sys/auxv.h>
|
||||||
|
#endif
|
||||||
|
#if HAVE_SYSCONF
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif
|
||||||
|
#if SYS_LINUX
|
||||||
|
#include <sched.h>
|
||||||
|
#endif
|
||||||
|
#if SYS_BEOS
|
||||||
|
#include <kernel/OS.h>
|
||||||
|
#endif
|
||||||
|
#if SYS_MACOSX || SYS_FREEBSD || SYS_NETBSD || SYS_OPENBSD
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/sysctl.h>
|
||||||
|
#endif
|
||||||
|
#if SYS_OPENBSD
|
||||||
|
#include <machine/cpu.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
const x264_cpu_name_t x264_cpu_names[] =
|
||||||
|
{
|
||||||
|
#if ARCH_X86 || ARCH_X86_64
|
||||||
|
// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
|
||||||
|
#define MMX2 X264_CPU_MMX|X264_CPU_MMX2
|
||||||
|
{"MMX2", MMX2},
|
||||||
|
{"MMXEXT", MMX2},
|
||||||
|
{"SSE", MMX2|X264_CPU_SSE},
|
||||||
|
#define SSE2 MMX2|X264_CPU_SSE|X264_CPU_SSE2
|
||||||
|
{"SSE2Slow", SSE2|X264_CPU_SSE2_IS_SLOW},
|
||||||
|
{"SSE2", SSE2},
|
||||||
|
{"SSE2Fast", SSE2|X264_CPU_SSE2_IS_FAST},
|
||||||
|
{"LZCNT", SSE2|X264_CPU_LZCNT},
|
||||||
|
{"SSE3", SSE2|X264_CPU_SSE3},
|
||||||
|
{"SSSE3", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
|
||||||
|
{"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
|
||||||
|
{"SSE4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
|
||||||
|
{"SSE4.2", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
|
||||||
|
#define AVX SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX
|
||||||
|
{"AVX", AVX},
|
||||||
|
{"XOP", AVX|X264_CPU_XOP},
|
||||||
|
{"FMA4", AVX|X264_CPU_FMA4},
|
||||||
|
{"FMA3", AVX|X264_CPU_FMA3},
|
||||||
|
{"BMI1", AVX|X264_CPU_LZCNT|X264_CPU_BMI1},
|
||||||
|
{"BMI2", AVX|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2},
|
||||||
|
#define AVX2 AVX|X264_CPU_FMA3|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2|X264_CPU_AVX2
|
||||||
|
{"AVX2", AVX2},
|
||||||
|
{"AVX512", AVX2|X264_CPU_AVX512},
|
||||||
|
#undef AVX2
|
||||||
|
#undef AVX
|
||||||
|
#undef SSE2
|
||||||
|
#undef MMX2
|
||||||
|
{"Cache32", X264_CPU_CACHELINE_32},
|
||||||
|
{"Cache64", X264_CPU_CACHELINE_64},
|
||||||
|
{"SlowAtom", X264_CPU_SLOW_ATOM},
|
||||||
|
{"SlowPshufb", X264_CPU_SLOW_PSHUFB},
|
||||||
|
{"SlowPalignr", X264_CPU_SLOW_PALIGNR},
|
||||||
|
{"SlowShuffle", X264_CPU_SLOW_SHUFFLE},
|
||||||
|
{"UnalignedStack", X264_CPU_STACK_MOD4},
|
||||||
|
#elif ARCH_PPC
|
||||||
|
{"Altivec", X264_CPU_ALTIVEC},
|
||||||
|
#elif ARCH_ARM
|
||||||
|
{"ARMv6", X264_CPU_ARMV6},
|
||||||
|
{"NEON", X264_CPU_NEON},
|
||||||
|
{"FastNeonMRC", X264_CPU_FAST_NEON_MRC},
|
||||||
|
#elif ARCH_AARCH64
|
||||||
|
{"ARMv8", X264_CPU_ARMV8},
|
||||||
|
{"NEON", X264_CPU_NEON},
|
||||||
|
{"DotProd", X264_CPU_DOTPROD},
|
||||||
|
{"I8MM", X264_CPU_I8MM},
|
||||||
|
{"SVE", X264_CPU_SVE},
|
||||||
|
{"SVE2", X264_CPU_SVE2},
|
||||||
|
#elif ARCH_RISCV64
|
||||||
|
{"RVV", X264_CPU_RVV},
|
||||||
|
#elif ARCH_MIPS
|
||||||
|
{"MSA", X264_CPU_MSA},
|
||||||
|
#elif ARCH_LOONGARCH
|
||||||
|
{"LSX", X264_CPU_LSX},
|
||||||
|
{"LASX", X264_CPU_LASX},
|
||||||
|
#endif
|
||||||
|
{"", 0},
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Portable wrapper for reading one entry of the auxiliary vector.
 * Uses getauxval() or elf_aux_info(), whichever the build detected.
 * Returns 0 when neither interface is available. */
static unsigned long x264_getauxval( unsigned long type )
{
    unsigned long value = 0;
#if HAVE_GETAUXVAL
    value = getauxval( type );
#elif HAVE_ELF_AUX_INFO
    /* The result of elf_aux_info() is ignored; value stays 0 unless it is written. */
    elf_aux_info( type, &value, sizeof(value) );
#endif
    return value;
}
|
||||||
|
|
||||||
|
#if ((HAVE_ALTIVEC && SYS_LINUX) || (HAVE_ARMV6 && !HAVE_NEON)) && !(HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO)
|
||||||
|
#include <signal.h>
|
||||||
|
#include <setjmp.h>
|
||||||
|
static sigjmp_buf jmpbuf;
|
||||||
|
static volatile sig_atomic_t canjump = 0;
|
||||||
|
|
||||||
|
static void sigill_handler( int sig )
|
||||||
|
{
|
||||||
|
if( !canjump )
|
||||||
|
{
|
||||||
|
signal( sig, SIG_DFL );
|
||||||
|
raise( sig );
|
||||||
|
}
|
||||||
|
|
||||||
|
canjump = 0;
|
||||||
|
siglongjmp( jmpbuf, 1 );
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if HAVE_MMX
|
||||||
|
int x264_cpu_cpuid_test( void );
|
||||||
|
void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
|
||||||
|
uint64_t x264_cpu_xgetbv( int xcr );
|
||||||
|
|
||||||
|
uint32_t x264_cpu_detect( void )
|
||||||
|
{
|
||||||
|
uint32_t cpu = 0;
|
||||||
|
uint32_t eax, ebx, ecx, edx;
|
||||||
|
uint32_t vendor[4] = {0};
|
||||||
|
uint32_t max_extended_cap, max_basic_cap;
|
||||||
|
|
||||||
|
#if !ARCH_X86_64
|
||||||
|
if( !x264_cpu_cpuid_test() )
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
x264_cpu_cpuid( 0, &max_basic_cap, vendor+0, vendor+2, vendor+1 );
|
||||||
|
if( max_basic_cap == 0 )
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
|
||||||
|
if( edx&0x00800000 )
|
||||||
|
cpu |= X264_CPU_MMX;
|
||||||
|
else
|
||||||
|
return cpu;
|
||||||
|
if( edx&0x02000000 )
|
||||||
|
cpu |= X264_CPU_MMX2|X264_CPU_SSE;
|
||||||
|
if( edx&0x04000000 )
|
||||||
|
cpu |= X264_CPU_SSE2;
|
||||||
|
if( ecx&0x00000001 )
|
||||||
|
cpu |= X264_CPU_SSE3;
|
||||||
|
if( ecx&0x00000200 )
|
||||||
|
cpu |= X264_CPU_SSSE3|X264_CPU_SSE2_IS_FAST;
|
||||||
|
if( ecx&0x00080000 )
|
||||||
|
cpu |= X264_CPU_SSE4;
|
||||||
|
if( ecx&0x00100000 )
|
||||||
|
cpu |= X264_CPU_SSE42;
|
||||||
|
|
||||||
|
if( ecx&0x08000000 ) /* XGETBV supported and XSAVE enabled by OS */
|
||||||
|
{
|
||||||
|
uint64_t xcr0 = x264_cpu_xgetbv( 0 );
|
||||||
|
if( (xcr0&0x6) == 0x6 ) /* XMM/YMM state */
|
||||||
|
{
|
||||||
|
if( ecx&0x10000000 )
|
||||||
|
cpu |= X264_CPU_AVX;
|
||||||
|
if( ecx&0x00001000 )
|
||||||
|
cpu |= X264_CPU_FMA3;
|
||||||
|
|
||||||
|
if( max_basic_cap >= 7 )
|
||||||
|
{
|
||||||
|
x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx );
|
||||||
|
if( ebx&0x00000008 )
|
||||||
|
cpu |= X264_CPU_BMI1;
|
||||||
|
if( ebx&0x00000100 )
|
||||||
|
cpu |= X264_CPU_BMI2;
|
||||||
|
if( ebx&0x00000020 )
|
||||||
|
cpu |= X264_CPU_AVX2;
|
||||||
|
|
||||||
|
if( (xcr0&0xE0) == 0xE0 ) /* OPMASK/ZMM state */
|
||||||
|
{
|
||||||
|
if( (ebx&0xD0030000) == 0xD0030000 )
|
||||||
|
cpu |= X264_CPU_AVX512;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
|
||||||
|
max_extended_cap = eax;
|
||||||
|
|
||||||
|
if( max_extended_cap >= 0x80000001 )
|
||||||
|
{
|
||||||
|
x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
|
||||||
|
|
||||||
|
if( ecx&0x00000020 )
|
||||||
|
cpu |= X264_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */
|
||||||
|
if( ecx&0x00000040 ) /* SSE4a, AMD only */
|
||||||
|
{
|
||||||
|
int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
|
||||||
|
cpu |= X264_CPU_SSE2_IS_FAST; /* Phenom and later CPUs have fast SSE units */
|
||||||
|
if( family == 0x14 )
|
||||||
|
{
|
||||||
|
cpu &= ~X264_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
|
||||||
|
cpu |= X264_CPU_SSE2_IS_SLOW; /* Bobcat has 64-bit SIMD units */
|
||||||
|
cpu |= X264_CPU_SLOW_PALIGNR; /* palignr is insanely slow on Bobcat */
|
||||||
|
}
|
||||||
|
if( family == 0x16 )
|
||||||
|
{
|
||||||
|
cpu |= X264_CPU_SLOW_PSHUFB; /* Jaguar's pshufb isn't that slow, but it's slow enough
|
||||||
|
* compared to alternate instruction sequences that this
|
||||||
|
* is equal or faster on almost all such functions. */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if( cpu & X264_CPU_AVX )
|
||||||
|
{
|
||||||
|
if( ecx&0x00000800 ) /* XOP */
|
||||||
|
cpu |= X264_CPU_XOP;
|
||||||
|
if( ecx&0x00010000 ) /* FMA4 */
|
||||||
|
cpu |= X264_CPU_FMA4;
|
||||||
|
}
|
||||||
|
|
||||||
|
if( !strcmp((char*)vendor, "AuthenticAMD") )
|
||||||
|
{
|
||||||
|
if( edx&0x00400000 )
|
||||||
|
cpu |= X264_CPU_MMX2;
|
||||||
|
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) )
|
||||||
|
cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if( !strcmp((char*)vendor, "GenuineIntel") )
|
||||||
|
{
|
||||||
|
x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
|
||||||
|
int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
|
||||||
|
int model = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
|
||||||
|
if( family == 6 )
|
||||||
|
{
|
||||||
|
/* Detect Atom CPU */
|
||||||
|
if( model == 28 )
|
||||||
|
{
|
||||||
|
cpu |= X264_CPU_SLOW_ATOM;
|
||||||
|
cpu |= X264_CPU_SLOW_PSHUFB;
|
||||||
|
}
|
||||||
|
/* Conroe has a slow shuffle unit. Check the model number to make sure not
|
||||||
|
* to include crippled low-end Penryns and Nehalems that don't have SSE4. */
|
||||||
|
else if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE4) && model < 23 )
|
||||||
|
cpu |= X264_CPU_SLOW_SHUFFLE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
|
||||||
|
{
|
||||||
|
/* cacheline size is specified in 3 places, any of which may be missing */
|
||||||
|
x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
|
||||||
|
int cache = (ebx&0xff00)>>5; // cflush size
|
||||||
|
if( !cache && max_extended_cap >= 0x80000006 )
|
||||||
|
{
|
||||||
|
x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
|
||||||
|
cache = ecx&0xff; // cacheline size
|
||||||
|
}
|
||||||
|
if( !cache && max_basic_cap >= 2 )
|
||||||
|
{
|
||||||
|
// Cache and TLB Information
|
||||||
|
static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
|
||||||
|
static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67,
|
||||||
|
0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
|
||||||
|
uint32_t buf[4];
|
||||||
|
int max, i = 0;
|
||||||
|
do {
|
||||||
|
x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 );
|
||||||
|
max = buf[0]&0xff;
|
||||||
|
buf[0] &= ~0xff;
|
||||||
|
for( int j = 0; j < 4; j++ )
|
||||||
|
if( !(buf[j]>>31) )
|
||||||
|
while( buf[j] )
|
||||||
|
{
|
||||||
|
if( strchr( cache32_ids, buf[j]&0xff ) )
|
||||||
|
cache = 32;
|
||||||
|
if( strchr( cache64_ids, buf[j]&0xff ) )
|
||||||
|
cache = 64;
|
||||||
|
buf[j] >>= 8;
|
||||||
|
}
|
||||||
|
} while( ++i < max );
|
||||||
|
}
|
||||||
|
|
||||||
|
if( cache == 32 )
|
||||||
|
cpu |= X264_CPU_CACHELINE_32;
|
||||||
|
else if( cache == 64 )
|
||||||
|
cpu |= X264_CPU_CACHELINE_64;
|
||||||
|
else
|
||||||
|
x264_log_internal( X264_LOG_WARNING, "unable to determine cacheline size\n" );
|
||||||
|
}
|
||||||
|
|
||||||
|
#if STACK_ALIGNMENT < 16
|
||||||
|
cpu |= X264_CPU_STACK_MOD4;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return cpu;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif HAVE_ALTIVEC
|
||||||
|
|
||||||
|
#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
|
||||||
|
|
||||||
|
#define HWCAP_PPC_ALTIVEC (1U << 28)
|
||||||
|
|
||||||
|
uint32_t x264_cpu_detect( void )
|
||||||
|
{
|
||||||
|
uint32_t flags = 0;
|
||||||
|
|
||||||
|
unsigned long hwcap = x264_getauxval( AT_HWCAP );
|
||||||
|
|
||||||
|
if ( hwcap & HWCAP_PPC_ALTIVEC )
|
||||||
|
flags |= X264_CPU_ALTIVEC;
|
||||||
|
|
||||||
|
return flags;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif SYS_MACOSX || SYS_FREEBSD || SYS_NETBSD || SYS_OPENBSD
|
||||||
|
|
||||||
|
uint32_t x264_cpu_detect( void )
|
||||||
|
{
|
||||||
|
/* Thank you VLC */
|
||||||
|
uint32_t cpu = 0;
|
||||||
|
#if SYS_OPENBSD
|
||||||
|
int selectors[2] = { CTL_MACHDEP, CPU_ALTIVEC };
|
||||||
|
#elif SYS_MACOSX
|
||||||
|
int selectors[2] = { CTL_HW, HW_VECTORUNIT };
|
||||||
|
#endif
|
||||||
|
int has_altivec = 0;
|
||||||
|
size_t length = sizeof( has_altivec );
|
||||||
|
#if SYS_MACOSX || SYS_OPENBSD
|
||||||
|
int error = sysctl( selectors, 2, &has_altivec, &length, NULL, 0 );
|
||||||
|
#elif SYS_NETBSD
|
||||||
|
int error = sysctlbyname( "machdep.altivec", &has_altivec, &length, NULL, 0 );
|
||||||
|
#else
|
||||||
|
int error = sysctlbyname( "hw.altivec", &has_altivec, &length, NULL, 0 );
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if( error == 0 && has_altivec != 0 )
|
||||||
|
cpu |= X264_CPU_ALTIVEC;
|
||||||
|
|
||||||
|
return cpu;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif SYS_LINUX
|
||||||
|
|
||||||
|
uint32_t x264_cpu_detect( void )
|
||||||
|
{
|
||||||
|
#ifdef __NO_FPRS__
|
||||||
|
return 0;
|
||||||
|
#else
|
||||||
|
static void (*oldsig)( int );
|
||||||
|
|
||||||
|
oldsig = signal( SIGILL, sigill_handler );
|
||||||
|
if( sigsetjmp( jmpbuf, 1 ) )
|
||||||
|
{
|
||||||
|
signal( SIGILL, oldsig );
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
canjump = 1;
|
||||||
|
asm volatile( "mtspr 256, %0\n\t"
|
||||||
|
"vand 0, 0, 0\n\t"
|
||||||
|
:
|
||||||
|
: "r"(-1) );
|
||||||
|
canjump = 0;
|
||||||
|
|
||||||
|
signal( SIGILL, oldsig );
|
||||||
|
|
||||||
|
return X264_CPU_ALTIVEC;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
/* No AltiVec detection method is available on this platform: report no flags. */
uint32_t x264_cpu_detect( void )
{
    return 0;
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#elif HAVE_ARMV6
|
||||||
|
|
||||||
|
void x264_cpu_neon_test( void );
|
||||||
|
int x264_cpu_fast_neon_mrc_test( void );
|
||||||
|
|
||||||
|
#define HWCAP_ARM_NEON (1U << 12)
|
||||||
|
|
||||||
|
uint32_t x264_cpu_detect( void )
|
||||||
|
{
|
||||||
|
uint32_t flags = 0;
|
||||||
|
flags |= X264_CPU_ARMV6;
|
||||||
|
|
||||||
|
#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
|
||||||
|
unsigned long hwcap = x264_getauxval( AT_HWCAP );
|
||||||
|
|
||||||
|
if ( hwcap & HWCAP_ARM_NEON )
|
||||||
|
flags |= X264_CPU_NEON;
|
||||||
|
#else
|
||||||
|
// don't do this hack if compiled with -mfpu=neon
|
||||||
|
#if !HAVE_NEON
|
||||||
|
static void (* oldsig)( int );
|
||||||
|
oldsig = signal( SIGILL, sigill_handler );
|
||||||
|
if( sigsetjmp( jmpbuf, 1 ) )
|
||||||
|
{
|
||||||
|
signal( SIGILL, oldsig );
|
||||||
|
return flags;
|
||||||
|
}
|
||||||
|
|
||||||
|
canjump = 1;
|
||||||
|
x264_cpu_neon_test();
|
||||||
|
canjump = 0;
|
||||||
|
signal( SIGILL, oldsig );
|
||||||
|
#endif
|
||||||
|
|
||||||
|
flags |= X264_CPU_NEON;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// fast neon -> arm (Cortex-A9) detection relies on user access to the
|
||||||
|
// cycle counter; this assumes ARMv7 performance counters.
|
||||||
|
// NEON requires at least ARMv7, ARMv8 may require changes here, but
|
||||||
|
// hopefully this hacky detection method will have been replaced by then.
|
||||||
|
// Note that there is potential for a race condition if another program or
|
||||||
|
// x264 instance disables or reinits the counters while x264 is using them,
|
||||||
|
// which may result in incorrect detection and the counters stuck enabled.
|
||||||
|
// right now Apple does not seem to support performance counters for this test
|
||||||
|
// Don't test this on Windows; performance counters are readable, but
|
||||||
|
// the PMNC is not readable.
|
||||||
|
#if !defined(__MACH__) && !defined(_WIN32)
|
||||||
|
flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0;
|
||||||
|
#endif
|
||||||
|
// TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
|
||||||
|
return flags;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif HAVE_RISCV64
|
||||||
|
|
||||||
|
#define HWCAP_RISCV64_RVV (1 << ('V' - 'A'))
|
||||||
|
|
||||||
|
/* Detect the RISC-V vector extension.
 * With an auxiliary-vector interface, X264_CPU_RVV follows the
 * HWCAP_RISCV64_RVV bit of AT_HWCAP. Without one, it is reported only when
 * the build was configured with HAVE_RVV. */
uint32_t x264_cpu_detect( void )
{
#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
    const unsigned long caps = x264_getauxval( AT_HWCAP );

    return ( caps & HWCAP_RISCV64_RVV ) ? X264_CPU_RVV : 0;
#elif HAVE_RVV
    return X264_CPU_RVV;
#else
    return 0;
#endif
}
|
||||||
|
|
||||||
|
#elif HAVE_AARCH64
|
||||||
|
|
||||||
|
#if defined(__linux__) || HAVE_ELF_AUX_INFO
|
||||||
|
|
||||||
|
#define HWCAP_AARCH64_ASIMDDP (1U << 20)
|
||||||
|
#define HWCAP_AARCH64_SVE (1U << 22)
|
||||||
|
#define HWCAP2_AARCH64_SVE2 (1U << 1)
|
||||||
|
#define HWCAP2_AARCH64_I8MM (1U << 13)
|
||||||
|
|
||||||
|
static uint32_t detect_flags( void )
|
||||||
|
{
|
||||||
|
uint32_t flags = 0;
|
||||||
|
|
||||||
|
unsigned long hwcap = x264_getauxval( AT_HWCAP );
|
||||||
|
unsigned long hwcap2 = x264_getauxval( AT_HWCAP2 );
|
||||||
|
|
||||||
|
if ( hwcap & HWCAP_AARCH64_ASIMDDP )
|
||||||
|
flags |= X264_CPU_DOTPROD;
|
||||||
|
if ( hwcap2 & HWCAP2_AARCH64_I8MM )
|
||||||
|
flags |= X264_CPU_I8MM;
|
||||||
|
if ( hwcap & HWCAP_AARCH64_SVE )
|
||||||
|
flags |= X264_CPU_SVE;
|
||||||
|
if ( hwcap2 & HWCAP2_AARCH64_SVE2 )
|
||||||
|
flags |= X264_CPU_SVE2;
|
||||||
|
|
||||||
|
return flags;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif defined(__APPLE__)
|
||||||
|
#include <sys/sysctl.h>
|
||||||
|
|
||||||
|
/* Query an integer sysctl by name.
 * Returns the value read, or 0 if the sysctlbyname() call fails. */
static int have_feature( const char *feature )
{
    int value = 0;
    size_t len = sizeof(value);

    return sysctlbyname( feature, &value, &len, NULL, 0 ) ? 0 : value;
}
|
||||||
|
|
||||||
|
static uint32_t detect_flags( void )
|
||||||
|
{
|
||||||
|
uint32_t flags = 0;
|
||||||
|
|
||||||
|
if ( have_feature( "hw.optional.arm.FEAT_DotProd" ) )
|
||||||
|
flags |= X264_CPU_DOTPROD;
|
||||||
|
if ( have_feature( "hw.optional.arm.FEAT_I8MM" ) )
|
||||||
|
flags |= X264_CPU_I8MM;
|
||||||
|
/* No SVE and SVE2 feature detection available on Apple platforms. */
|
||||||
|
return flags;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif defined(_WIN32)
|
||||||
|
#include <windows.h>
|
||||||
|
|
||||||
|
/* Runtime AArch64 feature detection on Windows via IsProcessorFeaturePresent().
 * Each query is compiled in only when the SDK headers define the PF_* constant. */
static uint32_t detect_flags( void )
{
    uint32_t detected = 0;

#ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
    detected |= IsProcessorFeaturePresent( PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE ) ? X264_CPU_DOTPROD : 0;
#endif
#ifdef PF_ARM_SVE_INSTRUCTIONS_AVAILABLE
    detected |= IsProcessorFeaturePresent( PF_ARM_SVE_INSTRUCTIONS_AVAILABLE ) ? X264_CPU_SVE : 0;
#endif
#ifdef PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE
    detected |= IsProcessorFeaturePresent( PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE ) ? X264_CPU_SVE2 : 0;
#endif
#ifdef PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE
    /* There's no PF_* flag that indicates whether plain I8MM is available
     * or not. But if SVE_I8MM is available, that also implies that
     * regular I8MM is available. */
    detected |= IsProcessorFeaturePresent( PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE ) ? X264_CPU_I8MM : 0;
#endif
    return detected;
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
uint32_t x264_cpu_detect( void )
|
||||||
|
{
|
||||||
|
uint32_t flags = X264_CPU_ARMV8;
|
||||||
|
#if HAVE_NEON
|
||||||
|
flags |= X264_CPU_NEON;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// If these features are enabled unconditionally in the compiler, we can
|
||||||
|
// assume that they are available.
|
||||||
|
#ifdef __ARM_FEATURE_DOTPROD
|
||||||
|
flags |= X264_CPU_DOTPROD;
|
||||||
|
#endif
|
||||||
|
#ifdef __ARM_FEATURE_MATMUL_INT8
|
||||||
|
flags |= X264_CPU_I8MM;
|
||||||
|
#endif
|
||||||
|
#ifdef __ARM_FEATURE_SVE
|
||||||
|
flags |= X264_CPU_SVE;
|
||||||
|
#endif
|
||||||
|
#ifdef __ARM_FEATURE_SVE2
|
||||||
|
flags |= X264_CPU_SVE2;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Where possible, try to do runtime detection as well.
|
||||||
|
#if defined(__linux__) || HAVE_ELF_AUX_INFO || \
|
||||||
|
defined(__APPLE__) || defined(_WIN32)
|
||||||
|
flags |= detect_flags();
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return flags;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif HAVE_MSA
|
||||||
|
|
||||||
|
uint32_t x264_cpu_detect( void )
|
||||||
|
{
|
||||||
|
return X264_CPU_MSA;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif HAVE_LSX
|
||||||
|
|
||||||
|
#define LA_HWCAP_LSX ( 1U << 4 )
|
||||||
|
#define LA_HWCAP_LASX ( 1U << 5 )
|
||||||
|
|
||||||
|
uint32_t x264_cpu_detect( void )
|
||||||
|
{
|
||||||
|
uint32_t flags = 0;
|
||||||
|
uint32_t hwcap = (uint32_t)x264_getauxval( AT_HWCAP );
|
||||||
|
|
||||||
|
if( hwcap & LA_HWCAP_LSX )
|
||||||
|
flags |= X264_CPU_LSX;
|
||||||
|
if( hwcap & LA_HWCAP_LASX )
|
||||||
|
flags |= X264_CPU_LASX;
|
||||||
|
|
||||||
|
return flags;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
/* Fallback branch used when none of the architecture-specific branches
 * above applies: no optional CPU features are reported. */
uint32_t x264_cpu_detect( void )
{
    const uint32_t no_caps = 0;
    return no_caps;
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Returns the number of processors available to this process.
 *
 * The query used depends on the build configuration:
 *   - no thread support:   always 1
 *   - Windows:             x264_pthread_num_processors_np()
 *   - Linux:               number of CPUs in the scheduler affinity mask
 *   - BeOS:                system_info.cpu_count
 *   - macOS:               sysctl "hw.logicalcpu"
 *   - other POSIX systems: sysconf() online / configured processor count
 *
 * Wherever a failure of the query is detectable here, 1 is returned. */
int x264_cpu_num_processors( void )
{
#if !HAVE_THREAD
    return 1;

#elif SYS_WINDOWS
    return x264_pthread_num_processors_np();

#elif SYS_LINUX
    cpu_set_t p_aff;
    memset( &p_aff, 0, sizeof(p_aff) );
    if( sched_getaffinity( 0, sizeof(p_aff), &p_aff ) )
        return 1;
#if HAVE_CPU_COUNT
    return CPU_COUNT(&p_aff);
#else
    /* No CPU_COUNT(): count the set bits of the affinity mask by hand. */
    int np = 0;
    for( size_t bit = 0; bit < 8 * sizeof(p_aff); bit++ )
        np += (((uint8_t *)&p_aff)[bit / 8] >> (bit % 8)) & 1;
    return np;
#endif

#elif SYS_BEOS
    system_info info;
    get_system_info( &info );
    return info.cpu_count;

#elif SYS_MACOSX
    int ncpu;
    size_t length = sizeof( ncpu );
    if( sysctlbyname("hw.logicalcpu", &ncpu, &length, NULL, 0) )
    {
        ncpu = 1;
    }
    return ncpu;

#elif defined(_SC_NPROCESSORS_ONLN)
    /* sysconf() returns a long and reports failure as -1; never hand a
     * non-positive processor count back to the caller. */
    long online = sysconf( _SC_NPROCESSORS_ONLN );
    return online > 0 ? (int)online : 1;

#elif defined(_SC_NPROCESSORS_CONF)
    long configured = sysconf( _SC_NPROCESSORS_CONF );
    return configured > 0 ? (int)configured : 1;

#else
    return 1;
#endif
}
|
||||||
56
common/cpu.h
Normal file
56
common/cpu.h
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* cpu.h: cpu detection
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2004-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_CPU_H
|
||||||
|
#define X264_CPU_H
|
||||||
|
|
||||||
|
X264_API uint32_t x264_cpu_detect( void );
|
||||||
|
X264_API int x264_cpu_num_processors( void );
|
||||||
|
void x264_cpu_emms( void );
|
||||||
|
void x264_cpu_sfence( void );
|
||||||
|
#if HAVE_MMX
|
||||||
|
/* There is no way to forbid the compiler from using float instructions
|
||||||
|
* before the emms so miscompilation could theoretically occur in the
|
||||||
|
* unlikely event that the compiler reorders emms and float instructions. */
|
||||||
|
#if HAVE_X86_INLINE_ASM
|
||||||
|
/* Clobbering memory makes the compiler less likely to reorder code. */
|
||||||
|
#define x264_emms() asm volatile( "emms":::"memory","st","st(1)","st(2)", \
|
||||||
|
"st(3)","st(4)","st(5)","st(6)","st(7)" )
|
||||||
|
#else
|
||||||
|
#define x264_emms() x264_cpu_emms()
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#define x264_emms()
|
||||||
|
#endif
|
||||||
|
#define x264_sfence x264_cpu_sfence
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
const char *name;
|
||||||
|
uint32_t flags;
|
||||||
|
} x264_cpu_name_t;
|
||||||
|
X264_API extern const x264_cpu_name_t x264_cpu_names[];
|
||||||
|
|
||||||
|
#endif
|
||||||
1150
common/dct.c
Normal file
1150
common/dct.c
Normal file
File diff suppressed because it is too large
Load Diff
77
common/dct.h
Normal file
77
common/dct.h
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* dct.h: transform and zigzag
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2004-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_DCT_H
|
||||||
|
#define X264_DCT_H
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
// pix1 stride = FENC_STRIDE
|
||||||
|
// pix2 stride = FDEC_STRIDE
|
||||||
|
// p_dst stride = FDEC_STRIDE
|
||||||
|
void (*sub4x4_dct) ( dctcoef dct[16], pixel *pix1, pixel *pix2 );
|
||||||
|
void (*add4x4_idct)( pixel *p_dst, dctcoef dct[16] );
|
||||||
|
|
||||||
|
void (*sub8x8_dct) ( dctcoef dct[4][16], pixel *pix1, pixel *pix2 );
|
||||||
|
void (*sub8x8_dct_dc) ( dctcoef dct[4], pixel *pix1, pixel *pix2 );
|
||||||
|
void (*add8x8_idct) ( pixel *p_dst, dctcoef dct[4][16] );
|
||||||
|
void (*add8x8_idct_dc)( pixel *p_dst, dctcoef dct[4] );
|
||||||
|
|
||||||
|
void (*sub8x16_dct_dc)( dctcoef dct[8], pixel *pix1, pixel *pix2 );
|
||||||
|
|
||||||
|
void (*sub16x16_dct) ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 );
|
||||||
|
void (*add16x16_idct) ( pixel *p_dst, dctcoef dct[16][16] );
|
||||||
|
void (*add16x16_idct_dc)( pixel *p_dst, dctcoef dct[16] );
|
||||||
|
|
||||||
|
void (*sub8x8_dct8) ( dctcoef dct[64], pixel *pix1, pixel *pix2 );
|
||||||
|
void (*add8x8_idct8)( pixel *p_dst, dctcoef dct[64] );
|
||||||
|
|
||||||
|
void (*sub16x16_dct8) ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
|
||||||
|
void (*add16x16_idct8)( pixel *p_dst, dctcoef dct[4][64] );
|
||||||
|
|
||||||
|
void (*dct4x4dc) ( dctcoef d[16] );
|
||||||
|
void (*idct4x4dc)( dctcoef d[16] );
|
||||||
|
|
||||||
|
void (*dct2x4dc)( dctcoef dct[8], dctcoef dct4x4[8][16] );
|
||||||
|
|
||||||
|
} x264_dct_function_t;
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
void (*scan_8x8)( dctcoef level[64], dctcoef dct[64] );
|
||||||
|
void (*scan_4x4)( dctcoef level[16], dctcoef dct[16] );
|
||||||
|
int (*sub_8x8) ( dctcoef level[64], const pixel *p_src, pixel *p_dst );
|
||||||
|
int (*sub_4x4) ( dctcoef level[16], const pixel *p_src, pixel *p_dst );
|
||||||
|
int (*sub_4x4ac)( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
|
||||||
|
void (*interleave_8x8_cavlc)( dctcoef *dst, dctcoef *src, uint8_t *nnz );
|
||||||
|
|
||||||
|
} x264_zigzag_function_t;
|
||||||
|
|
||||||
|
#define x264_dct_init x264_template(dct_init)
|
||||||
|
void x264_dct_init( uint32_t cpu, x264_dct_function_t *dctf );
|
||||||
|
#define x264_zigzag_init x264_template(zigzag_init)
|
||||||
|
void x264_zigzag_init( uint32_t cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced );
|
||||||
|
|
||||||
|
#endif
|
||||||
851
common/deblock.c
Normal file
851
common/deblock.c
Normal file
@@ -0,0 +1,851 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* deblock.c: deblocking
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2003-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
|
||||||
|
* Loren Merritt <lorenm@u.washington.edu>
|
||||||
|
* Fiona Glaser <fiona@x264.com>
|
||||||
|
* Henrik Gramner <henrik@gramner.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/* Deblocking filter */
|
||||||
|
/* Alpha thresholds, accessed through alpha_table(x) = i_alpha_table[(x)+24].
 * Layout: 24 entries for x = -24..-1, 52 entries for x = 0..51 and
 * 12 entries for x = 52..63; the padding repeats the nearest end value. */
static const uint8_t i_alpha_table[52+12*3] =
{
    /* x = -24 .. -1 */
      0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,
    /* x = 0 .. 15 */
      0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,
    /* x = 16 .. 51 */
      4,   4,   5,   6,   7,   8,   9,  10,
     12,  13,  15,  17,  20,  22,  25,  28,
     32,  36,  40,  45,  50,  56,  63,  71,
     80,  90, 101, 113, 127, 144, 162, 182,
    203, 226, 255, 255,
    /* x = 52 .. 63 */
    255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255,
};
|
||||||
|
/* Beta thresholds, accessed through beta_table(x) = i_beta_table[(x)+24].
 * Same layout as i_alpha_table: 24 + 52 + 12 entries, with the padding
 * repeating the nearest end value. */
static const uint8_t i_beta_table[52+12*3] =
{
    /* x = -24 .. -1 */
     0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,
    /* x = 0 .. 15 */
     0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,
    /* x = 16 .. 51 */
     2,  2,  2,  3,  3,  3,  3,  4,
     4,  4,  6,  6,  7,  7,  8,  8,
     9,  9, 10, 10, 11, 11, 12, 12,
    13, 13, 14, 14, 15, 15, 16, 16,
    17, 17, 18, 18,
    /* x = 52 .. 63 */
    18, 18, 18, 18, 18, 18,
    18, 18, 18, 18, 18, 18,
};
|
||||||
|
/* tc0 values, accessed through tc0_table(x) = i_tc0_table[(x)+24] and then
 * indexed by a boundary-strength value 0..3.  Column 0 is always -1.
 * Layout: 24 rows for x = -24..-1, 52 rows for x = 0..51, 12 rows for
 * x = 52..63; the padding repeats the nearest end row. */
static const int8_t i_tc0_table[52+12*3][4] =
{
    /* x = -24 .. -1 */
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    /* x = 0 .. 15 */
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    /* x = 16 .. 51, four rows per line */
    {-1, 0, 0, 0 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 },
    {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 },
    {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 2, 3 },
    {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 },
    {-1, 4, 5, 7 }, {-1, 4, 5, 8 }, {-1, 4, 6, 9 }, {-1, 5, 7,10 },
    {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
    /* x = 52 .. 63 */
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
};
|
||||||
|
/* Table accessors: the argument is biased by 24 so that values in the
 * range -24..63 index the 88-entry tables without going out of bounds. */
#define alpha_table(idx) i_alpha_table[24+(idx)]
#define beta_table(idx)  i_beta_table[24+(idx)]
#define tc0_table(idx)   i_tc0_table[24+(idx)]
|
||||||
|
|
||||||
|
/* From ffmpeg */
|
||||||
|
/* Filter one line of samples across a luma edge (non-intra variant).
 *
 * pix      points at q0, the first sample after the edge
 * xstride  distance between neighbouring samples across the edge
 * alpha    threshold on |p0 - q0|
 * beta     threshold on |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|
 * tc0      base clipping value for this line
 *
 * Samples are named p2 p1 p0 | q0 q1 q2.  Nothing is written unless all
 * three edge thresholds hold. */
static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc0 )
{
    const int p2 = pix[-3*xstride];
    const int p1 = pix[-2*xstride];
    const int p0 = pix[-1*xstride];
    const int q0 = pix[ 0*xstride];
    const int q1 = pix[ 1*xstride];
    const int q2 = pix[ 2*xstride];

    if( abs( p0 - q0 ) >= alpha || abs( p1 - p0 ) >= beta || abs( q1 - q0 ) >= beta )
        return;

    /* rounded average of the two samples adjacent to the edge */
    const int edge_avg = (p0 + q0 + 1) >> 1;
    /* clip range for p0/q0; widened by one for each smooth side */
    int tc = tc0;

    if( abs( p2 - p0 ) < beta )
    {
        if( tc0 )
            pix[-2*xstride] = p1 + x264_clip3( ((p2 + edge_avg) >> 1) - p1, -tc0, tc0 ); /* p1' */
        tc++;
    }
    if( abs( q2 - q0 ) < beta )
    {
        if( tc0 )
            pix[ 1*xstride] = q1 + x264_clip3( ((q2 + edge_avg) >> 1) - q1, -tc0, tc0 ); /* q1' */
        tc++;
    }

    const int delta = x264_clip3( (4*(q0 - p0) + (p1 - q1) + 4) >> 3, -tc, tc );
    pix[-1*xstride] = x264_clip_pixel( p0 + delta ); /* p0' */
    pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */
}
|
||||||
|
/* Filter a 16-line luma edge as four groups of four lines.
 * Group g uses tc0[g]; a negative tc0[g] means the whole group is left
 * untouched.  ystride is the step from one line to the next, xstride the
 * step across the edge (forwarded to deblock_edge_luma_c). */
static inline void deblock_luma_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 )
{
    for( int group = 0; group < 4; group++ )
    {
        if( tc0[group] < 0 )
            continue;

        for( int line = 0; line < 4; line++ )
        {
            pixel *cur = pix + (4*group + line)*ystride;
            deblock_edge_luma_c( cur, xstride, alpha, beta, tc0[group] );
        }
    }
}
|
||||||
|
/* MBAFF variant of the luma edge filter with xstride fixed to 1: eight
 * lines `stride` apart, with each tc0 entry shared by two consecutive lines.
 * NOTE(review): unlike deblock_luma_c there is no tc0 < 0 skip here -
 * presumably the caller never passes negative values on this path; confirm. */
static void deblock_h_luma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
    for( int pair = 0; pair < 4; pair++ )
        for( int k = 0; k < 2; k++ )
            deblock_edge_luma_c( pix + (2*pair + k)*stride, 1, alpha, beta, tc0[pair] );
}
|
||||||
|
/* The "v" wrapper steps across the edge by `stride` and along it by 1. */
static void deblock_v_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
    const intptr_t across = stride; /* step between samples on either side of the edge */
    const intptr_t along  = 1;      /* step between successive lines along the edge */
    deblock_luma_c( pix, across, along, alpha, beta, tc0 );
}

/* The "h" wrapper steps across the edge by 1 and along it by `stride`. */
static void deblock_h_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
    const intptr_t across = 1;
    const intptr_t along  = stride;
    deblock_luma_c( pix, across, along, alpha, beta, tc0 );
}
|
||||||
|
|
||||||
|
/* Filter one line of samples across a chroma edge (non-intra variant).
 * Only p0 and q0 (the samples adjacent to the edge) are modified, by a
 * correction clipped to [-tc, tc].  Samples are named p1 p0 | q0 q1 and
 * pix points at q0. */
static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc )
{
    const int p1 = pix[-2*xstride];
    const int p0 = pix[-1*xstride];
    const int q0 = pix[ 0*xstride];
    const int q1 = pix[ 1*xstride];

    if( abs( p0 - q0 ) >= alpha || abs( p1 - p0 ) >= beta || abs( q1 - q0 ) >= beta )
        return;

    const int delta = x264_clip3( (4*(q0 - p0) + (p1 - q1) + 4) >> 3, -tc, tc );
    pix[-1*xstride] = x264_clip_pixel( p0 + delta ); /* p0' */
    pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */
}
|
||||||
|
/* Filter a chroma edge as four groups of `height` lines.
 * Each line holds two adjacent samples (pix[0] and pix[1]) that are both
 * filtered; consecutive lines are ystride apart.  Group g uses tc0[g] and
 * is skipped entirely when tc0[g] <= 0. */
static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 )
{
    for( int group = 0; group < 4; group++ )
    {
        if( tc0[group] <= 0 )
            continue;

        for( int line = 0; line < height; line++ )
        {
            pixel *cur = pix + (group*height + line)*ystride;
            for( int plane = 0; plane < 2; plane++ )
                deblock_edge_chroma_c( cur + plane, xstride, alpha, beta, tc0[group] );
        }
    }
}
|
||||||
|
/* Chroma edge wrappers around deblock_chroma_c().  They differ only in
 * the number of lines per tc0 group and in which of the two strides is
 * the caller's `stride` (the other one is the constant 2). */

/* 1 line per group, step across the edge = 2, step between lines = stride. */
static void deblock_h_chroma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
    const int lines_per_group = 1;
    const intptr_t across = 2;
    const intptr_t along  = stride;
    deblock_chroma_c( pix, lines_per_group, across, along, alpha, beta, tc0 );
}

/* 2 lines per group, step across the edge = stride, step between lines = 2. */
static void deblock_v_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
    const int lines_per_group = 2;
    const intptr_t across = stride;
    const intptr_t along  = 2;
    deblock_chroma_c( pix, lines_per_group, across, along, alpha, beta, tc0 );
}

/* 2 lines per group, step across the edge = 2, step between lines = stride. */
static void deblock_h_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
    const int lines_per_group = 2;
    const intptr_t across = 2;
    const intptr_t along  = stride;
    deblock_chroma_c( pix, lines_per_group, across, along, alpha, beta, tc0 );
}

/* 4 lines per group, step across the edge = 2, step between lines = stride. */
static void deblock_h_chroma_422_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
    const int lines_per_group = 4;
    const intptr_t across = 2;
    const intptr_t along  = stride;
    deblock_chroma_c( pix, lines_per_group, across, along, alpha, beta, tc0 );
}
|
||||||
|
|
||||||
|
/* Filter one line of samples across a luma edge (intra variant, no tc
 * clipping).  Samples are named p3 p2 p1 p0 | q0 q1 q2 q3 and pix points
 * at q0.
 *
 * Nothing is written unless the alpha/beta edge thresholds hold.  When
 * they do, each side is handled independently:
 *   - strong path (|p0-q0| < (alpha>>2)+2 and the side is smooth, i.e.
 *     |x2 - x0| < beta): three samples on that side are rewritten;
 *   - otherwise only the sample adjacent to the edge is rewritten. */
static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta )
{
    const int p2 = pix[-3*xstride];
    const int p1 = pix[-2*xstride];
    const int p0 = pix[-1*xstride];
    const int q0 = pix[ 0*xstride];
    const int q1 = pix[ 1*xstride];
    const int q2 = pix[ 2*xstride];

    if( abs( p0 - q0 ) >= alpha || abs( p1 - p0 ) >= beta || abs( q1 - q0 ) >= beta )
        return;

    const int small_step = abs( p0 - q0 ) < ((alpha >> 2) + 2);

    /* p side */
    if( small_step && abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
    {
        const int p3 = pix[-4*xstride];
        pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
        pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
        pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
    }
    else /* p0' */
    {
        pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
    }

    /* q side */
    if( small_step && abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
    {
        const int q3 = pix[3*xstride];
        pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
        pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
        pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
    }
    else /* q0' */
    {
        pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
    }
}
|
||||||
|
/* Apply the intra luma edge filter to 16 consecutive lines, ystride apart. */
static inline void deblock_luma_intra_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta )
{
    int remaining = 16;
    while( remaining-- > 0 )
    {
        deblock_edge_luma_intra_c( pix, xstride, alpha, beta );
        pix += ystride;
    }
}
|
||||||
|
/* MBAFF variant of the intra luma edge filter with xstride fixed to 1:
 * only 8 lines, ystride apart. */
static void deblock_h_luma_intra_mbaff_c( pixel *pix, intptr_t ystride, int alpha, int beta )
{
    int remaining = 8;
    while( remaining-- > 0 )
    {
        deblock_edge_luma_intra_c( pix, 1, alpha, beta );
        pix += ystride;
    }
}
|
||||||
|
/* The "v" wrapper steps across the edge by `stride` and along it by 1. */
static void deblock_v_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
    const intptr_t across = stride; /* step between samples on either side of the edge */
    const intptr_t along  = 1;      /* step between successive lines along the edge */
    deblock_luma_intra_c( pix, across, along, alpha, beta );
}

/* The "h" wrapper steps across the edge by 1 and along it by `stride`. */
static void deblock_h_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
    const intptr_t across = 1;
    const intptr_t along  = stride;
    deblock_luma_intra_c( pix, across, along, alpha, beta );
}
|
||||||
|
|
||||||
|
/* Filter one line of samples across a chroma edge (intra variant).
 * Samples are named p1 p0 | q0 q1 and pix points at q0.  When the
 * alpha/beta thresholds hold, p0 and q0 are replaced by fixed weighted
 * averages; otherwise nothing is written. */
static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta )
{
    const int p1 = pix[-2*xstride];
    const int p0 = pix[-1*xstride];
    const int q0 = pix[ 0*xstride];
    const int q1 = pix[ 1*xstride];

    if( abs( p0 - q0 ) >= alpha || abs( p1 - p0 ) >= beta || abs( q1 - q0 ) >= beta )
        return;

    pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2; /* p0' */
    pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */
}
|
||||||
|
/* Apply the intra chroma edge filter to `height` lines of `width`
 * adjacent samples each.
 * The start of each line is width + ystride - 2 samples after the start
 * of the previous one: that is exactly ystride when width == 2, and
 * ystride - 1 when width == 1. */
static ALWAYS_INLINE void deblock_chroma_intra_c( pixel *pix, int width, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta )
{
    pixel *line = pix;
    for( int row = 0; row < height; row++ )
    {
        for( int col = 0; col < width; col++ )
            deblock_edge_chroma_intra_c( line + col, xstride, alpha, beta );
        line += width + ystride - 2;
    }
}
|
||||||
|
/* Intra chroma edge wrappers around deblock_chroma_intra_c().  They differ
 * in the samples-per-line / line count and in which of the two strides is
 * the caller's `stride` (the other one is the constant 2). */

/* 4 lines of 2 samples, step across the edge = 2, line step = stride. */
static void deblock_h_chroma_intra_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
    const int samples = 2, lines = 4;
    const intptr_t across = 2;
    const intptr_t along  = stride;
    deblock_chroma_intra_c( pix, samples, lines, across, along, alpha, beta );
}

/* 16 lines of 1 sample, step across the edge = stride, line step = 2. */
static void deblock_v_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
    const int samples = 1, lines = 16;
    const intptr_t across = stride;
    const intptr_t along  = 2;
    deblock_chroma_intra_c( pix, samples, lines, across, along, alpha, beta );
}

/* 8 lines of 2 samples, step across the edge = 2, line step = stride. */
static void deblock_h_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
    const int samples = 2, lines = 8;
    const intptr_t across = 2;
    const intptr_t along  = stride;
    deblock_chroma_intra_c( pix, samples, lines, across, along, alpha, beta );
}

/* 16 lines of 2 samples, step across the edge = 2, line step = stride. */
static void deblock_h_chroma_422_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
    const int samples = 2, lines = 16;
    const intptr_t across = 2;
    const intptr_t along  = stride;
    deblock_chroma_intra_c( pix, samples, lines, across, along, alpha, beta );
}
|
||||||
|
|
||||||
|
static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
|
||||||
|
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit,
|
||||||
|
int bframe )
|
||||||
|
{
|
||||||
|
for( int dir = 0; dir < 2; dir++ )
|
||||||
|
{
|
||||||
|
int s1 = dir ? 1 : 8;
|
||||||
|
int s2 = dir ? 8 : 1;
|
||||||
|
for( int edge = 0; edge < 4; edge++ )
|
||||||
|
for( int i = 0, loc = X264_SCAN8_0+edge*s2; i < 4; i++, loc += s1 )
|
||||||
|
{
|
||||||
|
int locn = loc - s2;
|
||||||
|
if( nnz[loc] || nnz[locn] )
|
||||||
|
bs[dir][edge][i] = 2;
|
||||||
|
else if( ref[0][loc] != ref[0][locn] ||
|
||||||
|
abs( mv[0][loc][0] - mv[0][locn][0] ) >= 4 ||
|
||||||
|
abs( mv[0][loc][1] - mv[0][locn][1] ) >= mvy_limit ||
|
||||||
|
(bframe && (ref[1][loc] != ref[1][locn] ||
|
||||||
|
abs( mv[1][loc][0] - mv[1][locn][0] ) >= 4 ||
|
||||||
|
abs( mv[1][loc][1] - mv[1][locn][1] ) >= mvy_limit )))
|
||||||
|
{
|
||||||
|
bs[dir][edge][i] = 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
bs[dir][edge][i] = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Derive the filter parameters for one non-intra edge and call pf_inter.
 *
 * alpha and the tc values are looked up at i_qp + a, beta at i_qp + b;
 * all of them are scaled for the bit depth (BIT_DEPTH-8).  b_chroma is
 * added to every tc entry.  The filter is not called when M32(bS) is zero
 * (presumably: all four bS bytes are zero) or when alpha or beta is zero.
 * `h` is not used by this function. */
static ALWAYS_INLINE void deblock_edge( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp,
                                        int a, int b, int b_chroma, x264_deblock_inter_t pf_inter )
{
    const int index_a = i_qp + a;
    const int index_b = i_qp + b;
    const int alpha = alpha_table(index_a) << (BIT_DEPTH-8);
    const int beta  = beta_table(index_b) << (BIT_DEPTH-8);

    if( M32(bS) && alpha && beta )
    {
        int8_t tc[4];
        for( int i = 0; i < 4; i++ )
            tc[i] = (tc0_table(index_a)[bS[i]] * (1 << (BIT_DEPTH-8))) + b_chroma;

        pf_inter( pix, i_stride, alpha, beta, tc );
    }
}
|
||||||
|
|
||||||
|
/* Derive alpha/beta for one intra edge and call pf_intra.
 *
 * alpha is looked up at i_qp + a and beta at i_qp + b, both scaled for
 * the bit depth (BIT_DEPTH-8).  The filter is not called when either is
 * zero.  `h`, `bS` and `b_chroma` are not used by this function. */
static ALWAYS_INLINE void deblock_edge_intra( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp,
                                              int a, int b, int b_chroma, x264_deblock_intra_t pf_intra )
{
    const int index_a = i_qp + a;
    const int index_b = i_qp + b;
    const int alpha = alpha_table(index_a) << (BIT_DEPTH-8);
    const int beta  = beta_table(index_b) << (BIT_DEPTH-8);

    if( alpha && beta )
        pf_intra( pix, i_stride, alpha, beta );
}
|
||||||
|
|
||||||
|
/* Set up the neighbour bookkeeping in h->mb for deblocking macroblock
 * (mb_x, mb_y).
 *
 * Writes: i_neighbour, i_mb_xy, b_interlaced, i_mb_top_y, i_mb_top_xy and
 * i_mb_left_xy[0..1].  MB_LEFT / MB_TOP are set in i_neighbour only when
 * that neighbour lies inside the frame and, if
 * i_disable_deblocking_filter_idc == 2, belongs to the same slice.
 *
 * The assignments are order-sensitive: b_interlaced is stored before
 * MB_INTERLACED is evaluated, in case that macro reads it. */
static ALWAYS_INLINE void macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
{
    /* idc 2: edges shared with a different slice are not filtered */
    const int cross_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;

    h->mb.i_neighbour = 0;
    h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
    h->mb.b_interlaced = PARAM_INTERLACED && h->mb.field[h->mb.i_mb_xy];
    h->mb.i_mb_top_y = mb_y - (1 << MB_INTERLACED);
    h->mb.i_mb_top_xy = mb_x + h->mb.i_mb_stride*h->mb.i_mb_top_y;
    h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1;
    h->mb.i_mb_left_xy[1] = h->mb.i_mb_left_xy[0];

    if( SLICE_MBAFF )
    {
        const int odd_row = mb_y & 1;

        /* even row only: an interlaced MB whose top neighbour is not a
         * field MB uses the MB one row further down as its top neighbour */
        if( !odd_row && h->mb.i_mb_top_xy >= 0 && MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy] )
        {
            h->mb.i_mb_top_xy += h->mb.i_mb_stride;
            h->mb.i_mb_top_y++;
        }

        /* left neighbour with a different field setting: one of the two
         * left indices is moved by a row, depending on the row parity */
        if( mb_x && h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED )
        {
            if( odd_row )
                h->mb.i_mb_left_xy[0] -= h->mb.i_mb_stride;
            else
                h->mb.i_mb_left_xy[1] += h->mb.i_mb_stride;
        }
    }

    if( mb_x > 0 )
    {
        if( cross_slice_edges ||
            h->mb.slice_table[h->mb.i_mb_left_xy[0]] == h->mb.slice_table[h->mb.i_mb_xy] )
            h->mb.i_neighbour |= MB_LEFT;
    }
    if( mb_y > MB_INTERLACED )
    {
        if( cross_slice_edges ||
            h->mb.slice_table[h->mb.i_mb_top_xy] == h->mb.slice_table[h->mb.i_mb_xy] )
            h->mb.i_neighbour |= MB_TOP;
    }
}
|
||||||
|
|
||||||
|
void x264_frame_deblock_row( x264_t *h, int mb_y )
|
||||||
|
{
|
||||||
|
int b_interlaced = SLICE_MBAFF;
|
||||||
|
int a = h->sh.i_alpha_c0_offset - QP_BD_OFFSET;
|
||||||
|
int b = h->sh.i_beta_offset - QP_BD_OFFSET;
|
||||||
|
int qp_thresh = 15 - X264_MIN( a, b ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset );
|
||||||
|
int stridey = h->fdec->i_stride[0];
|
||||||
|
int strideuv = h->fdec->i_stride[1];
|
||||||
|
int chroma_format = CHROMA_FORMAT;
|
||||||
|
int chroma444 = CHROMA444;
|
||||||
|
int chroma_height = 16 >> CHROMA_V_SHIFT;
|
||||||
|
intptr_t uvdiff = chroma444 ? h->fdec->plane[2] - h->fdec->plane[1] : 1;
|
||||||
|
|
||||||
|
for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
|
||||||
|
{
|
||||||
|
x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
|
||||||
|
macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y );
|
||||||
|
|
||||||
|
int mb_xy = h->mb.i_mb_xy;
|
||||||
|
int transform_8x8 = h->mb.mb_transform_size[mb_xy];
|
||||||
|
int intra_cur = IS_INTRA( h->mb.type[mb_xy] );
|
||||||
|
uint8_t (*bs)[8][4] = h->deblock_strength[mb_y&1][h->param.b_sliced_threads?mb_xy:mb_x];
|
||||||
|
|
||||||
|
pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
|
||||||
|
pixel *pixuv = CHROMA_FORMAT ? h->fdec->plane[1] + chroma_height*mb_y*strideuv + 16*mb_x : NULL;
|
||||||
|
|
||||||
|
if( mb_y & MB_INTERLACED )
|
||||||
|
{
|
||||||
|
pixy -= 15*stridey;
|
||||||
|
if( CHROMA_FORMAT )
|
||||||
|
pixuv -= (chroma_height-1)*strideuv;
|
||||||
|
}
|
||||||
|
|
||||||
|
int stride2y = stridey << MB_INTERLACED;
|
||||||
|
int stride2uv = strideuv << MB_INTERLACED;
|
||||||
|
int qp = h->mb.qp[mb_xy];
|
||||||
|
int qpc = h->chroma_qp_table[qp];
|
||||||
|
int first_edge_only = (h->mb.partition[mb_xy] == D_16x16 && !h->mb.cbp[mb_xy] && !intra_cur) || qp <= qp_thresh;
|
||||||
|
|
||||||
|
#define FILTER( intra, dir, edge, qp, chroma_qp )\
|
||||||
|
do\
|
||||||
|
{\
|
||||||
|
if( !(edge & 1) || !transform_8x8 )\
|
||||||
|
{\
|
||||||
|
deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\
|
||||||
|
stride2y, bs[dir][edge], qp, a, b, 0,\
|
||||||
|
h->loopf.deblock_luma##intra[dir] );\
|
||||||
|
if( chroma_format == CHROMA_444 )\
|
||||||
|
{\
|
||||||
|
deblock_edge##intra( h, pixuv + 4*edge*(dir?stride2uv:1),\
|
||||||
|
stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
|
||||||
|
h->loopf.deblock_luma##intra[dir] );\
|
||||||
|
deblock_edge##intra( h, pixuv + uvdiff + 4*edge*(dir?stride2uv:1),\
|
||||||
|
stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
|
||||||
|
h->loopf.deblock_luma##intra[dir] );\
|
||||||
|
}\
|
||||||
|
else if( chroma_format == CHROMA_420 && !(edge & 1) )\
|
||||||
|
{\
|
||||||
|
deblock_edge##intra( h, pixuv + edge*(dir?2*stride2uv:4),\
|
||||||
|
stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\
|
||||||
|
h->loopf.deblock_chroma##intra[dir] );\
|
||||||
|
}\
|
||||||
|
}\
|
||||||
|
if( chroma_format == CHROMA_422 && (dir || !(edge & 1)) )\
|
||||||
|
{\
|
||||||
|
deblock_edge##intra( h, pixuv + edge*(dir?4*stride2uv:4),\
|
||||||
|
stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\
|
||||||
|
h->loopf.deblock_chroma##intra[dir] );\
|
||||||
|
}\
|
||||||
|
} while( 0 )
|
||||||
|
|
||||||
|
if( h->mb.i_neighbour & MB_LEFT )
|
||||||
|
{
|
||||||
|
if( b_interlaced && h->mb.field[h->mb.i_mb_left_xy[0]] != MB_INTERLACED )
|
||||||
|
{
|
||||||
|
int luma_qp[2];
|
||||||
|
int chroma_qp[2];
|
||||||
|
int left_qp[2];
|
||||||
|
x264_deblock_inter_t luma_deblock = h->loopf.deblock_luma_mbaff;
|
||||||
|
x264_deblock_inter_t chroma_deblock = h->loopf.deblock_chroma_mbaff;
|
||||||
|
x264_deblock_intra_t luma_intra_deblock = h->loopf.deblock_luma_intra_mbaff;
|
||||||
|
x264_deblock_intra_t chroma_intra_deblock = h->loopf.deblock_chroma_intra_mbaff;
|
||||||
|
int c = chroma444 ? 0 : 1;
|
||||||
|
|
||||||
|
left_qp[0] = h->mb.qp[h->mb.i_mb_left_xy[0]];
|
||||||
|
luma_qp[0] = (qp + left_qp[0] + 1) >> 1;
|
||||||
|
chroma_qp[0] = (qpc + h->chroma_qp_table[left_qp[0]] + 1) >> 1;
|
||||||
|
if( intra_cur || IS_INTRA( h->mb.type[h->mb.i_mb_left_xy[0]] ) )
|
||||||
|
{
|
||||||
|
deblock_edge_intra( h, pixy, 2*stridey, bs[0][0], luma_qp[0], a, b, 0, luma_intra_deblock );
|
||||||
|
if( chroma_format )
|
||||||
|
{
|
||||||
|
deblock_edge_intra( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock );
|
||||||
|
if( chroma444 )
|
||||||
|
deblock_edge_intra( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
deblock_edge( h, pixy, 2*stridey, bs[0][0], luma_qp[0], a, b, 0, luma_deblock );
|
||||||
|
if( chroma_format )
|
||||||
|
{
|
||||||
|
deblock_edge( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock );
|
||||||
|
if( chroma444 )
|
||||||
|
deblock_edge( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int offy = MB_INTERLACED ? 4 : 0;
|
||||||
|
int offuv = MB_INTERLACED ? 4-CHROMA_V_SHIFT : 0;
|
||||||
|
left_qp[1] = h->mb.qp[h->mb.i_mb_left_xy[1]];
|
||||||
|
luma_qp[1] = (qp + left_qp[1] + 1) >> 1;
|
||||||
|
chroma_qp[1] = (qpc + h->chroma_qp_table[left_qp[1]] + 1) >> 1;
|
||||||
|
if( intra_cur || IS_INTRA( h->mb.type[h->mb.i_mb_left_xy[1]] ) )
|
||||||
|
{
|
||||||
|
deblock_edge_intra( h, pixy + (stridey<<offy), 2*stridey, bs[0][4], luma_qp[1], a, b, 0, luma_intra_deblock );
|
||||||
|
if( chroma_format )
|
||||||
|
{
|
||||||
|
deblock_edge_intra( h, pixuv + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_intra_deblock );
|
||||||
|
if( chroma444 )
|
||||||
|
deblock_edge_intra( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_intra_deblock );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
deblock_edge( h, pixy + (stridey<<offy), 2*stridey, bs[0][4], luma_qp[1], a, b, 0, luma_deblock );
|
||||||
|
if( chroma_format )
|
||||||
|
{
|
||||||
|
deblock_edge( h, pixuv + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_deblock );
|
||||||
|
if( chroma444 )
|
||||||
|
deblock_edge( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_deblock );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
int qpl = h->mb.qp[h->mb.i_mb_xy-1];
|
||||||
|
int qp_left = (qp + qpl + 1) >> 1;
|
||||||
|
int qpc_left = (qpc + h->chroma_qp_table[qpl] + 1) >> 1;
|
||||||
|
int intra_left = IS_INTRA( h->mb.type[h->mb.i_mb_xy-1] );
|
||||||
|
int intra_deblock = intra_cur || intra_left;
|
||||||
|
|
||||||
|
/* Any MB that was coded, or that analysis decided to skip, has quality commensurate with its QP.
|
||||||
|
* But if deblocking affects neighboring MBs that were force-skipped, blur might accumulate there.
|
||||||
|
* So reset their effective QP to max, to indicate that lack of guarantee. */
|
||||||
|
if( h->fdec->mb_info && M32( bs[0][0] ) )
|
||||||
|
{
|
||||||
|
#define RESET_EFFECTIVE_QP(xy) h->fdec->effective_qp[xy] |= 0xff * !!(h->fdec->mb_info[xy] & X264_MBINFO_CONSTANT);
|
||||||
|
RESET_EFFECTIVE_QP(mb_xy);
|
||||||
|
RESET_EFFECTIVE_QP(h->mb.i_mb_left_xy[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if( intra_deblock )
|
||||||
|
FILTER( _intra, 0, 0, qp_left, qpc_left );
|
||||||
|
else
|
||||||
|
FILTER( , 0, 0, qp_left, qpc_left );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if( !first_edge_only )
|
||||||
|
{
|
||||||
|
FILTER( , 0, 1, qp, qpc );
|
||||||
|
FILTER( , 0, 2, qp, qpc );
|
||||||
|
FILTER( , 0, 3, qp, qpc );
|
||||||
|
}
|
||||||
|
|
||||||
|
if( h->mb.i_neighbour & MB_TOP )
|
||||||
|
{
|
||||||
|
if( b_interlaced && !(mb_y&1) && !MB_INTERLACED && h->mb.field[h->mb.i_mb_top_xy] )
|
||||||
|
{
|
||||||
|
int mbn_xy = mb_xy - 2 * h->mb.i_mb_stride;
|
||||||
|
|
||||||
|
for( int j = 0; j < 2; j++, mbn_xy += h->mb.i_mb_stride )
|
||||||
|
{
|
||||||
|
int qpt = h->mb.qp[mbn_xy];
|
||||||
|
int qp_top = (qp + qpt + 1) >> 1;
|
||||||
|
int qpc_top = (qpc + h->chroma_qp_table[qpt] + 1) >> 1;
|
||||||
|
int intra_top = IS_INTRA( h->mb.type[mbn_xy] );
|
||||||
|
if( intra_cur || intra_top )
|
||||||
|
M32( bs[1][4*j] ) = 0x03030303;
|
||||||
|
|
||||||
|
// deblock the first horizontal edge of the even rows, then the first horizontal edge of the odd rows
|
||||||
|
deblock_edge( h, pixy + j*stridey, 2* stridey, bs[1][4*j], qp_top, a, b, 0, h->loopf.deblock_luma[1] );
|
||||||
|
if( chroma444 )
|
||||||
|
{
|
||||||
|
deblock_edge( h, pixuv + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 0, h->loopf.deblock_luma[1] );
|
||||||
|
deblock_edge( h, pixuv + uvdiff + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 0, h->loopf.deblock_luma[1] );
|
||||||
|
}
|
||||||
|
else if( chroma_format )
|
||||||
|
deblock_edge( h, pixuv + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 1, h->loopf.deblock_chroma[1] );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
int qpt = h->mb.qp[h->mb.i_mb_top_xy];
|
||||||
|
int qp_top = (qp + qpt + 1) >> 1;
|
||||||
|
int qpc_top = (qpc + h->chroma_qp_table[qpt] + 1) >> 1;
|
||||||
|
int intra_top = IS_INTRA( h->mb.type[h->mb.i_mb_top_xy] );
|
||||||
|
int intra_deblock = intra_cur || intra_top;
|
||||||
|
|
||||||
|
/* This edge has been modified, reset effective qp to max. */
|
||||||
|
if( h->fdec->mb_info && M32( bs[1][0] ) )
|
||||||
|
{
|
||||||
|
RESET_EFFECTIVE_QP(mb_xy);
|
||||||
|
RESET_EFFECTIVE_QP(h->mb.i_mb_top_xy);
|
||||||
|
}
|
||||||
|
|
||||||
|
if( (!b_interlaced || (!MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy])) && intra_deblock )
|
||||||
|
{
|
||||||
|
FILTER( _intra, 1, 0, qp_top, qpc_top );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if( intra_deblock )
|
||||||
|
M32( bs[1][0] ) = 0x03030303;
|
||||||
|
FILTER( , 1, 0, qp_top, qpc_top );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if( !first_edge_only )
|
||||||
|
{
|
||||||
|
FILTER( , 1, 1, qp, qpc );
|
||||||
|
FILTER( , 1, 2, qp, qpc );
|
||||||
|
FILTER( , 1, 3, qp, qpc );
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef FILTER
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* For deblock-aware RD.
|
||||||
|
* TODO:
|
||||||
|
* deblock macroblock edges
|
||||||
|
* support analysis partitions smaller than 16x16
|
||||||
|
* deblock chroma for 4:2:0/4:2:2
|
||||||
|
* handle duplicate refs correctly
|
||||||
|
*/
|
||||||
|
void x264_macroblock_deblock( x264_t *h )
|
||||||
|
{
|
||||||
|
int a = h->sh.i_alpha_c0_offset - QP_BD_OFFSET;
|
||||||
|
int b = h->sh.i_beta_offset - QP_BD_OFFSET;
|
||||||
|
int qp_thresh = 15 - X264_MIN( a, b ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset );
|
||||||
|
int intra_cur = IS_INTRA( h->mb.i_type );
|
||||||
|
int qp = h->mb.i_qp;
|
||||||
|
int qpc = h->mb.i_chroma_qp;
|
||||||
|
if( (h->mb.i_partition == D_16x16 && !h->mb.i_cbp_luma && !intra_cur) || qp <= qp_thresh )
|
||||||
|
return;
|
||||||
|
|
||||||
|
uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength;
|
||||||
|
if( intra_cur )
|
||||||
|
{
|
||||||
|
M32( bs[0][1] ) = 0x03030303;
|
||||||
|
M64( bs[0][2] ) = 0x0303030303030303ULL;
|
||||||
|
M32( bs[1][1] ) = 0x03030303;
|
||||||
|
M64( bs[1][2] ) = 0x0303030303030303ULL;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv,
|
||||||
|
bs, 4 >> MB_INTERLACED, h->sh.i_type == SLICE_TYPE_B );
|
||||||
|
|
||||||
|
int transform_8x8 = h->mb.b_transform_8x8;
|
||||||
|
|
||||||
|
#define FILTER( dir, edge )\
|
||||||
|
do\
|
||||||
|
{\
|
||||||
|
deblock_edge( h, h->mb.pic.p_fdec[0] + 4*edge*(dir?FDEC_STRIDE:1),\
|
||||||
|
FDEC_STRIDE, bs[dir][edge], qp, a, b, 0,\
|
||||||
|
h->loopf.deblock_luma[dir] );\
|
||||||
|
if( CHROMA444 )\
|
||||||
|
{\
|
||||||
|
deblock_edge( h, h->mb.pic.p_fdec[1] + 4*edge*(dir?FDEC_STRIDE:1),\
|
||||||
|
FDEC_STRIDE, bs[dir][edge], qpc, a, b, 0,\
|
||||||
|
h->loopf.deblock_luma[dir] );\
|
||||||
|
deblock_edge( h, h->mb.pic.p_fdec[2] + 4*edge*(dir?FDEC_STRIDE:1),\
|
||||||
|
FDEC_STRIDE, bs[dir][edge], qpc, a, b, 0,\
|
||||||
|
h->loopf.deblock_luma[dir] );\
|
||||||
|
}\
|
||||||
|
} while( 0 )
|
||||||
|
|
||||||
|
if( !transform_8x8 ) FILTER( 0, 1 );
|
||||||
|
FILTER( 0, 2 );
|
||||||
|
if( !transform_8x8 ) FILTER( 0, 3 );
|
||||||
|
|
||||||
|
if( !transform_8x8 ) FILTER( 1, 1 );
|
||||||
|
FILTER( 1, 2 );
|
||||||
|
if( !transform_8x8 ) FILTER( 1, 3 );
|
||||||
|
|
||||||
|
#undef FILTER
|
||||||
|
}
|
||||||
|
|
||||||
|
#if HAVE_MMX
|
||||||
|
#include "x86/deblock.h"
|
||||||
|
#endif
|
||||||
|
#if HAVE_ALTIVEC
|
||||||
|
#include "ppc/deblock.h"
|
||||||
|
#endif
|
||||||
|
#if HAVE_ARMV6
|
||||||
|
#include "arm/deblock.h"
|
||||||
|
#endif
|
||||||
|
#if HAVE_AARCH64
|
||||||
|
#include "aarch64/deblock.h"
|
||||||
|
#endif
|
||||||
|
#if HAVE_MSA
|
||||||
|
#include "mips/deblock.h"
|
||||||
|
#endif
|
||||||
|
#if HAVE_LSX
|
||||||
|
#include "loongarch/deblock.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Populate the deblocking function table *pf.
 *
 * Every entry is first set to the C implementation, then overridden by the
 * architecture-specific versions selected by the capability bits in `cpu`.
 * Assignments are order-dependent: a later matching capability block replaces
 * what an earlier one installed.
 * `b_mbaff` is not referenced by this function. */
void x264_deblock_init( uint32_t cpu, x264_deblock_function_t *pf, int b_mbaff )
{
    /* C defaults */
    pf->deblock_luma[1] = deblock_v_luma_c;
    pf->deblock_luma[0] = deblock_h_luma_c;
    pf->deblock_chroma[1] = deblock_v_chroma_c;
    pf->deblock_h_chroma_420 = deblock_h_chroma_c;
    pf->deblock_h_chroma_422 = deblock_h_chroma_422_c;
    pf->deblock_luma_intra[1] = deblock_v_luma_intra_c;
    pf->deblock_luma_intra[0] = deblock_h_luma_intra_c;
    pf->deblock_chroma_intra[1] = deblock_v_chroma_intra_c;
    pf->deblock_h_chroma_420_intra = deblock_h_chroma_intra_c;
    pf->deblock_h_chroma_422_intra = deblock_h_chroma_422_intra_c;
    pf->deblock_luma_mbaff = deblock_h_luma_mbaff_c;
    pf->deblock_chroma_420_mbaff = deblock_h_chroma_mbaff_c;
    pf->deblock_luma_intra_mbaff = deblock_h_luma_intra_mbaff_c;
    pf->deblock_chroma_420_intra_mbaff = deblock_h_chroma_intra_mbaff_c;
    pf->deblock_strength = deblock_strength_c;

#if HAVE_MMX
    /* All x86 SIMD levels are nested under the MMX2 capability check. */
    if( cpu & X264_CPU_MMX2 )
    {
#if ARCH_X86
        pf->deblock_luma[1] = x264_deblock_v_luma_mmx2;
        pf->deblock_luma[0] = x264_deblock_h_luma_mmx2;
        pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2;
        pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2;
        pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_mmx2;
        pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_mmx2;
        pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_mmx2;
        pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2;
        pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2;
        pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2;
        pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_mmx2;
        pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2;
#endif
#if !HIGH_BIT_DEPTH
        pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2;
#endif

        if( cpu & X264_CPU_SSE2 )
        {
            pf->deblock_strength = x264_deblock_strength_sse2;
            pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2;
            pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2;
            pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_sse2;
            pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_sse2;
            pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
            pf->deblock_luma[0] = x264_deblock_h_luma_sse2;
            pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
            pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
            /* These versions are installed only when X264_CPU_STACK_MOD4 is not set. */
            if( !(cpu & X264_CPU_STACK_MOD4) )
            {
                pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2;
                pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2;
                pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_sse2;
#if HIGH_BIT_DEPTH
                pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_sse2;
#endif
            }
        }

        if( cpu & X264_CPU_SSSE3 )
        {
            pf->deblock_strength = x264_deblock_strength_ssse3;
        }

        if( cpu & X264_CPU_AVX )
        {
            pf->deblock_strength = x264_deblock_strength_avx;
            pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
            pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_avx;
            pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_avx;
            pf->deblock_luma[1] = x264_deblock_v_luma_avx;
            pf->deblock_luma[0] = x264_deblock_h_luma_avx;
            pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx;
            pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
            /* Same X264_CPU_STACK_MOD4 restriction as in the SSE2 block. */
            if( !(cpu & X264_CPU_STACK_MOD4) )
            {
                pf->deblock_chroma[1] = x264_deblock_v_chroma_avx;
                pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx;
                pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_avx;
#if HIGH_BIT_DEPTH
                pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_avx;
                pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_avx;
#endif
            }
        }

        /* AVX2 and AVX-512 only replace the strength calculation. */
        if( cpu & X264_CPU_AVX2 )
            pf->deblock_strength = x264_deblock_strength_avx2;

        if( cpu & X264_CPU_AVX512 )
            pf->deblock_strength = x264_deblock_strength_avx512;
    }
#endif

    /* The remaining architecture overrides are compiled only for non-high-bit-depth builds. */
#if !HIGH_BIT_DEPTH
#if HAVE_ALTIVEC
    if( cpu & X264_CPU_ALTIVEC )
    {
        pf->deblock_luma[1] = x264_deblock_v_luma_altivec;
        pf->deblock_luma[0] = x264_deblock_h_luma_altivec;
    }
#endif // HAVE_ALTIVEC

#if HAVE_ARMV6 || HAVE_AARCH64
    if( cpu & X264_CPU_NEON )
    {
        pf->deblock_luma[1] = x264_deblock_v_luma_neon;
        pf->deblock_luma[0] = x264_deblock_h_luma_neon;
        pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
        pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
        pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon;
        pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon;
        pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
        pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
        pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
        pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon;
        pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_neon;
        pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_neon;
        pf->deblock_strength = x264_deblock_strength_neon;
    }
#if HAVE_SVE
    /* Checked after NEON, so the SVE version wins for this one entry. */
    if( cpu & X264_CPU_SVE )
        pf->deblock_chroma[1] = x264_deblock_v_chroma_sve;
#endif
#endif

#if HAVE_MSA
    if( cpu & X264_CPU_MSA )
    {
        pf->deblock_luma[1] = x264_deblock_v_luma_msa;
        pf->deblock_luma[0] = x264_deblock_h_luma_msa;
        pf->deblock_chroma[1] = x264_deblock_v_chroma_msa;
        pf->deblock_h_chroma_420 = x264_deblock_h_chroma_msa;
        pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_msa;
        pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_msa;
        pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_msa;
        pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_msa;
        pf->deblock_strength = x264_deblock_strength_msa;
    }
#endif

#if HAVE_LSX
    if( cpu & X264_CPU_LSX )
    {
        pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_lsx;
        pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_lsx;
        pf->deblock_strength = x264_deblock_strength_lsx;
    }
    /* LASX is checked after LSX and overrides the entries it shares with it. */
    if( cpu & X264_CPU_LASX )
    {
        pf->deblock_luma[1] = x264_deblock_v_luma_lasx;
        pf->deblock_luma[0] = x264_deblock_h_luma_lasx;
        pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_lasx;
        pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_lasx;
        pf->deblock_strength = x264_deblock_strength_lasx;
    }
#endif

#endif // !HIGH_BIT_DEPTH

    /* These functions are equivalent, so don't duplicate them.
     * Must stay last: they copy whatever 4:2:0 versions were finally selected above. */
    pf->deblock_chroma_422_mbaff = pf->deblock_h_chroma_420;
    pf->deblock_chroma_422_intra_mbaff = pf->deblock_h_chroma_420_intra;
}
|
||||||
898
common/frame.c
Normal file
898
common/frame.c
Normal file
@@ -0,0 +1,898 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* frame.c: frame handling
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2003-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
|
||||||
|
* Loren Merritt <lorenm@u.washington.edu>
|
||||||
|
* Fiona Glaser <fiona@x264.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/* Round x up with ALIGN( x, align ); if the result is a multiple of `disalign`
 * (tested with a bit mask, so `disalign` is presumably a power of two), add one
 * more `align` so the returned stride is not a multiple of `disalign`. */
static int align_stride( int x, int align, int disalign )
{
    int stride = ALIGN( x, align );
    return (stride & (disalign-1)) ? stride : stride + align;
}
|
||||||
|
|
||||||
|
static int align_plane_size( int x, int disalign )
|
||||||
|
{
|
||||||
|
if( !(x&(disalign-1)) )
|
||||||
|
x += X264_MAX( 128, NATIVE_ALIGN ) / SIZEOF_PIXEL;
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int frame_internal_csp( int external_csp )
|
||||||
|
{
|
||||||
|
int csp = external_csp & X264_CSP_MASK;
|
||||||
|
if( csp == X264_CSP_I400 )
|
||||||
|
return X264_CSP_I400;
|
||||||
|
if( csp >= X264_CSP_I420 && csp < X264_CSP_I422 )
|
||||||
|
return X264_CSP_NV12;
|
||||||
|
if( csp >= X264_CSP_I422 && csp < X264_CSP_I444 )
|
||||||
|
return X264_CSP_NV16;
|
||||||
|
if( csp >= X264_CSP_I444 && csp <= X264_CSP_RGB )
|
||||||
|
return X264_CSP_I444;
|
||||||
|
return X264_CSP_NONE;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Allocate and initialise a new frame.
 *
 *   h       encoder context; supplies dimensions, colorspace and feature flags.
 *   b_fdec  non-zero for an fdec frame, zero for an fenc frame (see the two
 *           branches below); decides which per-frame arrays are set up.
 *
 * Returns the new frame, or NULL on failure (unsupported colorspace, allocation
 * failure, or mutex/condition-variable initialisation failure).
 *
 * All buffers are requested via PREALLOC and backed by frame->base, which
 * PREALLOC_END sets up (presumably as one allocation - x264_frame_delete()
 * releases only frame->base). On failure everything acquired so far is released. */
static x264_frame_t *frame_new( x264_t *h, int b_fdec )
{
    x264_frame_t *frame;
    int i_csp = frame_internal_csp( h->param.i_csp );
    int i_mb_count = h->mb.i_mb_count;
    int i_stride, i_width, i_lines, luma_plane_count;
    int i_padv = PADV << PARAM_INTERLACED;
    int align = NATIVE_ALIGN / SIZEOF_PIXEL;
#if ARCH_X86 || ARCH_X86_64
    /* On x86 the stride alignment follows the runtime CPU flags instead of NATIVE_ALIGN. */
    if( h->param.cpu&X264_CPU_CACHELINE_64 || h->param.cpu&X264_CPU_AVX512 )
        align = 64 / SIZEOF_PIXEL;
    else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX )
        align = 32 / SIZEOF_PIXEL;
    else
        align = 16 / SIZEOF_PIXEL;
#endif
#if ARCH_PPC
    int disalign = (1<<9) / SIZEOF_PIXEL;
#else
    int disalign = (1<<10) / SIZEOF_PIXEL;
#endif

    /* First possible jump to fail: `frame` is assigned here (NULL on failure). */
    CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
    PREALLOC_INIT

    /* allocate frame data (+64 for extra data for me) */
    i_width  = h->mb.i_mb_width*16;
    i_lines  = h->mb.i_mb_height*16;
    i_stride = align_stride( i_width + PADH2, align, disalign );

    if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
    {
        luma_plane_count = 1;
        frame->i_plane = 2;
        for( int i = 0; i < 2; i++ )
        {
            /* Plane 1 is half width; its height is halved only for NV12. */
            frame->i_width[i] = i_width >> i;
            frame->i_lines[i] = i_lines >> (i && i_csp == X264_CSP_NV12);
            frame->i_stride[i] = i_stride;
        }
    }
    else if( i_csp == X264_CSP_I444 )
    {
        luma_plane_count = 3;
        frame->i_plane = 3;
        for( int i = 0; i < 3; i++ )
        {
            frame->i_width[i] = i_width;
            frame->i_lines[i] = i_lines;
            frame->i_stride[i] = i_stride;
        }
    }
    else if( i_csp == X264_CSP_I400 )
    {
        luma_plane_count = 1;
        frame->i_plane = 1;
        frame->i_width[0] = i_width;
        frame->i_lines[0] = i_lines;
        frame->i_stride[0] = i_stride;
    }
    else
        goto fail;

    frame->i_csp = i_csp;
    frame->i_width_lowres = frame->i_width[0]/2;
    frame->i_lines_lowres = frame->i_lines[0]/2;
    frame->i_stride_lowres = align_stride( frame->i_width_lowres + PADH2, align, disalign<<1 );

    for( int i = 0; i < h->param.i_bframe + 2; i++ )
        for( int j = 0; j < h->param.i_bframe + 2; j++ )
            PREALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );

    frame->i_poc = -1;
    frame->i_type = X264_TYPE_AUTO;
    frame->i_qpplus1 = X264_QP_AUTO;
    frame->i_pts = -1;
    frame->i_frame = -1;
    frame->i_frame_num = -1;
    frame->i_lines_completed = -1;
    frame->b_fdec = b_fdec;
    frame->i_pic_struct = PIC_STRUCT_AUTO;
    frame->i_field_cnt = -1;
    frame->i_duration =
    frame->i_cpb_duration =
    frame->i_dpb_output_delay =
    frame->i_cpb_delay = 0;
    frame->i_coded_fields_lookahead =
    frame->i_cpb_delay_lookahead = -1;

    frame->orig = frame;

    if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
    {
        int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
        int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*chroma_padv));
        PREALLOC( frame->buffer[1], chroma_plane_size * SIZEOF_PIXEL );
        if( PARAM_INTERLACED )
            PREALLOC( frame->buffer_fld[1], chroma_plane_size * SIZEOF_PIXEL );
    }

    /* all 4 luma planes allocated together, since the cacheline split code
     * requires them to be in-phase wrt cacheline alignment. */

    for( int p = 0; p < luma_plane_count; p++ )
    {
        int64_t luma_plane_size = align_plane_size( frame->i_stride[p] * (frame->i_lines[p] + 2*i_padv), disalign );
        if( h->param.analyse.i_subpel_refine && b_fdec )
            luma_plane_size *= 4;

        /* FIXME: Don't allocate both buffers in non-adaptive MBAFF. */
        PREALLOC( frame->buffer[p], luma_plane_size * SIZEOF_PIXEL );
        if( PARAM_INTERLACED )
            PREALLOC( frame->buffer_fld[p], luma_plane_size * SIZEOF_PIXEL );
    }

    frame->b_duplicate = 0;

    if( b_fdec ) /* fdec frame */
    {
        PREALLOC( frame->mb_type, i_mb_count * sizeof(int8_t) );
        PREALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t) );
        PREALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
        /* One extra entry: the pointer is advanced past a zeroed leading element after allocation. */
        PREALLOC( frame->mv16x16, 2*(i_mb_count+1) * sizeof(int16_t) );
        PREALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
        if( h->param.i_bframe )
        {
            PREALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
            PREALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
        }
        else
        {
            frame->mv[1]  = NULL;
            frame->ref[1] = NULL;
        }
        PREALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
        PREALLOC( frame->f_row_qp, i_lines/16 * sizeof(float) );
        PREALLOC( frame->f_row_qscale, i_lines/16 * sizeof(float) );
        if( h->param.analyse.i_me_method >= X264_ME_ESA )
            PREALLOC( frame->buffer[3], frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
        if( PARAM_INTERLACED )
            PREALLOC( frame->field, i_mb_count * sizeof(uint8_t) );
        if( h->param.analyse.b_mb_info )
            PREALLOC( frame->effective_qp, i_mb_count * sizeof(uint8_t) );
    }
    else /* fenc frame */
    {
        if( h->frames.b_have_lowres )
        {
            int64_t luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );

            PREALLOC( frame->buffer_lowres, 4 * luma_plane_size * SIZEOF_PIXEL );

            for( int j = 0; j <= !!h->param.i_bframe; j++ )
                for( int i = 0; i <= h->param.i_bframe; i++ )
                {
                    PREALLOC( frame->lowres_mvs[j][i], 2*i_mb_count*sizeof(int16_t) );
                    PREALLOC( frame->lowres_mv_costs[j][i], i_mb_count*sizeof(int) );
                }
            PREALLOC( frame->i_propagate_cost, i_mb_count * sizeof(uint16_t) );
            for( int j = 0; j <= h->param.i_bframe+1; j++ )
                for( int i = 0; i <= h->param.i_bframe+1; i++ )
                    PREALLOC( frame->lowres_costs[j][i], i_mb_count * sizeof(uint16_t) );
        }
        if( h->param.rc.i_aq_mode )
        {
            PREALLOC( frame->f_qp_offset, i_mb_count * sizeof(float) );
            PREALLOC( frame->f_qp_offset_aq, i_mb_count * sizeof(float) );
            if( h->frames.b_have_lowres )
                PREALLOC( frame->i_inv_qscale_factor, i_mb_count * sizeof(uint16_t) );
        }

        /* mbtree asm can overread the input buffers, make sure we don't read outside of allocated memory. */
        if( h->frames.b_have_lowres )
            prealloc_size += NATIVE_ALIGN;
    }

    /* From here on frame->base is owned by the frame and must be released on failure. */
    PREALLOC_END( frame->base );

    if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
    {
        int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
        frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH_ALIGN;
        if( PARAM_INTERLACED )
            frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH_ALIGN;
    }

    for( int p = 0; p < luma_plane_count; p++ )
    {
        /* Must match the per-plane size computed before PREALLOC (before the *4). */
        int64_t luma_plane_size = align_plane_size( frame->i_stride[p] * (frame->i_lines[p] + 2*i_padv), disalign );
        if( h->param.analyse.i_subpel_refine && b_fdec )
        {
            /* Four consecutive planes share one buffer; plane 0 doubles as the frame's plane pointer. */
            for( int i = 0; i < 4; i++ )
            {
                frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH_ALIGN;
                if( PARAM_INTERLACED )
                    frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH_ALIGN;
            }
            frame->plane[p] = frame->filtered[p][0];
            frame->plane_fld[p] = frame->filtered_fld[p][0];
        }
        else
        {
            frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH_ALIGN;
            if( PARAM_INTERLACED )
                frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH_ALIGN;
        }
    }

    if( b_fdec )
    {
        /* Zero the extra leading entry, then skip past it. */
        M32( frame->mv16x16[0] ) = 0;
        frame->mv16x16++;

        if( h->param.analyse.i_me_method >= X264_ME_ESA )
            frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH_ALIGN;
    }
    else
    {
        if( h->frames.b_have_lowres )
        {
            int64_t luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
            for( int i = 0; i < 4; i++ )
                frame->lowres[i] = frame->buffer_lowres + frame->i_stride_lowres * PADV + PADH_ALIGN + i * luma_plane_size;

            for( int j = 0; j <= !!h->param.i_bframe; j++ )
                for( int i = 0; i <= h->param.i_bframe; i++ )
                    memset( frame->lowres_mvs[j][i], 0, 2*i_mb_count*sizeof(int16_t) );

            frame->i_intra_cost = frame->lowres_costs[0][0];
            memset( frame->i_intra_cost, -1, i_mb_count * sizeof(uint16_t) );

            if( h->param.rc.i_aq_mode )
                /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
                memset( frame->i_inv_qscale_factor, 0, i_mb_count * sizeof(uint16_t) );
        }
    }

    if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
        goto fail;
    if( x264_pthread_cond_init( &frame->cv, NULL ) )
    {
        /* The mutex was already initialised; don't leave it behind. */
        x264_pthread_mutex_destroy( &frame->mutex );
        goto fail;
    }

#if HAVE_OPENCL
    frame->opencl.ocl = h->opencl.ocl;
#endif

    return frame;

fail:
    /* `frame` is NULL if its own allocation failed. frame->base is still NULL
     * (the struct is zero-allocated) unless PREALLOC_END succeeded, in which
     * case it must be freed here or it leaks. */
    if( frame && frame->base )
        x264_free( frame->base );
    x264_free( frame );
    return NULL;
}
|
||||||
|
|
||||||
|
/* Destroy a frame and everything it owns.
 *
 * For a regular frame this frees frame->base, invokes the optional
 * user-supplied free callbacks (param, mb_info, extra SEI payloads), destroys
 * the mutex and condition variable, and finally frees the frame struct.
 * For a duplicate frame only the struct itself is freed. */
void x264_frame_delete( x264_frame_t *frame )
{
    /* Duplicate frames are blank copies of real frames (including pointers),
     * so freeing those pointers would cause a double free later. */
    if( frame->b_duplicate )
    {
        x264_free( frame );
        return;
    }

    x264_free( frame->base );

    /* Per-frame parameters: cleaned up and released only if a free callback was provided. */
    if( frame->param && frame->param->param_free )
    {
        x264_param_cleanup( frame->param );
        frame->param->param_free( frame->param );
    }

    if( frame->mb_info_free )
        frame->mb_info_free( frame->mb_info );

    if( frame->extra_sei.sei_free )
    {
        /* Release every payload buffer first, then the payload array itself. */
        int idx = 0;
        while( idx < frame->extra_sei.num_payloads )
        {
            frame->extra_sei.sei_free( frame->extra_sei.payloads[idx].payload );
            idx++;
        }
        frame->extra_sei.sei_free( frame->extra_sei.payloads );
    }

    x264_pthread_mutex_destroy( &frame->mutex );
    x264_pthread_cond_destroy( &frame->cv );
#if HAVE_OPENCL
    x264_opencl_frame_delete( frame );
#endif

    x264_free( frame );
}
|
||||||
|
|
||||||
|
/* Fetch the start pointer and stride of one plane of an input picture.
 *
 *   pix, stride     outputs: first row to read and the signed distance between rows.
 *   plane           index into src->img.plane[] / src->img.i_stride[].
 *   xshift, yshift  right-shifts applied to the configured width/height to get
 *                   this plane's dimensions.
 *
 * When X264_CSP_VFLIP is set in the picture's colorspace, the pointer is moved
 * to the last row and the stride is negated.
 * Returns 0 on success; logs an error and returns -1 if the plane width exceeds
 * the absolute stride (the outputs are still written in that case). */
static int get_plane_ptr( x264_t *h, x264_picture_t *src, uint8_t **pix, int *stride, int plane, int xshift, int yshift )
{
    const int plane_width  = h->param.i_width >> xshift;
    const int plane_height = h->param.i_height >> yshift;
    uint8_t *first_row = src->img.plane[plane];
    int row_pitch = src->img.i_stride[plane];

    if( src->img.i_csp & X264_CSP_VFLIP )
    {
        first_row += (plane_height-1) * row_pitch;
        row_pitch = -row_pitch;
    }

    *pix = first_row;
    *stride = row_pitch;

    if( plane_width > abs( row_pitch ) )
    {
        x264_log( h, X264_LOG_ERROR, "Input picture width (%d) is greater than stride (%d)\n", plane_width, row_pitch );
        return -1;
    }
    return 0;
}

/* From here on, get_plane_ptr(...) makes the *calling* function return -1 when the lookup fails. */
#define get_plane_ptr(...) do { if( get_plane_ptr(__VA_ARGS__) < 0 ) return -1; } while( 0 )
|
||||||
|
|
||||||
|
/* Import a caller-supplied picture into an internal frame.
 *
 * Validates the colorspace and bit depth of src against this build and
 * against dst, copies the per-picture metadata (forced type, QP, timestamps,
 * per-frame parameters, SEI, opaque pointer, optional macroblock info) and
 * then transfers the pixel data through the h->mc plane-copy routine that
 * matches the input layout.
 *
 * Returns 0 on success and -1 if the input is rejected. */
int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
{
    int i_csp = src->img.i_csp & X264_CSP_MASK;
    int b_high_depth = !!(src->img.i_csp & X264_CSP_HIGH_DEPTH);

    if( dst->i_csp != frame_internal_csp( i_csp ) )
    {
        x264_log( h, X264_LOG_ERROR, "Invalid input colorspace\n" );
        return -1;
    }

    /* The sample depth of the input has to match the depth of this build. */
#if HIGH_BIT_DEPTH
    if( !b_high_depth )
    {
        x264_log( h, X264_LOG_ERROR, "This build of x264 requires high depth input. Rebuild to support 8-bit input.\n" );
        return -1;
    }
#else
    if( b_high_depth )
    {
        x264_log( h, X264_LOG_ERROR, "This build of x264 requires 8-bit input. Rebuild to support high depth input.\n" );
        return -1;
    }
#endif

    if( i_csp == X264_CSP_V210 && BIT_DEPTH != 10 )
    {
        x264_log( h, X264_LOG_ERROR, "v210 input is only compatible with bit-depth of 10 bits\n" );
        return -1;
    }

    /* An out-of-range forced frame type is reported and then ignored. */
    int b_known_type = src->i_type >= X264_TYPE_AUTO && src->i_type <= X264_TYPE_KEYFRAME;
    if( !b_known_type )
        x264_log( h, X264_LOG_WARNING, "forced frame type (%d) at %d is unknown\n", src->i_type, h->frames.i_input );
    dst->i_forced_type = b_known_type ? src->i_type : X264_TYPE_AUTO;

    dst->i_type = dst->i_forced_type;
    dst->i_qpplus1 = src->i_qpplus1;
    dst->i_pts = dst->i_reordered_pts = src->i_pts;
    dst->param = src->param;
    dst->i_pic_struct = src->i_pic_struct;
    dst->extra_sei = src->extra_sei;
    dst->opaque = src->opaque;

    /* Macroblock info is only taken over when the encoder was configured for it. */
    if( h->param.analyse.b_mb_info )
    {
        dst->mb_info = src->prop.mb_info;
        dst->mb_info_free = src->prop.mb_info_free;
    }
    else
    {
        dst->mb_info = NULL;
        dst->mb_info_free = NULL;
    }

    const int width = h->param.i_width;
    const int height = h->param.i_height;
    uint8_t *pix[3];
    int stride[3];

    if( i_csp == X264_CSP_YUYV || i_csp == X264_CSP_UYVY )
    {
        /* The destination plane that receives the first output is plane 0
         * for YUYV and plane 1 for UYVY. */
        int first = i_csp == X264_CSP_UYVY;
        int second = first^1;
        h->mc.plane_copy_deinterleave_yuyv( dst->plane[first], dst->i_stride[first],
                                            dst->plane[second], dst->i_stride[second],
                                            (pixel*)src->img.plane[0], src->img.i_stride[0]/SIZEOF_PIXEL,
                                            width, height );
    }
    else if( i_csp == X264_CSP_V210 )
    {
        pix[0] = src->img.plane[0];
        stride[0] = src->img.i_stride[0];

        h->mc.plane_copy_deinterleave_v210( dst->plane[0], dst->i_stride[0],
                                            dst->plane[1], dst->i_stride[1],
                                            (uint32_t *)pix[0], stride[0]/(int)sizeof(uint32_t),
                                            width, height );
    }
    else if( i_csp >= X264_CSP_BGR )
    {
        pix[0] = src->img.plane[0];
        stride[0] = src->img.i_stride[0];
        if( src->img.i_csp & X264_CSP_VFLIP )
        {
            /* Start at the last row and walk upwards. */
            pix[0] += (height-1) * stride[0];
            stride[0] = -stride[0];
        }
        /* RGB and BGR(A) differ only in which destination plane takes the
         * first and the third output. */
        int b_rgb = i_csp==X264_CSP_RGB;
        int bytes_per_px = i_csp==X264_CSP_BGRA ? 4 : 3;
        h->mc.plane_copy_deinterleave_rgb( dst->plane[1+b_rgb], dst->i_stride[1+b_rgb],
                                           dst->plane[0], dst->i_stride[0],
                                           dst->plane[2-b_rgb], dst->i_stride[2-b_rgb],
                                           (pixel*)pix[0], stride[0]/SIZEOF_PIXEL, bytes_per_px,
                                           width, height );
    }
    else
    {
        int v_shift = CHROMA_V_SHIFT;
        int chroma_height = height>>v_shift;

        /* The first plane is always copied as-is. */
        get_plane_ptr( h, src, &pix[0], &stride[0], 0, 0, 0 );
        h->mc.plane_copy( dst->plane[0], dst->i_stride[0], (pixel*)pix[0],
                          stride[0]/SIZEOF_PIXEL, width, height );

        if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
        {
            /* Second plane is copied unchanged. */
            get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift );
            h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
                              stride[1]/SIZEOF_PIXEL, width, chroma_height );
        }
        else if( i_csp == X264_CSP_NV21 )
        {
            /* Second plane goes through the swapping copy. */
            get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift );
            h->mc.plane_copy_swap( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
                                   stride[1]/SIZEOF_PIXEL, width>>1, chroma_height );
        }
        else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_I422 || i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16 )
        {
            /* Two separate source planes are interleaved into dst plane 1;
             * the YV* layouts supply them in the opposite order. */
            int uv_swap = i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16;
            int first_src = uv_swap ? 2 : 1;
            int second_src = uv_swap ? 1 : 2;
            get_plane_ptr( h, src, &pix[1], &stride[1], first_src, 1, v_shift );
            get_plane_ptr( h, src, &pix[2], &stride[2], second_src, 1, v_shift );
            h->mc.plane_copy_interleave( dst->plane[1], dst->i_stride[1],
                                         (pixel*)pix[1], stride[1]/SIZEOF_PIXEL,
                                         (pixel*)pix[2], stride[2]/SIZEOF_PIXEL,
                                         width>>1, chroma_height );
        }
        else if( i_csp == X264_CSP_I444 || i_csp == X264_CSP_YV24 )
        {
            /* Two further full-size planes; YV24 supplies them swapped. */
            int b_i444 = i_csp==X264_CSP_I444;
            get_plane_ptr( h, src, &pix[1], &stride[1], b_i444 ? 1 : 2, 0, 0 );
            get_plane_ptr( h, src, &pix[2], &stride[2], b_i444 ? 2 : 1, 0, 0 );
            h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
                              stride[1]/SIZEOF_PIXEL, width, height );
            h->mc.plane_copy( dst->plane[2], dst->i_stride[2], (pixel*)pix[2],
                              stride[2]/SIZEOF_PIXEL, width, height );
        }
    }
    return 0;
}
|
||||||
|
|
||||||
|
/* Fill len elements of `size` bytes each, starting at dst, with the element
 * read from src.  The element is replicated into 2-, 4- and (when WORD_SIZE
 * is 8) 8-byte patterns so that most of the range is written with wide
 * stores. */
static ALWAYS_INLINE void pixel_memset( pixel *dst, pixel *src, int len, int size )
{
    uint8_t *out = (uint8_t*)dst;
    intptr_t addr = (intptr_t)out;
    uint32_t pat1 = *src;
    uint32_t pat2 = size == 1 ? pat1 + (pat1 << 8) : M16( src );
    uint32_t pat4 = size <= 2 ? pat2 + (pat2 << 16) : M32( src );
    int pos = 0;

    /* From here on len counts bytes, not elements. */
    len *= size;

    /* Leading stores that bring the destination up to word alignment. */
    if( addr & (WORD_SIZE - 1) )
    {
        if( size <= 2 && (addr & 3) )
        {
            if( size == 1 && (addr & 1) )
                out[pos++] = pat1;
            if( addr & 2 )
            {
                M16( out+pos ) = pat2;
                pos += 2;
            }
        }
        if( WORD_SIZE == 8 && (addr & 4) )
        {
            M32( out+pos ) = pat4;
            pos += 4;
        }
    }

    /* Bulk of the range: 8 bytes per store where the word size allows it,
     * then 4 bytes per store. */
    if( WORD_SIZE == 8 )
    {
        uint64_t pat8 = pat4 + ((uint64_t)pat4<<32);
        while( pos < len - 7 )
        {
            M64( out+pos ) = pat8;
            pos += 8;
        }
    }
    while( pos < len - 3 )
    {
        M32( out+pos ) = pat4;
        pos += 4;
    }

    /* Whatever is left is shorter than four bytes. */
    if( size <= 2 )
    {
        if( pos < len - 1 )
        {
            M16( out+pos ) = pat2;
            pos += 2;
        }
        if( size == 1 && pos != len )
            out[pos] = pat1;
    }
}
|
||||||
|
|
||||||
|
/* Replicate the edge samples of a plane into the padding around it.
 *
 * pix addresses the top-left sample of the i_width x i_height area, whose
 * rows are i_stride elements apart.  Each row gets i_padh elements written to
 * its left and to its right; with b_chroma set the fill unit is a pair of
 * elements rather than a single one.  When b_pad_top / b_pad_bottom are set,
 * the (already widened) first / last row is copied i_padv times above /
 * below the area. */
static ALWAYS_INLINE void plane_expand_border( pixel *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom, int b_chroma )
{
    const int fill_count = i_padh>>b_chroma;
    const int fill_size = SIZEOF_PIXEL<<b_chroma;
    const int padded_row_bytes = (i_width+2*i_padh) * SIZEOF_PIXEL;

    for( int y = 0; y < i_height; y++ )
    {
        pixel *row = pix + y*i_stride;
        /* left band */
        pixel_memset( row - i_padh, row, fill_count, fill_size );
        /* right band */
        pixel_memset( row + i_width, row + i_width-1-b_chroma, fill_count, fill_size );
    }

    /* upper band: duplicate the widened first row upwards */
    if( b_pad_top )
    {
        pixel *first = pix - i_padh;
        for( int y = 0; y < i_padv; y++ )
            memcpy( first + (-y-1)*i_stride, first, padded_row_bytes );
    }

    /* lower band: duplicate the widened last row downwards */
    if( b_pad_bottom )
    {
        pixel *last = pix - i_padh + (i_height-1)*i_stride;
        for( int y = 0; y < i_padv; y++ )
            memcpy( pix - i_padh + (i_height+y)*i_stride, last, padded_row_bytes );
    }
}
|
||||||
|
|
||||||
|
/* Extend the borders of every plane of `frame` for the macroblock row mb_y.
 *
 * The top / bottom padding is only produced for the first / last macroblock
 * row of the frame.  With SLICE_MBAFF set, odd rows are skipped and both the
 * per-field planes and the frame plane are padded. */
void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y )
{
    if( mb_y & SLICE_MBAFF )
        return;

    int pad_top = mb_y == 0;
    int pad_bot = mb_y == h->mb.i_mb_height - (1 << SLICE_MBAFF);
    int b_start = mb_y == h->i_threadslice_start;
    int b_end = mb_y == h->i_threadslice_end - (1 << SLICE_MBAFF);
    /* Rows above the macroblock row are included unless this is the first
     * row of the thread slice. */
    int starty = 16*mb_y - 4*!b_start;
    int b_extra_rows = b_end && !b_start;

    for( int i = 0; i < frame->i_plane; i++ )
    {
        int h_shift = i && CHROMA_H_SHIFT;
        int v_shift = i && CHROMA_V_SHIFT;
        int stride = frame->i_stride[i];
        int width = 16*h->mb.i_mb_width;
        int padh = PADH;
        int padv = PADV >> v_shift;
        int offset = starty*stride >> v_shift;

        int height = 16;
        if( pad_bot )
            height = 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF;
        height >>= v_shift;
        // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
        if( b_extra_rows )
            height += 4 >> (v_shift + SLICE_MBAFF);

        if( SLICE_MBAFF )
        {
            // border samples for each field are extended separately
            pixel *fld = frame->plane_fld[i] + offset;
            plane_expand_border( fld, stride*2, width, height, padh, padv, pad_top, pad_bot, h_shift );
            plane_expand_border( fld+stride, stride*2, width, height, padh, padv, pad_top, pad_bot, h_shift );

            /* The frame plane below is padded with a frame-sized height. */
            height = (pad_bot ? 16*(h->mb.i_mb_height - mb_y) : 32) >> v_shift;
            if( b_extra_rows )
                height += 4 >> v_shift;
        }

        plane_expand_border( frame->plane[i] + offset, stride, width, height, padh, padv, pad_top, pad_bot, h_shift );
    }
}
|
||||||
|
|
||||||
|
/* Extend the borders of the filtered planes (indices 1..3 of
 * frame->filtered, and of frame->filtered_fld when SLICE_MBAFF is set) for
 * macroblock row mb_y.  Only plane 0 is handled unless CHROMA444 is set, in
 * which case all three planes are. */
void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
{
    /* during filtering, 8 extra pixels were filtered on each edge,
     * but up to 3 of the horizontal ones may be wrong.
       we want to expand border from the last filtered pixel */
    const int b_start = !mb_y;
    const int width = 16*h->mb.i_mb_width + 8;
    const int padh = PADH - 4;
    const int padv = PADV - 8;
    const int planes = CHROMA444 ? 3 : 1;
    int height = 16;
    if( b_end )
        height = (16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF) + 16;

    for( int p = 0; p < planes; p++ )
    {
        int stride = frame->i_stride[p];
        for( int i = 1; i < 4; i++ )
        {
            // buffer: 8 luma, to match the hpel filter
            if( SLICE_MBAFF )
            {
                pixel *fld = frame->filtered_fld[p][i] + (16*mb_y - 16) * stride - 4;
                plane_expand_border( fld, stride*2, width, height, padh, padv, b_start, b_end, 0 );
                plane_expand_border( fld+stride, stride*2, width, height, padh, padv, b_start, b_end, 0 );
            }

            pixel *frm = frame->filtered[p][i] + (16*mb_y - 8) * stride - 4;
            plane_expand_border( frm, stride, width, height << SLICE_MBAFF, padh, padv, b_start, b_end, 0 );
        }
    }
}
|
||||||
|
|
||||||
|
/* Pad all four lowres planes of `frame` by PADH x PADV on every side. */
void x264_frame_expand_border_lowres( x264_frame_t *frame )
{
    const int stride = frame->i_stride_lowres;
    const int width = frame->i_width_lowres;
    const int lines = frame->i_lines_lowres;
    for( int i = 0; i < 4; i++ )
        plane_expand_border( frame->lowres[i], stride, width, lines, PADH, PADV, 1, 1, 0 );
}
|
||||||
|
|
||||||
|
/* Pad one whole plane of `frame` on all four sides, using the chroma
 * horizontal/vertical shifts for the fill unit, height and vertical pad. */
void x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane )
{
    int v_shift = CHROMA_V_SHIFT;
    int width = 16*h->mb.i_mb_width;
    int height = 16*h->mb.i_mb_height>>v_shift;
    plane_expand_border( frame->plane[plane], frame->i_stride[plane], width, height,
                         PADH, PADV>>v_shift, 1, 1, CHROMA_H_SHIFT );
}
|
||||||
|
|
||||||
|
/* Fill the area between the configured picture size and the macroblock-
 * aligned size (16 * i_mb_width by 16 * i_mb_height) in every plane by
 * repeating the last column and the last row(s) of the picture. */
void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
{
    const int pic_width = h->param.i_width;
    const int pad_x = h->mb.i_mb_width * 16 - h->param.i_width;

    for( int i = 0; i < frame->i_plane; i++ )
    {
        int h_shift = i && CHROMA_H_SHIFT;
        int v_shift = i && CHROMA_V_SHIFT;
        int stride = frame->i_stride[i];
        int rows = h->param.i_height >> v_shift;
        int pad_y = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
        pixel *plane = frame->plane[i];

        /* Right edge: repeat the last element (or element pair when
         * h_shift is set) of each row. */
        if( pad_x )
        {
            for( int y = 0; y < rows; y++ )
            {
                pixel *row_end = plane + y*stride + pic_width;
                pixel_memset( row_end, row_end - 1-h_shift, pad_x>>h_shift, SIZEOF_PIXEL<<h_shift );
            }
        }

        /* Bottom edge: copy an existing row into each padding row.  With
         * PARAM_INTERLACED set, even padding rows take the second-to-last
         * picture row instead of the last one. */
        if( pad_y )
        {
            for( int y = rows; y < rows + pad_y; y++ )
            {
                int from = rows-(~y&PARAM_INTERLACED)-1;
                memcpy( plane + y*stride, plane + from*stride, (pic_width + pad_x) * SIZEOF_PIXEL );
            }
        }
    }
}
|
||||||
|
|
||||||
|
/* For the 16-element-wide column starting at 16*mb_x, copy the last picture
 * row of every h->fenc plane into the rows between the configured height and
 * the macroblock-aligned height.  mb_y is not used. */
void x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y )
{
    const int planes = h->fenc->i_plane;
    for( int i = 0; i < planes; i++ )
    {
        int v_shift = i && CHROMA_V_SHIFT;
        int stride = h->fenc->i_stride[i];
        int height = h->param.i_height >> v_shift;
        int pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
        pixel *column = h->fenc->plane[i] + 16*mb_x;
        int end = height + pady;

        for( int y = height; y < end; y++ )
            memcpy( column + y*stride, column + (height-1)*stride, 16*SIZEOF_PIXEL );
    }
}
|
||||||
|
|
||||||
|
/* threading */
|
||||||
|
/* Record a new progress value for `frame` and wake every thread waiting on
 * its condition variable.  The store and the broadcast both happen with the
 * frame mutex held. */
void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
{
    x264_pthread_mutex_t *lock = &frame->mutex;

    x264_pthread_mutex_lock( lock );
    frame->i_lines_completed = i_lines_completed;
    x264_pthread_cond_broadcast( &frame->cv );
    x264_pthread_mutex_unlock( lock );
}
|
||||||
|
|
||||||
|
/* Block until frame->i_lines_completed has reached i_lines_completed and
 * return the value that was observed.  A negative target never waits: the
 * current value is returned straight away. */
int x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
{
    int seen;

    x264_pthread_mutex_lock( &frame->mutex );
    for( ;; )
    {
        seen = frame->i_lines_completed;
        if( seen >= i_lines_completed || i_lines_completed < 0 )
            break;
        x264_pthread_cond_wait( &frame->cv, &frame->mutex );
    }
    x264_pthread_mutex_unlock( &frame->mutex );

    return seen;
}
|
||||||
|
|
||||||
|
/* Store `pass` in h->i_threadslice_pass under the encoder mutex.  Waiters on
 * h->cv are only woken for a positive pass number. */
void x264_threadslice_cond_broadcast( x264_t *h, int pass )
{
    x264_pthread_mutex_lock( &h->mutex );

    h->i_threadslice_pass = pass;
    if( pass > 0 )
    {
        x264_pthread_cond_broadcast( &h->cv );
    }

    x264_pthread_mutex_unlock( &h->mutex );
}
|
||||||
|
|
||||||
|
/* Block until h->i_threadslice_pass is at least `pass`. */
void x264_threadslice_cond_wait( x264_t *h, int pass )
{
    x264_pthread_mutex_lock( &h->mutex );
    for( ; h->i_threadslice_pass < pass; )
    {
        x264_pthread_cond_wait( &h->cv, &h->mutex );
    }
    x264_pthread_mutex_unlock( &h->mutex );
}
|
||||||
|
|
||||||
|
/* Account for one more slice in `frame`.
 *
 * When h->param.i_slice_count_max is zero nothing is counted and 0 is
 * returned.  Otherwise frame->i_slice_count is incremented (through
 * x264_pthread_fetch_and_add when sliced threads are enabled) and -1 is
 * returned if its value before the increment had already reached the
 * maximum; 0 otherwise. */
int x264_frame_new_slice( x264_t *h, x264_frame_t *frame )
{
    if( !h->param.i_slice_count_max )
        return 0;

    int previous;
    if( h->param.b_sliced_threads )
        previous = x264_pthread_fetch_and_add( &frame->i_slice_count, 1, &frame->mutex );
    else
    {
        previous = frame->i_slice_count;
        frame->i_slice_count = previous + 1;
    }

    return previous >= h->param.i_slice_count_max ? -1 : 0;
}
|
||||||
|
|
||||||
|
/* list operators */
|
||||||
|
|
||||||
|
/* Append `frame` to a NULL-terminated list by storing it in the first NULL
 * slot.  The caller must make sure there is room for it. */
void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
{
    x264_frame_t **slot = list;
    while( *slot )
        slot++;
    *slot = frame;
}
|
||||||
|
|
||||||
|
/* Remove and return the last entry of a NULL-terminated list.  The list must
 * not be empty (asserted). */
x264_frame_t *x264_frame_pop( x264_frame_t **list )
{
    assert( list[0] );

    /* Advance to the entry that is directly followed by the terminator. */
    x264_frame_t **last = list;
    while( last[1] )
        last++;

    x264_frame_t *frame = *last;
    *last = NULL;
    return frame;
}
|
||||||
|
|
||||||
|
/* Insert `frame` at the front of a NULL-terminated list, moving every
 * existing entry up by one slot. */
void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
{
    int count = 0;
    while( list[count] )
        count++;

    /* Move from the back so that no entry is overwritten before it is read. */
    for( int j = count; j > 0; j-- )
        list[j] = list[j-1];

    list[0] = frame;
}
|
||||||
|
|
||||||
|
/* Remove and return the first entry of a NULL-terminated list, moving the
 * remaining entries (and the terminator) down by one slot.  The list must
 * not be empty (asserted after the shift). */
x264_frame_t *x264_frame_shift( x264_frame_t **list )
{
    x264_frame_t *head = list[0];

    for( x264_frame_t **p = list; *p; p++ )
        p[0] = p[1];

    assert(head);
    return head;
}
|
||||||
|
|
||||||
|
/* Drop one reference to `frame`.  When the count reaches zero the frame is
 * appended to the unused list selected by frame->b_fdec. */
void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
{
    assert( frame->i_reference_count > 0 );

    if( --frame->i_reference_count == 0 )
    {
        x264_frame_push( h->frames.unused[frame->b_fdec], frame );
    }
}
|
||||||
|
|
||||||
|
/* Obtain a frame for reuse: the last entry of h->frames.unused[b_fdec] if
 * that list is not empty, otherwise a fresh one from frame_new().  The
 * per-use state is reset before the frame is returned with a reference
 * count of 1.  Returns NULL if no frame could be obtained. */
x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
{
    x264_frame_t **pool = h->frames.unused[b_fdec];
    x264_frame_t *frame = pool[0] ? x264_frame_pop( pool ) : frame_new( h, b_fdec );
    if( !frame )
        return NULL;

    frame->b_last_minigop_bframe = 0;
    frame->i_reference_count = 1;
    frame->b_intra_calculated = 0;
    frame->b_scenecut = 1;
    frame->b_keyframe = 0;
    frame->b_corrupt = 0;
    /* With sliced threads the slice counter starts at the thread count. */
    if( h->param.b_sliced_threads )
        frame->i_slice_count = h->param.i_threads;
    else
        frame->i_slice_count = 1;

    memset( frame->weight, 0, sizeof(frame->weight) );
    memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );

    return frame;
}
|
||||||
|
|
||||||
|
/* Drop one reference to a blank frame.  When the count reaches zero the
 * frame is appended to h->frames.blank_unused. */
void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
{
    assert( frame->i_reference_count > 0 );

    if( --frame->i_reference_count == 0 )
    {
        x264_frame_push( h->frames.blank_unused, frame );
    }
}
|
||||||
|
|
||||||
|
/* Obtain a blank frame: the last entry of h->frames.blank_unused if there is
 * one, otherwise a newly allocated x264_frame_t.  Only b_duplicate and
 * i_reference_count are initialised here; a freshly allocated frame is
 * otherwise left unset.  Returns NULL if allocation fails. */
x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
{
    x264_frame_t **pool = h->frames.blank_unused;
    x264_frame_t *frame = pool[0] ? x264_frame_pop( pool )
                                  : x264_malloc( sizeof(x264_frame_t) );
    if( !frame )
        return NULL;

    frame->b_duplicate = 1;
    frame->i_reference_count = 1;
    return frame;
}
|
||||||
|
|
||||||
|
/* Apply the weight function(s) of `w` to a whole plane, reading from src and
 * writing to dst.  The plane is processed in strips of at most 16 rows; each
 * strip is covered by 16-wide calls (w->weightfn[16>>2]) followed, if columns
 * remain, by one 8-wide call (w->weightfn[8>>2]). */
void x264_weight_scale_plane( x264_t *h, pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
                              int i_width, int i_height, x264_weight_t *w )
{
    /* Weight horizontal strips of height 16. This was found to be the optimal height
     * in terms of the cache loads. */
    for( ; i_height > 0; i_height -= 16, dst += 16 * i_dst_stride, src += 16 * i_src_stride )
    {
        int strip_rows = X264_MIN( i_height, 16 );
        int x = 0;

        while( x < i_width-8 )
        {
            w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, strip_rows );
            x += 16;
        }
        if( x < i_width )
            w->weightfn[ 8>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, strip_rows );
    }
}
|
||||||
|
|
||||||
|
/* Delete every frame in a NULL-terminated list and then free the list array
 * itself.  A NULL list is accepted and ignored. */
void x264_frame_delete_list( x264_frame_t **list )
{
    if( !list )
        return;

    for( x264_frame_t **entry = list; *entry; entry++ )
        x264_frame_delete( *entry );

    x264_free( list );
}
|
||||||
|
|
||||||
|
/* Initialise a synchronized frame list able to hold max_size frames.
 *
 * The list array is allocated zeroed with one extra slot, and the mutex and
 * the two condition variables are initialised.
 *
 * Returns 0 on success and -1 on failure (negative max_size, allocation
 * failure, or failure to initialise a synchronisation primitive).  On
 * failure nothing stays allocated or initialised: primitives that were
 * already set up are destroyed again, and the list array is freed and
 * slist->list reset to NULL. */
int x264_sync_frame_list_init( x264_sync_frame_list_t *slist, int max_size )
{
    if( max_size < 0 )
        return -1;
    slist->i_max_size = max_size;
    slist->i_size = 0;
    CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );

    /* Initialise one primitive at a time so that a failure can unwind
     * exactly what has been set up so far. */
    if( x264_pthread_mutex_init( &slist->mutex, NULL ) )
        goto fail_list;
    if( x264_pthread_cond_init( &slist->cv_fill, NULL ) )
        goto fail_mutex;
    if( x264_pthread_cond_init( &slist->cv_empty, NULL ) )
        goto fail_cv_fill;
    return 0;

fail_cv_fill:
    x264_pthread_cond_destroy( &slist->cv_fill );
fail_mutex:
    x264_pthread_mutex_destroy( &slist->mutex );
fail_list:
    /* Previously the array was leaked on this path. */
    x264_free( slist->list );
    slist->list = NULL;
fail:
    return -1;
}
|
||||||
|
|
||||||
|
/* Tear down a synchronized frame list: destroy its mutex and both condition
 * variables, then delete all frames still in the list together with the list
 * array. */
void x264_sync_frame_list_delete( x264_sync_frame_list_t *slist )
{
    /* synchronisation primitives first */
    x264_pthread_mutex_destroy( &slist->mutex );
    x264_pthread_cond_destroy( &slist->cv_fill );
    x264_pthread_cond_destroy( &slist->cv_empty );

    /* then the frames and the array holding them */
    x264_frame_delete_list( slist->list );
}
|
||||||
|
|
||||||
|
/* Append `frame` to a synchronized list, blocking on cv_empty while the list
 * is full.  cv_fill is broadcast after the mutex has been released. */
void x264_sync_frame_list_push( x264_sync_frame_list_t *slist, x264_frame_t *frame )
{
    x264_pthread_mutex_lock( &slist->mutex );

    for( ; slist->i_size == slist->i_max_size; )
        x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );

    slist->list[ slist->i_size ] = frame;
    slist->i_size++;

    x264_pthread_mutex_unlock( &slist->mutex );
    x264_pthread_cond_broadcast( &slist->cv_fill );
}
|
||||||
|
|
||||||
|
/* Remove and return the most recently pushed frame of a synchronized list,
 * blocking on cv_fill while the list is empty.  cv_empty is broadcast before
 * the mutex is released. */
x264_frame_t *x264_sync_frame_list_pop( x264_sync_frame_list_t *slist )
{
    x264_pthread_mutex_lock( &slist->mutex );

    for( ; !slist->i_size; )
        x264_pthread_cond_wait( &slist->cv_fill, &slist->mutex );

    /* Take the last entry and clear its slot. */
    slist->i_size--;
    x264_frame_t *frame = slist->list[ slist->i_size ];
    slist->list[ slist->i_size ] = NULL;

    x264_pthread_cond_broadcast( &slist->cv_empty );
    x264_pthread_mutex_unlock( &slist->mutex );

    return frame;
}
|
||||||
297
common/frame.h
Normal file
297
common/frame.h
Normal file
@@ -0,0 +1,297 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* frame.h: frame handling
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2003-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
|
||||||
|
* Loren Merritt <lorenm@u.washington.edu>
|
||||||
|
* Fiona Glaser <fiona@x264.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_FRAME_H
|
||||||
|
#define X264_FRAME_H
|
||||||
|
|
||||||
|
/* number of pixels past the edge of the frame, for motion estimation/compensation */
|
||||||
|
#define PADH 32
|
||||||
|
#define PADV 32
|
||||||
|
#define PADH_ALIGN X264_MAX( PADH, NATIVE_ALIGN / SIZEOF_PIXEL )
|
||||||
|
#define PADH2 (PADH_ALIGN + PADH)
|
||||||
|
|
||||||
|
typedef struct x264_frame
|
||||||
|
{
|
||||||
|
/* */
|
||||||
|
uint8_t *base; /* Base pointer for all malloced data in this frame. */
|
||||||
|
int i_poc;
|
||||||
|
int i_delta_poc[2];
|
||||||
|
int i_type;
|
||||||
|
int i_forced_type;
|
||||||
|
int i_qpplus1;
|
||||||
|
int64_t i_pts;
|
||||||
|
int64_t i_dts;
|
||||||
|
int64_t i_reordered_pts;
|
||||||
|
int64_t i_duration; /* in SPS time_scale units (i.e 2 * timebase units) used for vfr */
|
||||||
|
float f_duration; /* in seconds */
|
||||||
|
int64_t i_cpb_duration;
|
||||||
|
int64_t i_cpb_delay; /* in SPS time_scale units (i.e 2 * timebase units) */
|
||||||
|
int64_t i_dpb_output_delay;
|
||||||
|
x264_param_t *param;
|
||||||
|
|
||||||
|
int i_frame; /* Presentation frame number */
|
||||||
|
int i_coded; /* Coded frame number */
|
||||||
|
int64_t i_field_cnt; /* Presentation field count */
|
||||||
|
int i_frame_num; /* 7.4.3 frame_num */
|
||||||
|
int b_kept_as_ref;
|
||||||
|
int i_pic_struct;
|
||||||
|
int b_keyframe;
|
||||||
|
uint8_t b_fdec;
|
||||||
|
uint8_t b_last_minigop_bframe; /* this frame is the last b in a sequence of bframes */
|
||||||
|
uint8_t i_bframes; /* number of bframes following this nonb in coded order */
|
||||||
|
float f_qp_avg_rc; /* QPs as decided by ratecontrol */
|
||||||
|
float f_qp_avg_aq; /* QPs as decided by AQ in addition to ratecontrol */
|
||||||
|
float f_crf_avg; /* Average effective CRF for this frame */
|
||||||
|
int i_poc_l0ref0; /* poc of first refframe in L0, used to check if direct temporal is possible */
|
||||||
|
|
||||||
|
/* YUV buffer */
|
||||||
|
int i_csp; /* Internal csp */
|
||||||
|
int i_plane;
|
||||||
|
int i_stride[3];
|
||||||
|
int i_width[3];
|
||||||
|
int i_lines[3];
|
||||||
|
int i_stride_lowres;
|
||||||
|
int i_width_lowres;
|
||||||
|
int i_lines_lowres;
|
||||||
|
pixel *plane[3];
|
||||||
|
pixel *plane_fld[3];
|
||||||
|
pixel *filtered[3][4]; /* plane[0], H, V, HV */
|
||||||
|
pixel *filtered_fld[3][4];
|
||||||
|
pixel *lowres[4]; /* half-size copy of input frame: Orig, H, V, HV */
|
||||||
|
uint16_t *integral;
|
||||||
|
|
||||||
|
/* for unrestricted mv we allocate more data than needed
|
||||||
|
* allocated data are stored in buffer */
|
||||||
|
pixel *buffer[4];
|
||||||
|
pixel *buffer_fld[4];
|
||||||
|
pixel *buffer_lowres;
|
||||||
|
|
||||||
|
x264_weight_t weight[X264_REF_MAX][3]; /* [ref_index][plane] */
|
||||||
|
pixel *weighted[X264_REF_MAX]; /* plane[0] weighted of the reference frames */
|
||||||
|
int b_duplicate;
|
||||||
|
struct x264_frame *orig;
|
||||||
|
|
||||||
|
/* motion data */
|
||||||
|
int8_t *mb_type;
|
||||||
|
uint8_t *mb_partition;
|
||||||
|
int16_t (*mv[2])[2];
|
||||||
|
int16_t (*mv16x16)[2];
|
||||||
|
int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
|
||||||
|
uint8_t *field;
|
||||||
|
uint8_t *effective_qp;
|
||||||
|
|
||||||
|
/* Stored as (lists_used << LOWRES_COST_SHIFT) + (cost).
|
||||||
|
* Doesn't need special addressing for intra cost because
|
||||||
|
* lists_used is guaranteed to be zero in that cast. */
|
||||||
|
uint16_t (*lowres_costs[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
|
||||||
|
#define LOWRES_COST_MASK ((1<<14)-1)
|
||||||
|
#define LOWRES_COST_SHIFT 14
|
||||||
|
|
||||||
|
int *lowres_mv_costs[2][X264_BFRAME_MAX+1];
|
||||||
|
int8_t *ref[2];
|
||||||
|
int i_ref[2];
|
||||||
|
int ref_poc[2][X264_REF_MAX];
|
||||||
|
int16_t inv_ref_poc[2]; // inverse values of ref0 poc to avoid divisions in temporal MV prediction
|
||||||
|
|
||||||
|
/* for adaptive B-frame decision.
|
||||||
|
* contains the SATD cost of the lowres frame encoded in various modes
|
||||||
|
* FIXME: how big an array do we need? */
|
||||||
|
int i_cost_est[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
|
||||||
|
int i_cost_est_aq[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
|
||||||
|
int i_satd; // the i_cost_est of the selected frametype
|
||||||
|
int i_intra_mbs[X264_BFRAME_MAX+2];
|
||||||
|
int *i_row_satds[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
|
||||||
|
int *i_row_satd;
|
||||||
|
int *i_row_bits;
|
||||||
|
float *f_row_qp;
|
||||||
|
float *f_row_qscale;
|
||||||
|
float *f_qp_offset;
|
||||||
|
float *f_qp_offset_aq;
|
||||||
|
int b_intra_calculated;
|
||||||
|
uint16_t *i_intra_cost;
|
||||||
|
uint16_t *i_propagate_cost;
|
||||||
|
uint16_t *i_inv_qscale_factor;
|
||||||
|
int b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */
|
||||||
|
float f_weighted_cost_delta[X264_BFRAME_MAX+2];
|
||||||
|
uint32_t i_pixel_sum[3];
|
||||||
|
uint64_t i_pixel_ssd[3];
|
||||||
|
|
||||||
|
/* hrd */
|
||||||
|
x264_hrd_t hrd_timing;
|
||||||
|
|
||||||
|
/* vbv */
|
||||||
|
uint8_t i_planned_type[X264_LOOKAHEAD_MAX+1];
|
||||||
|
int i_planned_satd[X264_LOOKAHEAD_MAX+1];
|
||||||
|
double f_planned_cpb_duration[X264_LOOKAHEAD_MAX+1];
|
||||||
|
int64_t i_coded_fields_lookahead;
|
||||||
|
int64_t i_cpb_delay_lookahead;
|
||||||
|
|
||||||
|
/* threading */
|
||||||
|
int i_lines_completed; /* in pixels */
|
||||||
|
int i_lines_weighted; /* FIXME: this only supports weighting of one reference frame */
|
||||||
|
int i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */
|
||||||
|
x264_pthread_mutex_t mutex;
|
||||||
|
x264_pthread_cond_t cv;
|
||||||
|
int i_slice_count; /* Atomically written to/read from with slice threads */
|
||||||
|
|
||||||
|
/* periodic intra refresh */
|
||||||
|
float f_pir_position;
|
||||||
|
int i_pir_start_col;
|
||||||
|
int i_pir_end_col;
|
||||||
|
int i_frames_since_pir;
|
||||||
|
|
||||||
|
/* interactive encoder control */
|
||||||
|
int b_corrupt;
|
||||||
|
|
||||||
|
/* user sei */
|
||||||
|
x264_sei_t extra_sei;
|
||||||
|
|
||||||
|
/* user data */
|
||||||
|
void *opaque;
|
||||||
|
|
||||||
|
/* user frame properties */
|
||||||
|
uint8_t *mb_info;
|
||||||
|
void (*mb_info_free)( void* );
|
||||||
|
|
||||||
|
#if HAVE_OPENCL
|
||||||
|
x264_frame_opencl_t opencl;
|
||||||
|
#endif
|
||||||
|
} x264_frame_t;
|
||||||
|
|
||||||
|
/* synchronized frame list */
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
x264_frame_t **list;
|
||||||
|
int i_max_size;
|
||||||
|
int i_size;
|
||||||
|
x264_pthread_mutex_t mutex;
|
||||||
|
x264_pthread_cond_t cv_fill; /* event signaling that the list became fuller */
|
||||||
|
x264_pthread_cond_t cv_empty; /* event signaling that the list became emptier */
|
||||||
|
} x264_sync_frame_list_t;
|
||||||
|
|
||||||
|
typedef void (*x264_deblock_inter_t)( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
typedef void (*x264_deblock_intra_t)( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
x264_deblock_inter_t deblock_luma[2];
|
||||||
|
x264_deblock_inter_t deblock_chroma[2];
|
||||||
|
x264_deblock_inter_t deblock_h_chroma_420;
|
||||||
|
x264_deblock_inter_t deblock_h_chroma_422;
|
||||||
|
x264_deblock_intra_t deblock_luma_intra[2];
|
||||||
|
x264_deblock_intra_t deblock_chroma_intra[2];
|
||||||
|
x264_deblock_intra_t deblock_h_chroma_420_intra;
|
||||||
|
x264_deblock_intra_t deblock_h_chroma_422_intra;
|
||||||
|
x264_deblock_inter_t deblock_luma_mbaff;
|
||||||
|
x264_deblock_inter_t deblock_chroma_mbaff;
|
||||||
|
x264_deblock_inter_t deblock_chroma_420_mbaff;
|
||||||
|
x264_deblock_inter_t deblock_chroma_422_mbaff;
|
||||||
|
x264_deblock_intra_t deblock_luma_intra_mbaff;
|
||||||
|
x264_deblock_intra_t deblock_chroma_intra_mbaff;
|
||||||
|
x264_deblock_intra_t deblock_chroma_420_intra_mbaff;
|
||||||
|
x264_deblock_intra_t deblock_chroma_422_intra_mbaff;
|
||||||
|
void (*deblock_strength)( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
|
||||||
|
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit,
|
||||||
|
int bframe );
|
||||||
|
} x264_deblock_function_t;
|
||||||
|
|
||||||
|
#define x264_frame_delete x264_template(frame_delete)
|
||||||
|
void x264_frame_delete( x264_frame_t *frame );
|
||||||
|
|
||||||
|
#define x264_frame_copy_picture x264_template(frame_copy_picture)
|
||||||
|
int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src );
|
||||||
|
|
||||||
|
#define x264_frame_expand_border x264_template(frame_expand_border)
|
||||||
|
void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y );
|
||||||
|
#define x264_frame_expand_border_filtered x264_template(frame_expand_border_filtered)
|
||||||
|
void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
|
||||||
|
#define x264_frame_expand_border_lowres x264_template(frame_expand_border_lowres)
|
||||||
|
void x264_frame_expand_border_lowres( x264_frame_t *frame );
|
||||||
|
#define x264_frame_expand_border_chroma x264_template(frame_expand_border_chroma)
|
||||||
|
void x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane );
|
||||||
|
#define x264_frame_expand_border_mod16 x264_template(frame_expand_border_mod16)
|
||||||
|
void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame );
|
||||||
|
#define x264_expand_border_mbpair x264_template(expand_border_mbpair)
|
||||||
|
void x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y );
|
||||||
|
|
||||||
|
#define x264_frame_deblock_row x264_template(frame_deblock_row)
|
||||||
|
void x264_frame_deblock_row( x264_t *h, int mb_y );
|
||||||
|
#define x264_macroblock_deblock x264_template(macroblock_deblock)
|
||||||
|
void x264_macroblock_deblock( x264_t *h );
|
||||||
|
|
||||||
|
#define x264_frame_filter x264_template(frame_filter)
|
||||||
|
void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
|
||||||
|
#define x264_frame_init_lowres x264_template(frame_init_lowres)
|
||||||
|
void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame );
|
||||||
|
|
||||||
|
#define x264_deblock_init x264_template(deblock_init)
|
||||||
|
void x264_deblock_init( uint32_t cpu, x264_deblock_function_t *pf, int b_mbaff );
|
||||||
|
|
||||||
|
#define x264_frame_cond_broadcast x264_template(frame_cond_broadcast)
|
||||||
|
void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed );
|
||||||
|
#define x264_frame_cond_wait x264_template(frame_cond_wait)
|
||||||
|
int x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed );
|
||||||
|
#define x264_frame_new_slice x264_template(frame_new_slice)
|
||||||
|
int x264_frame_new_slice( x264_t *h, x264_frame_t *frame );
|
||||||
|
|
||||||
|
#define x264_threadslice_cond_broadcast x264_template(threadslice_cond_broadcast)
|
||||||
|
void x264_threadslice_cond_broadcast( x264_t *h, int pass );
|
||||||
|
#define x264_threadslice_cond_wait x264_template(threadslice_cond_wait)
|
||||||
|
void x264_threadslice_cond_wait( x264_t *h, int pass );
|
||||||
|
|
||||||
|
#define x264_frame_push x264_template(frame_push)
|
||||||
|
X264_API void x264_frame_push( x264_frame_t **list, x264_frame_t *frame );
|
||||||
|
#define x264_frame_pop x264_template(frame_pop)
|
||||||
|
X264_API x264_frame_t *x264_frame_pop( x264_frame_t **list );
|
||||||
|
#define x264_frame_unshift x264_template(frame_unshift)
|
||||||
|
X264_API void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame );
|
||||||
|
#define x264_frame_shift x264_template(frame_shift)
|
||||||
|
X264_API x264_frame_t *x264_frame_shift( x264_frame_t **list );
|
||||||
|
|
||||||
|
#define x264_frame_push_unused x264_template(frame_push_unused)
|
||||||
|
void x264_frame_push_unused( x264_t *h, x264_frame_t *frame );
|
||||||
|
#define x264_frame_push_blank_unused x264_template(frame_push_blank_unused)
|
||||||
|
void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame );
|
||||||
|
#define x264_frame_pop_blank_unused x264_template(frame_pop_blank_unused)
|
||||||
|
x264_frame_t *x264_frame_pop_blank_unused( x264_t *h );
|
||||||
|
#define x264_weight_scale_plane x264_template(weight_scale_plane)
|
||||||
|
void x264_weight_scale_plane( x264_t *h, pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
|
||||||
|
int i_width, int i_height, x264_weight_t *w );
|
||||||
|
#define x264_frame_pop_unused x264_template(frame_pop_unused)
|
||||||
|
x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec );
|
||||||
|
#define x264_frame_delete_list x264_template(frame_delete_list)
|
||||||
|
void x264_frame_delete_list( x264_frame_t **list );
|
||||||
|
|
||||||
|
#define x264_sync_frame_list_init x264_template(sync_frame_list_init)
|
||||||
|
int x264_sync_frame_list_init( x264_sync_frame_list_t *slist, int nelem );
|
||||||
|
#define x264_sync_frame_list_delete x264_template(sync_frame_list_delete)
|
||||||
|
void x264_sync_frame_list_delete( x264_sync_frame_list_t *slist );
|
||||||
|
#define x264_sync_frame_list_push x264_template(sync_frame_list_push)
|
||||||
|
void x264_sync_frame_list_push( x264_sync_frame_list_t *slist, x264_frame_t *frame );
|
||||||
|
#define x264_sync_frame_list_pop x264_template(sync_frame_list_pop)
|
||||||
|
x264_frame_t *x264_sync_frame_list_pop( x264_sync_frame_list_t *slist );
|
||||||
|
|
||||||
|
#endif
|
||||||
2016
common/loongarch/dct-a.S
Normal file
2016
common/loongarch/dct-a.S
Normal file
File diff suppressed because it is too large
Load Diff
95
common/loongarch/dct.h
Normal file
95
common/loongarch/dct.h
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* dct.h: loongarch transform and zigzag
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2023-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Peng Zhou <zhoupeng@loongson.cn>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_LOONGARCH_DCT_H
|
||||||
|
#define X264_LOONGARCH_DCT_H
|
||||||
|
|
||||||
|
#define x264_sub8x8_dct_lasx x264_template(sub8x8_dct_lasx)
|
||||||
|
void x264_sub8x8_dct_lasx( int16_t p_dst[4][16], uint8_t *p_src, uint8_t *p_ref );
|
||||||
|
#define x264_sub16x16_dct_lasx x264_template(sub16x16_dct_lasx)
|
||||||
|
void x264_sub16x16_dct_lasx( int16_t p_dst[16][16], uint8_t *p_src, uint8_t *p_ref );
|
||||||
|
|
||||||
|
#define x264_sub8x8_dct8_lsx x264_template(sub8x8_dct8_lsx)
|
||||||
|
void x264_sub8x8_dct8_lsx( int16_t pi_dct[64], uint8_t *p_pix1, uint8_t *p_pix2 );
|
||||||
|
#define x264_sub16x16_dct8_lasx x264_template(sub16x16_dct8_lasx)
|
||||||
|
void x264_sub16x16_dct8_lasx( int16_t pi_dct[4][64], uint8_t *p_pix1,
|
||||||
|
uint8_t *p_pix2 );
|
||||||
|
|
||||||
|
#define x264_add4x4_idct_lsx x264_template(add4x4_idct_lsx)
|
||||||
|
void x264_add4x4_idct_lsx( uint8_t *p_dst, int16_t pi_dct[16] );
|
||||||
|
#define x264_add8x8_idct_lasx x264_template(add8x8_idct_lasx)
|
||||||
|
void x264_add8x8_idct_lasx( uint8_t *p_dst, int16_t pi_dct[4][16] );
|
||||||
|
#define x264_add16x16_idct_lasx x264_template(add16x16_idct_lasx)
|
||||||
|
void x264_add16x16_idct_lasx( uint8_t *p_dst, int16_t pi_dct[16][16] );
|
||||||
|
#define x264_add8x8_idct8_lasx x264_template(add8x8_idct8_lasx)
|
||||||
|
void x264_add8x8_idct8_lasx( uint8_t *p_dst, int16_t pi_dct[64] );
|
||||||
|
#define x264_add8x8_idct_dc_lasx x264_template(add8x8_idct_dc_lasx)
|
||||||
|
void x264_add8x8_idct_dc_lasx( uint8_t *p_dst, int16_t dct[4] );
|
||||||
|
#define x264_add16x16_idct_dc_lasx x264_template(add16x16_idct_dc_lasx)
|
||||||
|
void x264_add16x16_idct_dc_lasx( uint8_t *p_dst, int16_t dct[16] );
|
||||||
|
|
||||||
|
#define x264_idct4x4dc_lasx x264_template(idct4x4dc_lasx)
|
||||||
|
void x264_idct4x4dc_lasx( int16_t d[16] );
|
||||||
|
#define x264_dct4x4dc_lasx x264_template(dct4x4dc_lasx)
|
||||||
|
void x264_dct4x4dc_lasx( int16_t d[16] );
|
||||||
|
|
||||||
|
#define x264_zigzag_scan_4x4_frame_lasx x264_template(zigzag_scan_4x4_frame_lasx)
|
||||||
|
void x264_zigzag_scan_4x4_frame_lasx( int16_t level[16], int16_t dct[16] );
|
||||||
|
|
||||||
|
#define x264_sub4x4_dct_lsx x264_template(sub4x4_dct_lsx)
|
||||||
|
void x264_sub4x4_dct_lsx( int16_t p_dst[16], uint8_t *p_src, uint8_t *p_ref );
|
||||||
|
#define x264_sub8x8_dct_lsx x264_template(sub8x8_dct_lsx)
|
||||||
|
void x264_sub8x8_dct_lsx( int16_t p_dst[4][16], uint8_t *p_src, uint8_t *p_ref );
|
||||||
|
#define x264_sub16x16_dct_lsx x264_template(sub16x16_dct_lsx)
|
||||||
|
void x264_sub16x16_dct_lsx( int16_t p_dst[16][16], uint8_t *p_src, uint8_t *p_ref );
|
||||||
|
|
||||||
|
#define x264_sub8x8_dct8_lsx x264_template(sub8x8_dct8_lsx)
|
||||||
|
void x264_sub8x8_dct8_lsx( int16_t pi_dct[64], uint8_t *p_pix1, uint8_t *p_pix2 );
|
||||||
|
#define x264_sub16x16_dct8_lsx x264_template(sub16x16_dct8_lsx)
|
||||||
|
void x264_sub16x16_dct8_lsx( int16_t pi_dct[4][64], uint8_t *p_pix1,
|
||||||
|
uint8_t *p_pix2 );
|
||||||
|
|
||||||
|
#define x264_add4x4_idct_lsx x264_template(add4x4_idct_lsx)
|
||||||
|
void x264_add4x4_idct_lsx( uint8_t *p_dst, int16_t pi_dct[16] );
|
||||||
|
#define x264_add8x8_idct_lsx x264_template(add8x8_idct_lsx)
|
||||||
|
void x264_add8x8_idct_lsx( uint8_t *p_dst, int16_t pi_dct[4][16] );
|
||||||
|
#define x264_add16x16_idct_lsx x264_template(add16x16_idct_lsx)
|
||||||
|
void x264_add16x16_idct_lsx( uint8_t *p_dst, int16_t pi_dct[16][16] );
|
||||||
|
#define x264_add8x8_idct8_lsx x264_template(add8x8_idct8_lsx)
|
||||||
|
void x264_add8x8_idct8_lsx( uint8_t *p_dst, int16_t pi_dct[64] );
|
||||||
|
#define x264_add8x8_idct_dc_lsx x264_template(add8x8_idct_dc_lsx)
|
||||||
|
void x264_add8x8_idct_dc_lsx( uint8_t *p_dst, int16_t dct[4] );
|
||||||
|
#define x264_add16x16_idct_dc_lsx x264_template(add16x16_idct_dc_lsx)
|
||||||
|
void x264_add16x16_idct_dc_lsx( uint8_t *p_dst, int16_t dct[16] );
|
||||||
|
|
||||||
|
#define x264_idct4x4dc_lsx x264_template(idct4x4dc_lsx)
|
||||||
|
void x264_idct4x4dc_lsx( int16_t d[16] );
|
||||||
|
#define x264_dct4x4dc_lsx x264_template(dct4x4dc_lsx)
|
||||||
|
void x264_dct4x4dc_lsx( int16_t d[16] );
|
||||||
|
|
||||||
|
#define x264_zigzag_scan_4x4_frame_lsx x264_template(zigzag_scan_4x4_frame_lsx)
|
||||||
|
void x264_zigzag_scan_4x4_frame_lsx( int16_t level[16], int16_t dct[16] );
|
||||||
|
|
||||||
|
#endif
|
||||||
1618
common/loongarch/deblock-a.S
Normal file
1618
common/loongarch/deblock-a.S
Normal file
File diff suppressed because it is too large
Load Diff
54
common/loongarch/deblock.h
Normal file
54
common/loongarch/deblock.h
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* deblock.h: loongarch deblock
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2023-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Hao Chen <chenhao@loongson.cn>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_LOONGARCH_DEBLOCK_H
|
||||||
|
#define X264_LOONGARCH_DEBLOCK_H
|
||||||
|
|
||||||
|
#if !HIGH_BIT_DEPTH
|
||||||
|
#define x264_deblock_v_luma_lasx x264_template(deblock_v_luma_lasx)
|
||||||
|
void x264_deblock_v_luma_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
#define x264_deblock_h_luma_lasx x264_template(deblock_h_luma_lasx)
|
||||||
|
void x264_deblock_h_luma_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
|
||||||
|
#define x264_deblock_v_luma_intra_lsx x264_template(deblock_v_luma_intra_lsx)
|
||||||
|
void x264_deblock_v_luma_intra_lsx( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
#define x264_deblock_h_luma_intra_lsx x264_template(deblock_h_luma_intra_lsx)
|
||||||
|
void x264_deblock_h_luma_intra_lsx( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
|
||||||
|
#define x264_deblock_v_luma_intra_lasx x264_template(deblock_v_luma_intra_lasx)
|
||||||
|
void x264_deblock_v_luma_intra_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
#define x264_deblock_h_luma_intra_lasx x264_template(deblock_h_luma_intra_lasx)
|
||||||
|
void x264_deblock_h_luma_intra_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
#define x264_deblock_strength_lsx x264_template(deblock_strength_lsx)
|
||||||
|
void x264_deblock_strength_lsx( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
|
||||||
|
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
|
||||||
|
int mvy_limit, int bframe );
|
||||||
|
#define x264_deblock_strength_lasx x264_template(deblock_strength_lasx)
|
||||||
|
void x264_deblock_strength_lasx( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
|
||||||
|
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
|
||||||
|
int mvy_limit, int bframe );
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
770
common/loongarch/loongson_asm.S
Normal file
770
common/loongarch/loongson_asm.S
Normal file
@@ -0,0 +1,770 @@
|
|||||||
|
/*********************************************************************
|
||||||
|
* Copyright (c) 2022-2024 Loongson Technology Corporation Limited
|
||||||
|
* Contributed by Xiwei Gu <guxiwei-hf@loongson.cn>
|
||||||
|
* Shiyou Yin <yinshiyou-hf@loongson.cn>
|
||||||
|
*
|
||||||
|
* Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
* purpose with or without fee is hereby granted, provided that the above
|
||||||
|
* copyright notice and this permission notice appear in all copies.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||||
|
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||||
|
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||||
|
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||||
|
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||||
|
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||||
|
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
|
*********************************************************************/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This file is a LoongArch assembly helper file and available under ISC
|
||||||
|
* license. It provides a large number of macros and aliases to simplify
|
||||||
|
* writing assembly code, especially for LSX and LASX optimizations.
|
||||||
|
*
|
||||||
|
* Anyone can modify it or add new features for their own purposes.
|
||||||
|
* Contributing a patch will be appreciated as it might be useful for
|
||||||
|
* others as well. Send patches to the Loongson contributors mentioned above.
|
||||||
|
*
|
||||||
|
* MAJOR version: Usage changes, incompatible with previous version.
|
||||||
|
* MINOR version: Add new macros/functions, or bug fixes.
|
||||||
|
* MICRO version: Comment changes or implementation changes.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define LML_VERSION_MAJOR 0
|
||||||
|
#define LML_VERSION_MINOR 4
|
||||||
|
#define LML_VERSION_MICRO 0
|
||||||
|
|
||||||
|
#define ASM_PREF
|
||||||
|
#define DEFAULT_ALIGN 5
|
||||||
|
|
||||||
|
/*
|
||||||
|
*============================================================================
|
||||||
|
* macros for specific project, set them as needed.
|
||||||
|
* Following LoongML macros for your reference.
|
||||||
|
*============================================================================
|
||||||
|
*/
|
||||||
|
|
||||||
|
.macro function name, align=DEFAULT_ALIGN
|
||||||
|
.macro endfunc
|
||||||
|
jirl $r0, $r1, 0x0
|
||||||
|
.size ASM_PREF\name, . - ASM_PREF\name
|
||||||
|
.purgem endfunc
|
||||||
|
.endm
|
||||||
|
.text ;
|
||||||
|
.align \align ;
|
||||||
|
.globl ASM_PREF\name ;
|
||||||
|
.type ASM_PREF\name, @function ;
|
||||||
|
ASM_PREF\name: ;
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* const name, align: switch to .rodata, align to \align and emit the label
 * \name (not exported with .globl here).  Defines a nested `endconst` macro
 * that records the object size and then purges itself. */
.macro const name, align=DEFAULT_ALIGN
|
||||||
|
.macro endconst
|
||||||
|
.size \name, . - \name
|
||||||
|
.purgem endconst
|
||||||
|
.endm
|
||||||
|
.section .rodata
|
||||||
|
.align \align
|
||||||
|
\name:
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/*
|
||||||
|
*============================================================================
|
||||||
|
* LoongArch register alias
|
||||||
|
*============================================================================
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define a0 $a0
|
||||||
|
#define a1 $a1
|
||||||
|
#define a2 $a2
|
||||||
|
#define a3 $a3
|
||||||
|
#define a4 $a4
|
||||||
|
#define a5 $a5
|
||||||
|
#define a6 $a6
|
||||||
|
#define a7 $a7
|
||||||
|
|
||||||
|
#define t0 $t0
|
||||||
|
#define t1 $t1
|
||||||
|
#define t2 $t2
|
||||||
|
#define t3 $t3
|
||||||
|
#define t4 $t4
|
||||||
|
#define t5 $t5
|
||||||
|
#define t6 $t6
|
||||||
|
#define t7 $t7
|
||||||
|
#define t8 $t8
|
||||||
|
|
||||||
|
#define s0 $s0
|
||||||
|
#define s1 $s1
|
||||||
|
#define s2 $s2
|
||||||
|
#define s3 $s3
|
||||||
|
#define s4 $s4
|
||||||
|
#define s5 $s5
|
||||||
|
#define s6 $s6
|
||||||
|
#define s7 $s7
|
||||||
|
#define s8 $s8
|
||||||
|
|
||||||
|
#define zero $zero
|
||||||
|
#define sp $sp
|
||||||
|
#define ra $ra
|
||||||
|
|
||||||
|
#define fa0 $fa0
|
||||||
|
#define fa1 $fa1
|
||||||
|
#define fa2 $fa2
|
||||||
|
#define fa3 $fa3
|
||||||
|
#define fa4 $fa4
|
||||||
|
#define fa5 $fa5
|
||||||
|
#define fa6 $fa6
|
||||||
|
#define fa7 $fa7
|
||||||
|
#define ft0 $ft0
|
||||||
|
#define ft1 $ft1
|
||||||
|
#define ft2 $ft2
|
||||||
|
#define ft3 $ft3
|
||||||
|
#define ft4 $ft4
|
||||||
|
#define ft5 $ft5
|
||||||
|
#define ft6 $ft6
|
||||||
|
#define ft7 $ft7
|
||||||
|
#define ft8 $ft8
|
||||||
|
#define ft9 $ft9
|
||||||
|
#define ft10 $ft10
|
||||||
|
#define ft11 $ft11
|
||||||
|
#define ft12 $ft12
|
||||||
|
#define ft13 $ft13
|
||||||
|
#define ft14 $ft14
|
||||||
|
#define ft15 $ft15
|
||||||
|
#define fs0 $fs0
|
||||||
|
#define fs1 $fs1
|
||||||
|
#define fs2 $fs2
|
||||||
|
#define fs3 $fs3
|
||||||
|
#define fs4 $fs4
|
||||||
|
#define fs5 $fs5
|
||||||
|
#define fs6 $fs6
|
||||||
|
#define fs7 $fs7
|
||||||
|
|
||||||
|
#define f0 $f0
|
||||||
|
#define f1 $f1
|
||||||
|
#define f2 $f2
|
||||||
|
#define f3 $f3
|
||||||
|
#define f4 $f4
|
||||||
|
#define f5 $f5
|
||||||
|
#define f6 $f6
|
||||||
|
#define f7 $f7
|
||||||
|
#define f8 $f8
|
||||||
|
#define f9 $f9
|
||||||
|
#define f10 $f10
|
||||||
|
#define f11 $f11
|
||||||
|
#define f12 $f12
|
||||||
|
#define f13 $f13
|
||||||
|
#define f14 $f14
|
||||||
|
#define f15 $f15
|
||||||
|
#define f16 $f16
|
||||||
|
#define f17 $f17
|
||||||
|
#define f18 $f18
|
||||||
|
#define f19 $f19
|
||||||
|
#define f20 $f20
|
||||||
|
#define f21 $f21
|
||||||
|
#define f22 $f22
|
||||||
|
#define f23 $f23
|
||||||
|
#define f24 $f24
|
||||||
|
#define f25 $f25
|
||||||
|
#define f26 $f26
|
||||||
|
#define f27 $f27
|
||||||
|
#define f28 $f28
|
||||||
|
#define f29 $f29
|
||||||
|
#define f30 $f30
|
||||||
|
#define f31 $f31
|
||||||
|
|
||||||
|
#define vr0 $vr0
|
||||||
|
#define vr1 $vr1
|
||||||
|
#define vr2 $vr2
|
||||||
|
#define vr3 $vr3
|
||||||
|
#define vr4 $vr4
|
||||||
|
#define vr5 $vr5
|
||||||
|
#define vr6 $vr6
|
||||||
|
#define vr7 $vr7
|
||||||
|
#define vr8 $vr8
|
||||||
|
#define vr9 $vr9
|
||||||
|
#define vr10 $vr10
|
||||||
|
#define vr11 $vr11
|
||||||
|
#define vr12 $vr12
|
||||||
|
#define vr13 $vr13
|
||||||
|
#define vr14 $vr14
|
||||||
|
#define vr15 $vr15
|
||||||
|
#define vr16 $vr16
|
||||||
|
#define vr17 $vr17
|
||||||
|
#define vr18 $vr18
|
||||||
|
#define vr19 $vr19
|
||||||
|
#define vr20 $vr20
|
||||||
|
#define vr21 $vr21
|
||||||
|
#define vr22 $vr22
|
||||||
|
#define vr23 $vr23
|
||||||
|
#define vr24 $vr24
|
||||||
|
#define vr25 $vr25
|
||||||
|
#define vr26 $vr26
|
||||||
|
#define vr27 $vr27
|
||||||
|
#define vr28 $vr28
|
||||||
|
#define vr29 $vr29
|
||||||
|
#define vr30 $vr30
|
||||||
|
#define vr31 $vr31
|
||||||
|
|
||||||
|
#define xr0 $xr0
|
||||||
|
#define xr1 $xr1
|
||||||
|
#define xr2 $xr2
|
||||||
|
#define xr3 $xr3
|
||||||
|
#define xr4 $xr4
|
||||||
|
#define xr5 $xr5
|
||||||
|
#define xr6 $xr6
|
||||||
|
#define xr7 $xr7
|
||||||
|
#define xr8 $xr8
|
||||||
|
#define xr9 $xr9
|
||||||
|
#define xr10 $xr10
|
||||||
|
#define xr11 $xr11
|
||||||
|
#define xr12 $xr12
|
||||||
|
#define xr13 $xr13
|
||||||
|
#define xr14 $xr14
|
||||||
|
#define xr15 $xr15
|
||||||
|
#define xr16 $xr16
|
||||||
|
#define xr17 $xr17
|
||||||
|
#define xr18 $xr18
|
||||||
|
#define xr19 $xr19
|
||||||
|
#define xr20 $xr20
|
||||||
|
#define xr21 $xr21
|
||||||
|
#define xr22 $xr22
|
||||||
|
#define xr23 $xr23
|
||||||
|
#define xr24 $xr24
|
||||||
|
#define xr25 $xr25
|
||||||
|
#define xr26 $xr26
|
||||||
|
#define xr27 $xr27
|
||||||
|
#define xr28 $xr28
|
||||||
|
#define xr29 $xr29
|
||||||
|
#define xr30 $xr30
|
||||||
|
#define xr31 $xr31
|
||||||
|
|
||||||
|
/*
|
||||||
|
*============================================================================
|
||||||
|
* LSX/LASX synthesize instructions
|
||||||
|
*============================================================================
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Description : Dot product of byte vector elements
|
||||||
|
* Arguments : Inputs - vj, vk
|
||||||
|
* Outputs - vd
|
||||||
|
* Return Type - halfword
|
||||||
|
*/
|
||||||
|
/* vd.h[i] = vj.b[2i] * vk.b[2i] + vj.b[2i+1] * vk.b[2i+1], bytes unsigned.
 * vd is written by the first instruction and vj/vk are read again by the
 * second, so vd must not be the same register as vj or vk. */
.macro vdp2.h.bu vd, vj, vk
|
||||||
|
vmulwev.h.bu \vd, \vj, \vk
|
||||||
|
vmaddwod.h.bu \vd, \vj, \vk
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Same pairwise dot product as vdp2.h.bu, using the mixed .bu.b multiplies
 * (vj bytes unsigned, vk bytes signed).  vd must not alias vj or vk: it is
 * overwritten before the second instruction reads them. */
.macro vdp2.h.bu.b vd, vj, vk
|
||||||
|
vmulwev.h.bu.b \vd, \vj, \vk
|
||||||
|
vmaddwod.h.bu.b \vd, \vj, \vk
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Halfword variant: vd.w[i] = vj.h[2i] * vk.h[2i] + vj.h[2i+1] * vk.h[2i+1]
 * (signed).  vd must not alias vj or vk. */
.macro vdp2.w.h vd, vj, vk
|
||||||
|
vmulwev.w.h \vd, \vj, \vk
|
||||||
|
vmaddwod.w.h \vd, \vj, \vk
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* LASX (xv*) form of vdp2.h.bu: even-element products, then odd-element
 * products accumulated into xd.  xd must not alias xj or xk. */
.macro xvdp2.h.bu xd, xj, xk
|
||||||
|
xvmulwev.h.bu \xd, \xj, \xk
|
||||||
|
xvmaddwod.h.bu \xd, \xj, \xk
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* LASX (xv*) form of vdp2.h.bu.b (unsigned xj bytes times signed xk bytes).
 * xd must not alias xj or xk. */
.macro xvdp2.h.bu.b xd, xj, xk
|
||||||
|
xvmulwev.h.bu.b \xd, \xj, \xk
|
||||||
|
xvmaddwod.h.bu.b \xd, \xj, \xk
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* LASX (xv*) form of vdp2.w.h: pairwise halfword products summed into words.
 * xd must not alias xj or xk. */
.macro xvdp2.w.h xd, xj, xk
|
||||||
|
xvmulwev.w.h \xd, \xj, \xk
|
||||||
|
xvmaddwod.w.h \xd, \xj, \xk
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Description : Dot product of vector elements, accumulated into vd
|
||||||
|
* Arguments : Inputs - vj, vk
|
||||||
|
* Outputs - vd
|
||||||
|
* Return Type - twice size of input
|
||||||
|
*/
|
||||||
|
/* vd.h[i] += vj.b[2i] * vk.b[2i] + vj.b[2i+1] * vk.b[2i+1], bytes unsigned.
 * Both instructions are multiply-accumulate, so vd is an input as well as
 * the output. */
.macro vdp2add.h.bu vd, vj, vk
|
||||||
|
vmaddwev.h.bu \vd, \vj, \vk
|
||||||
|
vmaddwod.h.bu \vd, \vj, \vk
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Accumulating dot product with the mixed .bu.b multiplies (vj bytes
 * unsigned, vk bytes signed); vd holds the running sum on entry. */
.macro vdp2add.h.bu.b vd, vj, vk
|
||||||
|
vmaddwev.h.bu.b \vd, \vj, \vk
|
||||||
|
vmaddwod.h.bu.b \vd, \vj, \vk
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* vd.w[i] += vj.h[2i] * vk.h[2i] + vj.h[2i+1] * vk.h[2i+1] (signed);
 * vd holds the running sum on entry. */
.macro vdp2add.w.h vd, vj, vk
|
||||||
|
vmaddwev.w.h \vd, \vj, \vk
|
||||||
|
vmaddwod.w.h \vd, \vj, \vk
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* LASX (xv*) form of vdp2add.h.bu.b; xd is both accumulator input and output. */
.macro xvdp2add.h.bu.b xd, xj, xk
|
||||||
|
xvmaddwev.h.bu.b \xd, \xj, \xk
|
||||||
|
xvmaddwod.h.bu.b \xd, \xj, \xk
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* LASX (xv*) form of vdp2add.w.h; xd is both accumulator input and output. */
.macro xvdp2add.w.h xd, xj, xk
|
||||||
|
xvmaddwev.w.h \xd, \xj, \xk
|
||||||
|
xvmaddwod.w.h \xd, \xj, \xk
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Description : Range element vj[i] to vk[i] ~ va[i]
|
||||||
|
* clip: vj > vk ? vj : vk && vj < va ? vj : va
|
||||||
|
*/
|
||||||
|
/* vd = min(max(vj, vk), va) per halfword: vk is the lower bound, va the
 * upper bound.  vd is written before va is read, so vd must not alias va. */
.macro vclip.h vd, vj, vk, va
|
||||||
|
vmax.h \vd, \vj, \vk
|
||||||
|
vmin.h \vd, \vd, \va
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Word variant of vclip.h: vd = min(max(vj, vk), va).  vd must not alias va. */
.macro vclip.w vd, vj, vk, va
|
||||||
|
vmax.w \vd, \vj, \vk
|
||||||
|
vmin.w \vd, \vd, \va
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* LASX (xv*) halfword clip: xd = min(max(xj, xk), xa).  xd must not alias xa. */
.macro xvclip.h xd, xj, xk, xa
|
||||||
|
xvmax.h \xd, \xj, \xk
|
||||||
|
xvmin.h \xd, \xd, \xa
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* LASX (xv*) word clip: xd = min(max(xj, xk), xa).  xd must not alias xa. */
.macro xvclip.w xd, xj, xk, xa
|
||||||
|
xvmax.w \xd, \xj, \xk
|
||||||
|
xvmin.w \xd, \xd, \xa
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Description : Range element vj[i] to 0 ~ 255
|
||||||
|
* clip255: vj < 255 ? vj : 255 && vj > 0 ? vj : 0
|
||||||
|
*/
|
||||||
|
/* Clamp each halfword of vj to 0 ~ 255: vmaxi.h raises negative values to 0,
 * then vsat.hu with immediate 7 applies the upper limit. */
.macro vclip255.h vd, vj
|
||||||
|
vmaxi.h \vd, \vj, 0
|
||||||
|
vsat.hu \vd, \vd, 7
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Word variant of vclip255.h: floor at 0, then unsigned saturate with immediate 7. */
.macro vclip255.w vd, vj
|
||||||
|
vmaxi.w \vd, \vj, 0
|
||||||
|
vsat.wu \vd, \vd, 7
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* LASX (xv*) form of vclip255.h: clamp each halfword of xj to 0 ~ 255. */
.macro xvclip255.h xd, xj
|
||||||
|
xvmaxi.h \xd, \xj, 0
|
||||||
|
xvsat.hu \xd, \xd, 7
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* LASX (xv*) form of vclip255.w: clamp each word of xj to 0 ~ 255. */
.macro xvclip255.w xd, xj
|
||||||
|
xvmaxi.w \xd, \xj, 0
|
||||||
|
xvsat.wu \xd, \xd, 7
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Description : Store elements of vector
|
||||||
|
* vd : Data vector to be stored
|
||||||
|
* rk : Address of data storage
|
||||||
|
* ra : Offset of address
|
||||||
|
* si : Index of data in vd
|
||||||
|
*/
|
||||||
|
/* Store byte element \si of vd at (rk + ra).  NOTE: rk itself is advanced by
 * ra before the store, so the caller's address register is modified. */
.macro vstelmx.b vd, rk, ra, si
|
||||||
|
add.d \rk, \rk, \ra
|
||||||
|
vstelm.b \vd, \rk, 0, \si
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Store halfword element \si of vd at (rk + ra).  rk is updated to rk + ra. */
.macro vstelmx.h vd, rk, ra, si
|
||||||
|
add.d \rk, \rk, \ra
|
||||||
|
vstelm.h \vd, \rk, 0, \si
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Store word element \si of vd at (rk + ra).  rk is updated to rk + ra. */
.macro vstelmx.w vd, rk, ra, si
|
||||||
|
add.d \rk, \rk, \ra
|
||||||
|
vstelm.w \vd, \rk, 0, \si
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Store doubleword element \si of vd at (rk + ra).  rk is updated to rk + ra. */
.macro vstelmx.d vd, rk, ra, si
|
||||||
|
add.d \rk, \rk, \ra
|
||||||
|
vstelm.d \vd, \rk, 0, \si
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Register copy xd = xj, done as a bitwise OR of xj with itself.  The
 * parameters are named xd/xj but the instruction is the LSX vor.v. */
.macro vmov xd, xj
|
||||||
|
vor.v \xd, \xj, \xj
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* LASX register copy xd = xj, done as a bitwise OR of xj with itself. */
.macro xmov xd, xj
|
||||||
|
xvor.v \xd, \xj, \xj
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* LASX form of vstelmx.d: store doubleword element \si of xd at (rk + ra).
 * rk is advanced by ra before the store, so the address register is modified. */
.macro xvstelmx.d xd, rk, ra, si
|
||||||
|
add.d \rk, \rk, \ra
|
||||||
|
xvstelm.d \xd, \rk, 0, \si
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/*
|
||||||
|
*============================================================================
|
||||||
|
* LSX/LASX custom macros
|
||||||
|
*============================================================================
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Load 4 float, double, V128 or V256 elements with stride.
|
||||||
|
*/
|
||||||
|
/* Load four single-precision values from src, src+stride, src+stride2 and
 * src+stride3.  The three offsets are passed straight to fldx.s as index
 * operands; the macro does not derive stride2/stride3 from stride, and src
 * is left unchanged. */
.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
|
||||||
|
fld.s \out0, \src, 0
|
||||||
|
fldx.s \out1, \src, \stride
|
||||||
|
fldx.s \out2, \src, \stride2
|
||||||
|
fldx.s \out3, \src, \stride3
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Double-precision variant of FLDS_LOADX_4: loads from src, src+stride,
 * src+stride2 and src+stride3.  The caller supplies all three offsets. */
.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
|
||||||
|
fld.d \out0, \src, 0
|
||||||
|
fldx.d \out1, \src, \stride
|
||||||
|
fldx.d \out2, \src, \stride2
|
||||||
|
fldx.d \out3, \src, \stride3
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* LSX vector variant: four vld/vldx loads from src, src+stride, src+stride2
 * and src+stride3.  The caller supplies all three offsets. */
.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
|
||||||
|
vld \out0, \src, 0
|
||||||
|
vldx \out1, \src, \stride
|
||||||
|
vldx \out2, \src, \stride2
|
||||||
|
vldx \out3, \src, \stride3
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* LASX vector variant: four xvld/xvldx loads from src, src+stride,
 * src+stride2 and src+stride3.  The caller supplies all three offsets. */
.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
|
||||||
|
xvld \out0, \src, 0
|
||||||
|
xvldx \out1, \src, \stride
|
||||||
|
xvldx \out2, \src, \stride2
|
||||||
|
xvldx \out3, \src, \stride3
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Description : Transpose 4x4 block with half-word elements in vectors
|
||||||
|
* Arguments : Inputs - in0, in1, in2, in3
|
||||||
|
* Outputs - out0, out1, out2, out3
|
||||||
|
*/
|
||||||
|
/* Only the low-half interleave (vilvl.h) is applied to in0..in3, i.e. the low
 * four halfwords of each input form the 4x4 block.  tmp0 and tmp1 are
 * clobbered and must differ from in2/in3, which are read after tmp0 is
 * written. */
.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
|
||||||
|
tmp0, tmp1
|
||||||
|
vilvl.h \tmp0, \in1, \in0
|
||||||
|
vilvl.h \tmp1, \in3, \in2
|
||||||
|
vilvl.w \out0, \tmp1, \tmp0
|
||||||
|
vilvh.w \out2, \tmp1, \tmp0
|
||||||
|
/* out1 and out3 are taken from the high doublewords of out0 and out2. */
vilvh.d \out1, \out0, \out0
|
||||||
|
vilvh.d \out3, \out0, \out2
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Description : Transpose 4x4 block with word elements in vectors
|
||||||
|
* Arguments : Inputs - in0, in1, in2, in3
|
||||||
|
* Outputs - out0, out1, out2, out3
|
||||||
|
* Details :
|
||||||
|
* Example :
|
||||||
|
* 1, 2, 3, 4 1, 5, 9,13
|
||||||
|
* 5, 6, 7, 8 to 2, 6,10,14
|
||||||
|
* 9,10,11,12 =====> 3, 7,11,15
|
||||||
|
* 13,14,15,16 4, 8,12,16
|
||||||
|
*/
|
||||||
|
/* out1 and out3 double as temporaries in the first stage.  out1 is written
 * before in2/in3 are read, so out1 must not be the same register as in2 or
 * in3; tmp0/tmp1 are clobbered. */
.macro LSX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
|
||||||
|
tmp0, tmp1
|
||||||
|
|
||||||
|
/* Stage 1: interleave words of row pairs (in0,in1) and (in2,in3). */
vilvl.w \tmp0, \in1, \in0
|
||||||
|
vilvh.w \out1, \in1, \in0
|
||||||
|
vilvl.w \tmp1, \in3, \in2
|
||||||
|
vilvh.w \out3, \in3, \in2
|
||||||
|
|
||||||
|
/* Stage 2: combine doublewords.  out3 and out1 are overwritten only after
 * their stage-1 contents have been consumed, so the order below matters. */
vilvl.d \out0, \tmp1, \tmp0
|
||||||
|
vilvl.d \out2, \out3, \out1
|
||||||
|
vilvh.d \out3, \out3, \out1
|
||||||
|
vilvh.d \out1, \tmp1, \tmp0
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Description : Transpose 8x8 block with half-word elements in vectors
|
||||||
|
* Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
|
||||||
|
* Outputs - out0, out1, out2, out3, out4, out5, out6, out7
|
||||||
|
*/
|
||||||
|
/* All eight tmp registers are clobbered.  tmp0..tmp3 are written while the
 * inputs are still being read, so they must not alias any in register. */
.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
|
||||||
|
out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
|
||||||
|
tmp3, tmp4, tmp5, tmp6, tmp7
|
||||||
|
/* Low halves of the inputs: interleave rows two apart (4/6, 5/7, 0/2, 1/3). */
vilvl.h \tmp0, \in6, \in4
|
||||||
|
vilvl.h \tmp1, \in7, \in5
|
||||||
|
vilvl.h \tmp2, \in2, \in0
|
||||||
|
vilvl.h \tmp3, \in3, \in1
|
||||||
|
|
||||||
|
vilvl.h \tmp4, \tmp1, \tmp0
|
||||||
|
vilvh.h \tmp5, \tmp1, \tmp0
|
||||||
|
vilvl.h \tmp6, \tmp3, \tmp2
|
||||||
|
vilvh.h \tmp7, \tmp3, \tmp2
|
||||||
|
|
||||||
|
/* High halves of the inputs, reusing tmp0..tmp3.  This is the last read of
 * in0..in7; the first out register is only written after this group. */
vilvh.h \tmp0, \in6, \in4
|
||||||
|
vilvh.h \tmp1, \in7, \in5
|
||||||
|
vilvh.h \tmp2, \in2, \in0
|
||||||
|
vilvh.h \tmp3, \in3, \in1
|
||||||
|
|
||||||
|
/* out0..out3 are produced from the low-half results held in tmp4..tmp7. */
vpickev.d \out0, \tmp4, \tmp6
|
||||||
|
vpickod.d \out1, \tmp4, \tmp6
|
||||||
|
vpickev.d \out2, \tmp5, \tmp7
|
||||||
|
vpickod.d \out3, \tmp5, \tmp7
|
||||||
|
|
||||||
|
/* Repeat the second interleave for the high-half data, reusing tmp4..tmp7. */
vilvl.h \tmp4, \tmp1, \tmp0
|
||||||
|
vilvh.h \tmp5, \tmp1, \tmp0
|
||||||
|
vilvl.h \tmp6, \tmp3, \tmp2
|
||||||
|
vilvh.h \tmp7, \tmp3, \tmp2
|
||||||
|
|
||||||
|
vpickev.d \out4, \tmp4, \tmp6
|
||||||
|
vpickod.d \out5, \tmp4, \tmp6
|
||||||
|
vpickev.d \out6, \tmp5, \tmp7
|
||||||
|
vpickod.d \out7, \tmp5, \tmp7
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Description : Transpose 16x8 block with byte elements in vectors
|
||||||
|
* Arguments : Inputs - in0, in1, in2, ..., in15 (16 input vectors)
|
||||||
|
* Outputs - out0, out1, out2, out3, out4, out5, out6, out7
|
||||||
|
*/
|
||||||
|
/* Takes sixteen input vectors (in0..in15) and produces eight outputs.  Only
 * the low-half byte interleave (xvilvl.b) is applied to the inputs.
 * tmp0..tmp7 are clobbered and are written while inputs are still being
 * read, so they must not alias any in register. */
.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \
|
||||||
|
in8, in9, in10, in11, in12, in13, in14, in15, \
|
||||||
|
out0, out1, out2, out3, out4, out5, out6, out7,\
|
||||||
|
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
|
||||||
|
/* Stage 1: byte interleave of rows two apart (0/2, 1/3, 4/6, 5/7, ...). */
xvilvl.b \tmp0, \in2, \in0
|
||||||
|
xvilvl.b \tmp1, \in3, \in1
|
||||||
|
xvilvl.b \tmp2, \in6, \in4
|
||||||
|
xvilvl.b \tmp3, \in7, \in5
|
||||||
|
xvilvl.b \tmp4, \in10, \in8
|
||||||
|
xvilvl.b \tmp5, \in11, \in9
|
||||||
|
xvilvl.b \tmp6, \in14, \in12
|
||||||
|
xvilvl.b \tmp7, \in15, \in13
|
||||||
|
/* Stage 2: second byte interleave; the inputs are no longer read from here on. */
xvilvl.b \out0, \tmp1, \tmp0
|
||||||
|
xvilvh.b \out1, \tmp1, \tmp0
|
||||||
|
xvilvl.b \out2, \tmp3, \tmp2
|
||||||
|
xvilvh.b \out3, \tmp3, \tmp2
|
||||||
|
xvilvl.b \out4, \tmp5, \tmp4
|
||||||
|
xvilvh.b \out5, \tmp5, \tmp4
|
||||||
|
xvilvl.b \out6, \tmp7, \tmp6
|
||||||
|
xvilvh.b \out7, \tmp7, \tmp6
|
||||||
|
/* Stage 3: word interleave back into the tmp registers. */
xvilvl.w \tmp0, \out2, \out0
|
||||||
|
xvilvh.w \tmp2, \out2, \out0
|
||||||
|
xvilvl.w \tmp4, \out3, \out1
|
||||||
|
xvilvh.w \tmp6, \out3, \out1
|
||||||
|
xvilvl.w \tmp1, \out6, \out4
|
||||||
|
xvilvh.w \tmp3, \out6, \out4
|
||||||
|
xvilvl.w \tmp5, \out7, \out5
|
||||||
|
xvilvh.w \tmp7, \out7, \out5
|
||||||
|
/* Stage 4: doubleword interleave produces the final outputs. */
xvilvl.d \out0, \tmp1, \tmp0
|
||||||
|
xvilvh.d \out1, \tmp1, \tmp0
|
||||||
|
xvilvl.d \out2, \tmp3, \tmp2
|
||||||
|
xvilvh.d \out3, \tmp3, \tmp2
|
||||||
|
xvilvl.d \out4, \tmp5, \tmp4
|
||||||
|
xvilvh.d \out5, \tmp5, \tmp4
|
||||||
|
xvilvl.d \out6, \tmp7, \tmp6
|
||||||
|
xvilvh.d \out7, \tmp7, \tmp6
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/*
 * Description : Transpose 4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1
 * Details     : The rows are interleaved half-word-wise and then word-wise
 *               into out0 and out2; out1 and out3 are then built with
 *               xvilvh.d from the upper double-words of out0 and out2.
 */
.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    /* Half-word interleave of rows 0/1 and rows 2/3. */
    xvilvl.h    \tmp0,  \in1,   \in0
    xvilvl.h    \tmp1,  \in3,   \in2

    /* Word interleave of the two partial results. */
    xvilvl.w    \out0,  \tmp1,  \tmp0
    xvilvh.w    \out2,  \tmp1,  \tmp0

    /* out1 depends on out0 only; out3 reads both out0 and out2. */
    xvilvh.d    \out1,  \out0,  \out0
    xvilvh.d    \out3,  \out0,  \out2
.endm
|
||||||
|
|
||||||
|
/*
 * Description : Transpose 4x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1
 * Details     : out2 and out3 serve as intermediates after the two
 *               half-word interleave passes and are overwritten last,
 *               once out0 and out1 have been derived from out2.
 */
.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    /* Pass 1: pair rows 0/2 and rows 1/3. */
    xvilvl.h    \tmp0,  \in2,   \in0
    xvilvl.h    \tmp1,  \in3,   \in1

    /* Pass 2: interleave the pairs into the out2/out3 intermediates. */
    xvilvl.h    \out2,  \tmp1,  \tmp0
    xvilvh.h    \out3,  \tmp1,  \tmp0

    /* Split each intermediate into its low and high double-word parts.
     * out2 must be consumed (out0, out1) before it is rewritten. */
    xvilvl.d    \out0,  \out2,  \out2
    xvilvh.d    \out1,  \out2,  \out2
    xvilvl.d    \out2,  \out3,  \out3
    xvilvh.d    \out3,  \out3,  \out3
.endm
|
||||||
|
|
||||||
|
/*
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *               Scratch - tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
 * Details     : Same instruction pattern as LSX_TRANSPOSE8x8_H, using the
 *               xv* (LASX) forms.  All inputs are read before the first
 *               output register is written.
 */
.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7,         \
                           out0, out1, out2, out3, out4, out5, out6, out7, \
                           tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
    /* Low halves, pass 1: pair rows 4/6, 5/7, 0/2 and 1/3. */
    xvilvl.h    \tmp0,  \in6,   \in4
    xvilvl.h    \tmp1,  \in7,   \in5
    xvilvl.h    \tmp2,  \in2,   \in0
    xvilvl.h    \tmp3,  \in3,   \in1

    /* Low halves, pass 2. */
    xvilvl.h    \tmp4,  \tmp1,  \tmp0
    xvilvh.h    \tmp5,  \tmp1,  \tmp0
    xvilvl.h    \tmp6,  \tmp3,  \tmp2
    xvilvh.h    \tmp7,  \tmp3,  \tmp2

    /* High halves, pass 1 (last read of the inputs). */
    xvilvh.h    \tmp0,  \in6,   \in4
    xvilvh.h    \tmp1,  \in7,   \in5
    xvilvh.h    \tmp2,  \in2,   \in0
    xvilvh.h    \tmp3,  \in3,   \in1

    /* Combine rows 0-3 with rows 4-7: out0..out3. */
    xvpickev.d  \out0,  \tmp4,  \tmp6
    xvpickod.d  \out1,  \tmp4,  \tmp6
    xvpickev.d  \out2,  \tmp5,  \tmp7
    xvpickod.d  \out3,  \tmp5,  \tmp7

    /* High halves, pass 2 (tmp4..tmp7 are reused). */
    xvilvl.h    \tmp4,  \tmp1,  \tmp0
    xvilvh.h    \tmp5,  \tmp1,  \tmp0
    xvilvl.h    \tmp6,  \tmp3,  \tmp2
    xvilvh.h    \tmp7,  \tmp3,  \tmp2

    /* Combine again: out4..out7. */
    xvpickev.d  \out4,  \tmp4,  \tmp6
    xvpickod.d  \out5,  \tmp4,  \tmp6
    xvpickev.d  \out6,  \tmp5,  \tmp7
    xvpickod.d  \out7,  \tmp5,  \tmp7
.endm
|
||||||
|
|
||||||
|
/*
 * Description : Transpose 2x4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1, tmp2
 * Details     : out1, out2 and out3 are used as intermediates and are
 *               rewritten by the final double-word interleaves, so the
 *               statement order below must be kept.
 */
.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                             tmp0, tmp1, tmp2
    /* Half-word interleave of rows 0/1 and rows 2/3, high and low parts. */
    xvilvh.h    \tmp1,  \in0,   \in1
    xvilvl.h    \out1,  \in0,   \in1
    xvilvh.h    \tmp0,  \in2,   \in3
    xvilvl.h    \out3,  \in2,   \in3

    /* Word interleave of the low parts; out3 is read, then replaced. */
    xvilvh.w    \tmp2,  \out3,  \out1
    xvilvl.w    \out3,  \out3,  \out1

    /* Word interleave of the high parts; tmp1 is read, then replaced. */
    xvilvl.w    \out2,  \tmp0,  \tmp1
    xvilvh.w    \tmp1,  \tmp0,  \tmp1

    /* Double-word interleave producing the four results. */
    xvilvh.d    \out0,  \out2,  \out3
    xvilvl.d    \out2,  \out2,  \out3
    xvilvh.d    \out1,  \tmp1,  \tmp2
    xvilvl.d    \out3,  \tmp1,  \tmp2
.endm
|
||||||
|
|
||||||
|
/*
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1
 * Details     : out1 and out3 hold intermediate values until the last
 *               two instructions, so the statement order must be kept.
 * Example     :
 *    1, 2, 3, 4,  1, 2, 3, 4         1,5, 9,13,  1,5, 9,13
 *    5, 6, 7, 8,  5, 6, 7, 8   to    2,6,10,14,  2,6,10,14
 *    9,10,11,12,  9,10,11,12 =====>  3,7,11,15,  3,7,11,15
 *   13,14,15,16, 13,14,15,16         4,8,12,16,  4,8,12,16
 */
.macro LASX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1

    /* Word interleave of rows 0/1 and rows 2/3. */
    xvilvl.w    \tmp0,  \in1,   \in0
    xvilvh.w    \out1,  \in1,   \in0
    xvilvl.w    \tmp1,  \in3,   \in2
    xvilvh.w    \out3,  \in3,   \in2

    /* Double-word interleave; out3 and out1 are overwritten only after
     * their intermediate contents have been consumed. */
    xvilvl.d    \out0,  \tmp1,  \tmp0
    xvilvl.d    \out2,  \out3,  \out1
    xvilvh.d    \out3,  \out3,  \out1
    xvilvh.d    \out1,  \tmp1,  \tmp0
.endm
|
||||||
|
|
||||||
|
/*
 * Description : Transpose 8x8 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *               Scratch - tmp0, tmp1, tmp2, tmp3
 * Details     : Rows 0-3 and rows 4-7 are first interleaved separately;
 *               xvpermi.q then exchanges 128-bit halves between the two
 *               groups.  Uses the xmov macro, which is defined outside
 *               this block.
 * Example     : LASX_TRANSPOSE8x8_W
 *        in0  : 1,2,3,4,5,6,7,8
 *        in1  : 2,2,3,4,5,6,7,8
 *        in2  : 3,2,3,4,5,6,7,8
 *        in3  : 4,2,3,4,5,6,7,8
 *        in4  : 5,2,3,4,5,6,7,8
 *        in5  : 6,2,3,4,5,6,7,8
 *        in6  : 7,2,3,4,5,6,7,8
 *        in7  : 8,2,3,4,5,6,7,8
 *
 *        out0 : 1,2,3,4,5,6,7,8
 *        out1 : 2,2,2,2,2,2,2,2
 *        out2 : 3,3,3,3,3,3,3,3
 *        out3 : 4,4,4,4,4,4,4,4
 *        out4 : 5,5,5,5,5,5,5,5
 *        out5 : 6,6,6,6,6,6,6,6
 *        out6 : 7,7,7,7,7,7,7,7
 *        out7 : 8,8,8,8,8,8,8,8
 */
.macro LASX_TRANSPOSE8x8_W in0, in1, in2, in3, in4, in5, in6, in7,         \
                           out0, out1, out2, out3, out4, out5, out6, out7, \
                           tmp0, tmp1, tmp2, tmp3
    /* Rows 0-3: two word interleave passes into out0..out3. */
    xvilvl.w    \tmp0,  \in2,   \in0
    xvilvl.w    \tmp1,  \in3,   \in1
    xvilvh.w    \tmp2,  \in2,   \in0
    xvilvh.w    \tmp3,  \in3,   \in1
    xvilvl.w    \out0,  \tmp1,  \tmp0
    xvilvh.w    \out1,  \tmp1,  \tmp0
    xvilvl.w    \out2,  \tmp3,  \tmp2
    xvilvh.w    \out3,  \tmp3,  \tmp2

    /* Rows 4-7: the same pattern into out4..out7. */
    xvilvl.w    \tmp0,  \in6,   \in4
    xvilvl.w    \tmp1,  \in7,   \in5
    xvilvh.w    \tmp2,  \in6,   \in4
    xvilvh.w    \tmp3,  \in7,   \in5
    xvilvl.w    \out4,  \tmp1,  \tmp0
    xvilvh.w    \out5,  \tmp1,  \tmp0
    xvilvl.w    \out6,  \tmp3,  \tmp2
    xvilvh.w    \out7,  \tmp3,  \tmp2

    /* Keep copies of out0..out3: they are modified next but are still
     * needed to complete out4..out7. */
    xmov        \tmp0,  \out0
    xmov        \tmp1,  \out1
    xmov        \tmp2,  \out2
    xmov        \tmp3,  \out3

    /* Merge 128-bit halves of the two groups (selector 0x02 for the first
     * four results, 0x31 for the last four). */
    xvpermi.q   \out0,  \out4,  0x02
    xvpermi.q   \out1,  \out5,  0x02
    xvpermi.q   \out2,  \out6,  0x02
    xvpermi.q   \out3,  \out7,  0x02
    xvpermi.q   \out4,  \tmp0,  0x31
    xvpermi.q   \out5,  \tmp1,  0x31
    xvpermi.q   \out6,  \tmp2,  0x31
    xvpermi.q   \out7,  \tmp3,  0x31
.endm
|
||||||
|
|
||||||
|
/*
 * Description : Transpose 4x4 block with double-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1
 * Details     : Double-word interleaves pair rows 0/1 and rows 2/3, then
 *               xvpermi.q exchanges 128-bit halves between the pairs.
 *               out1 and out2 carry intermediates, so order matters.
 * Example     : LASX_TRANSPOSE4x4_D
 *        in0  : 1,2,3,4
 *        in1  : 1,2,3,4
 *        in2  : 1,2,3,4
 *        in3  : 1,2,3,4
 *
 *        out0 : 1,1,1,1
 *        out1 : 2,2,2,2
 *        out2 : 3,3,3,3
 *        out3 : 4,4,4,4
 */
.macro LASX_TRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    /* Interleave rows 0/1 and rows 2/3. */
    xvilvl.d    \tmp0,  \in1,   \in0
    xvilvh.d    \out1,  \in1,   \in0
    xvilvh.d    \tmp1,  \in3,   \in2
    xvilvl.d    \out2,  \in3,   \in2

    /* Register copies (OR of a register with itself). */
    xvor.v      \out0,  \tmp0,  \tmp0
    xvor.v      \out3,  \tmp1,  \tmp1

    /* Exchange 128-bit halves.  out2 is read for out0 before it is
     * rewritten, and out1 is read for out3 before it is rewritten. */
    xvpermi.q   \out0,  \out2,  0x02
    xvpermi.q   \out2,  \tmp0,  0x31
    xvpermi.q   \out3,  \out1,  0x31
    xvpermi.q   \out1,  \tmp1,  0x02
.endm
|
||||||
47
common/loongarch/loongson_util.S
Normal file
47
common/loongarch/loongson_util.S
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* loongson_util.S: loongson utility macros
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2023-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Shiyou Yin <yinshiyou-hf@loongson.cn>
|
||||||
|
* Xiwei Gu <guxiwei-hf@loongson.cn>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#define GLUE(a, b) a ## b
|
||||||
|
#define JOIN(a, b) GLUE(a, b)
|
||||||
|
|
||||||
|
/* Set prefix as needed. */
|
||||||
|
#define ASM_REF JOIN(JOIN(x264_, BIT_DEPTH), _)
|
||||||
|
|
||||||
|
#define FENC_STRIDE 16
|
||||||
|
#define FDEC_STRIDE 32
|
||||||
|
|
||||||
|
/*
 * function_x264 name, align=DEFAULT_ALIGN
 *
 * Opens an exported function: selects .text, applies the requested
 * alignment and emits the global, function-typed label ASM_REF<name>.
 * It also defines a one-shot endfunc_x264 macro for that function, which
 * emits the jirl through $r1, records the symbol size, and then purges
 * itself so that the next function_x264 can define it again.
 */
.macro function_x264 name, align=DEFAULT_ALIGN
    .macro endfunc_x264
        jirl    $r0, $r1, 0x0
        .size   ASM_REF\name, . - ASM_REF\name
        .purgem endfunc_x264
    .endm
    .text
    .align  \align
    .globl  ASM_REF\name
    .type   ASM_REF\name, @function
ASM_REF\name:
.endm
|
||||||
2702
common/loongarch/mc-a.S
Normal file
2702
common/loongarch/mc-a.S
Normal file
File diff suppressed because it is too large
Load Diff
406
common/loongarch/mc-c.c
Normal file
406
common/loongarch/mc-c.c
Normal file
@@ -0,0 +1,406 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* mc-c.c: loongarch motion compensation
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2023-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Xiwei Gu <guxiwei-hf@loongson.cn>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common/common.h"
|
||||||
|
#include "mc.h"
|
||||||
|
|
||||||
|
#if !HIGH_BIT_DEPTH
|
||||||
|
|
||||||
|
/* Defines mc<func>_wtab_lsx, a table of LSX weighted-prediction kernels.
 * In this file such tables are installed as x264_weight_t::weightfn and
 * indexed by (width >> 2).  Slots 0 and 3 have no kernel of their own and
 * reuse the w4 and w16 versions. */
#define MC_WEIGHT_LSX(func)                                                       \
static void (* mc##func##_wtab_lsx[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, \
                                        const x264_weight_t *, int ) =            \
{                                                                                 \
    [0] = x264_mc_weight_w4##func##_lsx,                                          \
    [1] = x264_mc_weight_w4##func##_lsx,                                          \
    [2] = x264_mc_weight_w8##func##_lsx,                                          \
    [3] = x264_mc_weight_w16##func##_lsx,                                         \
    [4] = x264_mc_weight_w16##func##_lsx,                                         \
    [5] = x264_mc_weight_w20##func##_lsx,                                         \
};
|
||||||
|
|
||||||
|
/* Defines mc<func>_wtab_lasx, the LASX counterpart of the LSX weight table:
 * weighted-prediction kernels indexed by (width >> 2), with slots 0 and 3
 * reusing the w4 and w16 versions. */
#define MC_WEIGHT(func)                                                            \
static void (* mc##func##_wtab_lasx[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, \
                                         const x264_weight_t *, int ) =            \
{                                                                                  \
    [0] = x264_mc_weight_w4##func##_lasx,                                          \
    [1] = x264_mc_weight_w4##func##_lasx,                                          \
    [2] = x264_mc_weight_w8##func##_lasx,                                          \
    [3] = x264_mc_weight_w16##func##_lasx,                                         \
    [4] = x264_mc_weight_w16##func##_lasx,                                         \
    [5] = x264_mc_weight_w20##func##_lasx,                                         \
};
|
||||||
|
|
||||||
|
#if !HIGH_BIT_DEPTH
|
||||||
|
MC_WEIGHT_LSX()
|
||||||
|
MC_WEIGHT_LSX(_noden)
|
||||||
|
MC_WEIGHT()
|
||||||
|
MC_WEIGHT(_noden)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Selects the LSX weight kernel table for w: the regular table when
 * w->i_denom is at least 1, otherwise the "noden" table.
 * h is not used. */
static void weight_cache_lsx( x264_t *h, x264_weight_t *w )
{
    w->weightfn = ( w->i_denom >= 1 ) ? mc_wtab_lsx : mc_noden_wtab_lsx;
}
|
||||||
|
|
||||||
|
static weight_fn_t mc_weight_wtab_lsx[6] =
|
||||||
|
{
|
||||||
|
x264_mc_weight_w4_lsx,
|
||||||
|
x264_mc_weight_w4_lsx,
|
||||||
|
x264_mc_weight_w8_lsx,
|
||||||
|
x264_mc_weight_w16_lsx,
|
||||||
|
x264_mc_weight_w16_lsx,
|
||||||
|
x264_mc_weight_w20_lsx,
|
||||||
|
};
|
||||||
|
|
||||||
|
static void (* const pixel_avg_wtab_lsx[6])(uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
|
||||||
|
{
|
||||||
|
NULL,
|
||||||
|
x264_pixel_avg2_w4_lsx,
|
||||||
|
x264_pixel_avg2_w8_lsx,
|
||||||
|
x264_pixel_avg2_w16_lsx,
|
||||||
|
x264_pixel_avg2_w16_lsx,
|
||||||
|
x264_pixel_avg2_w20_lsx,
|
||||||
|
};
|
||||||
|
|
||||||
|
static void (* const mc_copy_wtab_lsx[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) =
|
||||||
|
{
|
||||||
|
NULL,
|
||||||
|
x264_mc_copy_w4_lsx,
|
||||||
|
x264_mc_copy_w8_lsx,
|
||||||
|
NULL,
|
||||||
|
x264_mc_copy_w16_lsx,
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Selects the LASX weight kernel table for w: the regular table when
 * w->i_denom is at least 1, otherwise the "noden" table.
 * h is not used. */
static void weight_cache_lasx( x264_t *h, x264_weight_t *w )
{
    w->weightfn = ( w->i_denom >= 1 ) ? mc_wtab_lasx : mc_noden_wtab_lasx;
}
|
||||||
|
|
||||||
|
static weight_fn_t mc_weight_wtab_lasx[6] =
|
||||||
|
{
|
||||||
|
x264_mc_weight_w4_lasx,
|
||||||
|
x264_mc_weight_w4_lasx,
|
||||||
|
x264_mc_weight_w8_lasx,
|
||||||
|
x264_mc_weight_w16_lasx,
|
||||||
|
x264_mc_weight_w16_lasx,
|
||||||
|
x264_mc_weight_w20_lasx,
|
||||||
|
};
|
||||||
|
|
||||||
|
static void (* const pixel_avg_wtab_lasx[6])(uint8_t *, intptr_t, uint8_t *,
|
||||||
|
intptr_t, uint8_t *, int ) =
|
||||||
|
{
|
||||||
|
NULL,
|
||||||
|
x264_pixel_avg2_w4_lasx,
|
||||||
|
x264_pixel_avg2_w8_lasx,
|
||||||
|
x264_pixel_avg2_w16_lasx,
|
||||||
|
x264_pixel_avg2_w16_lasx,
|
||||||
|
x264_pixel_avg2_w20_lasx,
|
||||||
|
};
|
||||||
|
|
||||||
|
static void (* const mc_copy_wtab_lasx[5])( uint8_t *, intptr_t, uint8_t *,
|
||||||
|
intptr_t, int ) =
|
||||||
|
{
|
||||||
|
NULL,
|
||||||
|
x264_mc_copy_w4_lasx,
|
||||||
|
x264_mc_copy_w8_lasx,
|
||||||
|
NULL,
|
||||||
|
x264_mc_copy_w16_lasx,
|
||||||
|
};
|
||||||
|
|
||||||
|
static uint8_t *get_ref_lsx( uint8_t *p_dst, intptr_t *p_dst_stride,
|
||||||
|
uint8_t *p_src[4], intptr_t i_src_stride,
|
||||||
|
int32_t m_vx, int32_t m_vy,
|
||||||
|
int32_t i_width, int32_t i_height,
|
||||||
|
const x264_weight_t *pWeight )
|
||||||
|
{
|
||||||
|
int32_t i_qpel_idx;
|
||||||
|
int32_t i_offset;
|
||||||
|
uint8_t *p_src1;
|
||||||
|
int32_t r_vy = m_vy & 3;
|
||||||
|
int32_t r_vx = m_vx & 3;
|
||||||
|
int32_t width = i_width >> 2;
|
||||||
|
|
||||||
|
i_qpel_idx = ( r_vy << 2 ) + r_vx;
|
||||||
|
i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
|
||||||
|
p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset +
|
||||||
|
( 3 == r_vy ) * i_src_stride;
|
||||||
|
|
||||||
|
if( i_qpel_idx & 5 )
|
||||||
|
{
|
||||||
|
uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] +
|
||||||
|
i_offset + ( 3 == r_vx );
|
||||||
|
pixel_avg_wtab_lsx[width](
|
||||||
|
p_dst, *p_dst_stride, p_src1, i_src_stride,
|
||||||
|
p_src2, i_height );
|
||||||
|
|
||||||
|
if( pWeight->weightfn )
|
||||||
|
{
|
||||||
|
pWeight->weightfn[width](p_dst, *p_dst_stride, p_dst, *p_dst_stride, pWeight, i_height);
|
||||||
|
}
|
||||||
|
return p_dst;
|
||||||
|
}
|
||||||
|
else if ( pWeight->weightfn )
|
||||||
|
{
|
||||||
|
pWeight->weightfn[width]( p_dst, *p_dst_stride, p_src1, i_src_stride, pWeight, i_height );
|
||||||
|
return p_dst;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
*p_dst_stride = i_src_stride;
|
||||||
|
return p_src1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void mc_luma_lsx( uint8_t *p_dst, intptr_t i_dst_stride,
|
||||||
|
uint8_t *p_src[4], intptr_t i_src_stride,
|
||||||
|
int32_t m_vx, int32_t m_vy,
|
||||||
|
int32_t i_width, int32_t i_height,
|
||||||
|
const x264_weight_t *pWeight )
|
||||||
|
{
|
||||||
|
int32_t i_qpel_idx;
|
||||||
|
int32_t i_offset;
|
||||||
|
uint8_t *p_src1;
|
||||||
|
|
||||||
|
i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 );
|
||||||
|
i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
|
||||||
|
p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset +
|
||||||
|
( 3 == ( m_vy & 3 ) ) * i_src_stride;
|
||||||
|
|
||||||
|
if( i_qpel_idx & 5 )
|
||||||
|
{
|
||||||
|
uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] +
|
||||||
|
i_offset + ( 3 == ( m_vx & 3 ) );
|
||||||
|
|
||||||
|
pixel_avg_wtab_lsx[i_width >> 2](
|
||||||
|
p_dst, i_dst_stride, p_src1, i_src_stride,
|
||||||
|
p_src2, i_height );
|
||||||
|
|
||||||
|
if( pWeight->weightfn )
|
||||||
|
{
|
||||||
|
pWeight->weightfn[i_width>>2]( p_dst, i_dst_stride, p_dst, i_dst_stride, pWeight, i_height );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if( pWeight->weightfn )
|
||||||
|
{
|
||||||
|
pWeight->weightfn[i_width>>2]( p_dst, i_dst_stride, p_src1, i_src_stride, pWeight, i_height );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
mc_copy_wtab_lsx[i_width>>2]( p_dst, i_dst_stride, p_src1, i_src_stride, i_height );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
PLANE_INTERLEAVE(lsx)
|
||||||
|
PLANE_COPY_YUYV(32, lsx)
|
||||||
|
|
||||||
|
#define x264_mc_chroma_lsx x264_template(mc_chroma_lsx)
|
||||||
|
void x264_mc_chroma_lsx( uint8_t *p_dst_u, uint8_t *p_dst_v,
|
||||||
|
intptr_t i_dst_stride,
|
||||||
|
uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
int32_t m_vx, int32_t m_vy,
|
||||||
|
int32_t i_width, int32_t i_height );
|
||||||
|
|
||||||
|
static uint8_t *get_ref_lasx( uint8_t *p_dst, intptr_t *p_dst_stride,
|
||||||
|
uint8_t *p_src[4], intptr_t i_src_stride,
|
||||||
|
int32_t m_vx, int32_t m_vy,
|
||||||
|
int32_t i_width, int32_t i_height,
|
||||||
|
const x264_weight_t *pWeight )
|
||||||
|
{
|
||||||
|
int32_t i_qpel_idx;
|
||||||
|
int32_t i_offset;
|
||||||
|
uint8_t *p_src1;
|
||||||
|
int32_t r_vy = m_vy & 3;
|
||||||
|
int32_t r_vx = m_vx & 3;
|
||||||
|
int32_t width = i_width >> 2;
|
||||||
|
|
||||||
|
i_qpel_idx = ( r_vy << 2 ) + r_vx;
|
||||||
|
i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
|
||||||
|
p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset +
|
||||||
|
( 3 == r_vy ) * i_src_stride;
|
||||||
|
|
||||||
|
if( i_qpel_idx & 5 )
|
||||||
|
{
|
||||||
|
uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] +
|
||||||
|
i_offset + ( 3 == r_vx );
|
||||||
|
pixel_avg_wtab_lasx[width](
|
||||||
|
p_dst, *p_dst_stride, p_src1, i_src_stride,
|
||||||
|
p_src2, i_height );
|
||||||
|
|
||||||
|
if( pWeight->weightfn )
|
||||||
|
{
|
||||||
|
pWeight->weightfn[width](p_dst, *p_dst_stride, p_dst, *p_dst_stride, pWeight, i_height);
|
||||||
|
}
|
||||||
|
return p_dst;
|
||||||
|
}
|
||||||
|
else if ( pWeight->weightfn )
|
||||||
|
{
|
||||||
|
pWeight->weightfn[width]( p_dst, *p_dst_stride, p_src1, i_src_stride, pWeight, i_height );
|
||||||
|
return p_dst;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
*p_dst_stride = i_src_stride;
|
||||||
|
return p_src1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void mc_luma_lasx( uint8_t *p_dst, intptr_t i_dst_stride,
|
||||||
|
uint8_t *p_src[4], intptr_t i_src_stride,
|
||||||
|
int32_t m_vx, int32_t m_vy,
|
||||||
|
int32_t i_width, int32_t i_height,
|
||||||
|
const x264_weight_t *pWeight )
|
||||||
|
{
|
||||||
|
int32_t i_qpel_idx;
|
||||||
|
int32_t i_offset;
|
||||||
|
uint8_t *p_src1;
|
||||||
|
|
||||||
|
i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 );
|
||||||
|
i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
|
||||||
|
p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset +
|
||||||
|
( 3 == ( m_vy & 3 ) ) * i_src_stride;
|
||||||
|
|
||||||
|
if( i_qpel_idx & 5 )
|
||||||
|
{
|
||||||
|
uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] +
|
||||||
|
i_offset + ( 3 == ( m_vx & 3 ) );
|
||||||
|
|
||||||
|
pixel_avg_wtab_lasx[i_width >> 2](
|
||||||
|
p_dst, i_dst_stride, p_src1, i_src_stride,
|
||||||
|
p_src2, i_height );
|
||||||
|
|
||||||
|
if( pWeight->weightfn )
|
||||||
|
{
|
||||||
|
pWeight->weightfn[i_width>>2]( p_dst, i_dst_stride, p_dst, i_dst_stride, pWeight, i_height );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if( pWeight->weightfn )
|
||||||
|
{
|
||||||
|
pWeight->weightfn[i_width>>2]( p_dst, i_dst_stride, p_src1, i_src_stride, pWeight, i_height );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
mc_copy_wtab_lasx[i_width>>2]( p_dst, i_dst_stride, p_src1, i_src_stride, i_height );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
PLANE_COPY_YUYV(64, lasx)
|
||||||
|
|
||||||
|
#define x264_mc_chroma_lasx x264_template(mc_chroma_lasx)
|
||||||
|
void x264_mc_chroma_lasx( uint8_t *p_dst_u, uint8_t *p_dst_v,
|
||||||
|
intptr_t i_dst_stride,
|
||||||
|
uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
int32_t m_vx, int32_t m_vy,
|
||||||
|
int32_t i_width, int32_t i_height );
|
||||||
|
#endif // !HIGH_BIT_DEPTH
|
||||||
|
|
||||||
|
/* Installs the LoongArch motion-compensation kernels into pf according to
 * the capability bits in cpu.  Nothing is installed in HIGH_BIT_DEPTH
 * builds.
 *
 * The LSX block runs first and the LASX block second, so when both flags
 * are set the LASX pointers replace the LSX ones.  Entries the LASX block
 * does not assign (for example avg[PIXEL_16x16], memcpy_aligned, the chroma
 * interleave helpers and the prefetch functions) keep their LSX version. */
void x264_mc_init_loongarch( int32_t cpu, x264_mc_functions_t *pf )
{
#if !HIGH_BIT_DEPTH
    if( cpu & X264_CPU_LSX )
    {
        /* Luma/chroma prediction and reference fetch. */
        pf->get_ref   = get_ref_lsx;
        pf->mc_luma   = mc_luma_lsx;
        pf->mc_chroma = x264_mc_chroma_lsx;

        /* Weighted prediction. */
        pf->weight_cache = weight_cache_lsx;
        pf->weight       = mc_weight_wtab_lsx;
        pf->offsetadd    = mc_weight_wtab_lsx;
        pf->offsetsub    = mc_weight_wtab_lsx;

        /* Bi-directional averaging. */
        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_lsx;
        pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_lsx;
        pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_lsx;
        pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_lsx;
        pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_lsx;
        pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_lsx;
        pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_lsx;
        pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_lsx;
        pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_lsx;

        /* Block copies. */
        pf->copy[PIXEL_16x16]    = x264_mc_copy_w16_lsx;
        pf->copy[PIXEL_8x8]      = x264_mc_copy_w8_lsx;
        pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_lsx;
        pf->copy_16x16_unaligned = x264_mc_copy_w16_lsx;

        /* Chroma interleave helpers. */
        pf->store_interleave_chroma       = x264_store_interleave_chroma_lsx;
        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_lsx;
        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_lsx;

        /* Plane copies. */
        pf->plane_copy_interleave        = plane_copy_interleave_lsx;
        pf->plane_copy_deinterleave      = x264_plane_copy_deinterleave_lsx;
        pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_lsx;

        /* Filtering, memory helpers and lowres initialisation. */
        pf->hpel_filter            = x264_hpel_filter_lsx;
        pf->memcpy_aligned         = x264_memcpy_aligned_lsx;
        pf->memzero_aligned        = x264_memzero_aligned_lsx;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_lsx;

        /* Prefetch. */
        pf->prefetch_fenc_420 = x264_prefetch_fenc_420_lsx;
        pf->prefetch_fenc_422 = x264_prefetch_fenc_422_lsx;
        pf->prefetch_ref      = x264_prefetch_ref_lsx;
    }

    if( cpu & X264_CPU_LASX )
    {
        /* Luma/chroma prediction and reference fetch. */
        pf->get_ref   = get_ref_lasx;
        pf->mc_luma   = mc_luma_lasx;
        pf->mc_chroma = x264_mc_chroma_lasx;

        /* Weighted prediction. */
        pf->weight_cache = weight_cache_lasx;
        pf->weight       = mc_weight_wtab_lasx;
        pf->offsetadd    = mc_weight_wtab_lasx;
        pf->offsetsub    = mc_weight_wtab_lasx;

        /* Bi-directional averaging (16x16 is not assigned here). */
        pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_lasx;
        pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_lasx;
        pf->avg[PIXEL_8x8]  = x264_pixel_avg_8x8_lasx;
        pf->avg[PIXEL_8x4]  = x264_pixel_avg_8x4_lasx;
        pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_lasx;
        pf->avg[PIXEL_4x8]  = x264_pixel_avg_4x8_lasx;
        pf->avg[PIXEL_4x4]  = x264_pixel_avg_4x4_lasx;
        pf->avg[PIXEL_4x2]  = x264_pixel_avg_4x2_lasx;

        /* Block copies. */
        pf->copy[PIXEL_16x16]    = x264_mc_copy_w16_lasx;
        pf->copy[PIXEL_8x8]      = x264_mc_copy_w8_lasx;
        pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_lasx;
        pf->copy_16x16_unaligned = x264_mc_copy_w16_lasx;

        /* Plane copies. */
        pf->plane_copy_deinterleave      = x264_plane_copy_deinterleave_lasx;
        pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_lasx;

        /* Filtering, memory helpers and lowres initialisation. */
        pf->hpel_filter            = x264_hpel_filter_lasx;
        pf->memzero_aligned        = x264_memzero_aligned_lasx;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_lasx;
    }
#endif // !HIGH_BIT_DEPTH
}
|
||||||
196
common/loongarch/mc.h
Normal file
196
common/loongarch/mc.h
Normal file
@@ -0,0 +1,196 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* mc.h: loongarch motion compensation
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2023-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Xiwei Gu <guxiwei-hf@loongson.cn>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_LOONGARCH_MC_H
|
||||||
|
#define X264_LOONGARCH_MC_H
|
||||||
|
|
||||||
|
#define x264_mc_init_loongarch x264_template(mc_init_loongarch)
|
||||||
|
void x264_mc_init_loongarch( int cpu, x264_mc_functions_t *pf );
|
||||||
|
|
||||||
|
#define x264_pixel_avg_16x16_lsx x264_template(pixel_avg_16x16_lsx)
|
||||||
|
void x264_pixel_avg_16x16_lsx( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_16x8_lsx x264_template(pixel_avg_16x8_lsx)
|
||||||
|
void x264_pixel_avg_16x8_lsx( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_8x16_lsx x264_template(pixel_avg_8x16_lsx)
|
||||||
|
void x264_pixel_avg_8x16_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_8x8_lsx x264_template(pixel_avg_8x8_lsx)
|
||||||
|
void x264_pixel_avg_8x8_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_8x4_lsx x264_template(pixel_avg_8x4_lsx)
|
||||||
|
void x264_pixel_avg_8x4_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_4x16_lsx x264_template(pixel_avg_4x16_lsx)
|
||||||
|
void x264_pixel_avg_4x16_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_4x8_lsx x264_template(pixel_avg_4x8_lsx)
|
||||||
|
void x264_pixel_avg_4x8_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_4x4_lsx x264_template(pixel_avg_4x4_lsx)
|
||||||
|
void x264_pixel_avg_4x4_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_4x2_lsx x264_template(pixel_avg_4x2_lsx)
|
||||||
|
void x264_pixel_avg_4x2_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
|
||||||
|
#define x264_pixel_avg2_w4_lsx x264_template(pixel_avg2_w4_lsx)
|
||||||
|
void x264_pixel_avg2_w4_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||||
|
#define x264_pixel_avg2_w8_lsx x264_template(pixel_avg2_w8_lsx)
|
||||||
|
void x264_pixel_avg2_w8_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||||
|
#define x264_pixel_avg2_w16_lsx x264_template(pixel_avg2_w16_lsx)
|
||||||
|
void x264_pixel_avg2_w16_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||||
|
#define x264_pixel_avg2_w20_lsx x264_template(pixel_avg2_w20_lsx)
|
||||||
|
void x264_pixel_avg2_w20_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||||
|
|
||||||
|
#define x264_mc_weight_w20_lsx x264_template(mc_weight_w20_lsx)
|
||||||
|
void x264_mc_weight_w20_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
|
||||||
|
#define x264_mc_weight_w20_noden_lsx x264_template(mc_weight_w20_noden_lsx)
|
||||||
|
void x264_mc_weight_w20_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
|
||||||
|
#define x264_mc_weight_w16_lsx x264_template(mc_weight_w16_lsx)
|
||||||
|
void x264_mc_weight_w16_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
|
||||||
|
#define x264_mc_weight_w16_noden_lsx x264_template(mc_weight_w16_noden_lsx)
|
||||||
|
void x264_mc_weight_w16_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
|
||||||
|
#define x264_mc_weight_w8_lsx x264_template(mc_weight_w8_lsx)
|
||||||
|
void x264_mc_weight_w8_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
|
||||||
|
#define x264_mc_weight_w8_noden_lsx x264_template(mc_weight_w8_noden_lsx)
|
||||||
|
void x264_mc_weight_w8_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
|
||||||
|
#define x264_mc_weight_w4_lsx x264_template(mc_weight_w4_lsx)
|
||||||
|
void x264_mc_weight_w4_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
|
||||||
|
#define x264_mc_weight_w4_noden_lsx x264_template(mc_weight_w4_noden_lsx)
|
||||||
|
void x264_mc_weight_w4_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
|
||||||
|
|
||||||
|
#define x264_mc_copy_w16_lsx x264_template(mc_copy_w16_lsx)
|
||||||
|
void x264_mc_copy_w16_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_mc_copy_w8_lsx x264_template(mc_copy_w8_lsx)
|
||||||
|
void x264_mc_copy_w8_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_mc_copy_w4_lsx x264_template(mc_copy_w4_lsx)
|
||||||
|
void x264_mc_copy_w4_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
|
||||||
|
#define x264_store_interleave_chroma_lsx x264_template(store_interleave_chroma_lsx)
|
||||||
|
void x264_store_interleave_chroma_lsx( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
|
||||||
|
#define x264_load_deinterleave_chroma_fenc_lsx x264_template(load_deinterleave_chroma_fenc_lsx)
|
||||||
|
void x264_load_deinterleave_chroma_fenc_lsx( pixel *dst, pixel *src, intptr_t i_src, int height );
|
||||||
|
#define x264_load_deinterleave_chroma_fdec_lsx x264_template(load_deinterleave_chroma_fdec_lsx)
|
||||||
|
void x264_load_deinterleave_chroma_fdec_lsx( pixel *dst, pixel *src, intptr_t i_src, int height );
|
||||||
|
|
||||||
|
#define x264_plane_copy_interleave_core_lsx x264_template(plane_copy_interleave_core_lsx)
|
||||||
|
void x264_plane_copy_interleave_core_lsx( pixel *dst, intptr_t i_dst,
|
||||||
|
pixel *srcu, intptr_t i_srcu,
|
||||||
|
pixel *srcv, intptr_t i_srcv, int w, int h );
|
||||||
|
#define x264_plane_copy_deinterleave_lsx x264_template(plane_copy_deinterleave_lsx)
|
||||||
|
void x264_plane_copy_deinterleave_lsx( pixel *dstu, intptr_t i_dstu,
|
||||||
|
pixel *dstv, intptr_t i_dstv,
|
||||||
|
pixel *src, intptr_t i_src, int w, int h );
|
||||||
|
|
||||||
|
#define x264_plane_copy_deinterleave_lasx x264_template(plane_copy_deinterleave_lasx)
|
||||||
|
void x264_plane_copy_deinterleave_lasx( pixel *dstu, intptr_t i_dstu,
|
||||||
|
pixel *dstv, intptr_t i_dstv,
|
||||||
|
pixel *src, intptr_t i_src, int w, int h );
|
||||||
|
|
||||||
|
#define x264_prefetch_fenc_420_lsx x264_template(prefetch_fenc_420_lsx)
|
||||||
|
void x264_prefetch_fenc_420_lsx( uint8_t *pix_y, intptr_t stride_y,
|
||||||
|
uint8_t *pix_uv, intptr_t stride_uv,
|
||||||
|
int32_t mb_x );
|
||||||
|
#define x264_prefetch_fenc_422_lsx x264_template(prefetch_fenc_422_lsx)
|
||||||
|
void x264_prefetch_fenc_422_lsx( uint8_t *pix_y, intptr_t stride_y,
|
||||||
|
uint8_t *pix_uv, intptr_t stride_uv,
|
||||||
|
int32_t mb_x );
|
||||||
|
#define x264_prefetch_ref_lsx x264_template(prefetch_ref_lsx)
|
||||||
|
void x264_prefetch_ref_lsx( uint8_t *pix, intptr_t stride, int32_t parity );
|
||||||
|
|
||||||
|
#define x264_memcpy_aligned_lsx x264_template(memcpy_aligned_lsx)
|
||||||
|
void *x264_memcpy_aligned_lsx( void *dst, const void *src, size_t n );
|
||||||
|
#define x264_memzero_aligned_lsx x264_template(memzero_aligned_lsx)
|
||||||
|
void x264_memzero_aligned_lsx( void *p_dst, size_t n );
|
||||||
|
|
||||||
|
#define x264_hpel_filter_lsx x264_template(hpel_filter_lsx)
|
||||||
|
void x264_hpel_filter_lsx( pixel *, pixel *, pixel *, pixel *, intptr_t, int, int, int16_t * );
|
||||||
|
#define x264_frame_init_lowres_core_lsx x264_template(frame_init_lowres_core_lsx)
|
||||||
|
void x264_frame_init_lowres_core_lsx( uint8_t *, uint8_t *, uint8_t *, uint8_t *,
|
||||||
|
uint8_t *, intptr_t, intptr_t, int, int );
|
||||||
|
|
||||||
|
#define x264_pixel_avg_16x8_lasx x264_template(pixel_avg_16x8_lasx)
|
||||||
|
void x264_pixel_avg_16x8_lasx( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_8x16_lasx x264_template(pixel_avg_8x16_lasx)
|
||||||
|
void x264_pixel_avg_8x16_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_8x8_lasx x264_template(pixel_avg_8x8_lasx)
|
||||||
|
void x264_pixel_avg_8x8_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_8x4_lasx x264_template(pixel_avg_8x4_lasx)
|
||||||
|
void x264_pixel_avg_8x4_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_4x16_lasx x264_template(pixel_avg_4x16_lasx)
|
||||||
|
void x264_pixel_avg_4x16_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_4x8_lasx x264_template(pixel_avg_4x8_lasx)
|
||||||
|
void x264_pixel_avg_4x8_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_4x4_lasx x264_template(pixel_avg_4x4_lasx)
|
||||||
|
void x264_pixel_avg_4x4_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_pixel_avg_4x2_lasx x264_template(pixel_avg_4x2_lasx)
|
||||||
|
void x264_pixel_avg_4x2_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
|
||||||
|
#define x264_pixel_avg2_w4_lasx x264_template(pixel_avg2_w4_lasx)
|
||||||
|
void x264_pixel_avg2_w4_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||||
|
#define x264_pixel_avg2_w8_lasx x264_template(pixel_avg2_w8_lasx)
|
||||||
|
void x264_pixel_avg2_w8_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||||
|
#define x264_pixel_avg2_w16_lasx x264_template(pixel_avg2_w16_lasx)
|
||||||
|
void x264_pixel_avg2_w16_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||||
|
#define x264_pixel_avg2_w20_lasx x264_template(pixel_avg2_w20_lasx)
|
||||||
|
void x264_pixel_avg2_w20_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||||
|
|
||||||
|
#define x264_mc_weight_w20_lasx x264_template(mc_weight_w20_lasx)
|
||||||
|
void x264_mc_weight_w20_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
|
||||||
|
#define x264_mc_weight_w20_noden_lasx x264_template(mc_weight_w20_noden_lasx)
|
||||||
|
void x264_mc_weight_w20_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
|
||||||
|
#define x264_mc_weight_w16_lasx x264_template(mc_weight_w16_lasx)
|
||||||
|
void x264_mc_weight_w16_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
|
||||||
|
#define x264_mc_weight_w16_noden_lasx x264_template(mc_weight_w16_noden_lasx)
|
||||||
|
void x264_mc_weight_w16_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
|
||||||
|
#define x264_mc_weight_w8_lasx x264_template(mc_weight_w8_lasx)
|
||||||
|
void x264_mc_weight_w8_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
|
||||||
|
#define x264_mc_weight_w8_noden_lasx x264_template(mc_weight_w8_noden_lasx)
|
||||||
|
void x264_mc_weight_w8_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
|
||||||
|
#define x264_mc_weight_w4_lasx x264_template(mc_weight_w4_lasx)
|
||||||
|
void x264_mc_weight_w4_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
|
||||||
|
#define x264_mc_weight_w4_noden_lasx x264_template(mc_weight_w4_noden_lasx)
|
||||||
|
void x264_mc_weight_w4_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
|
||||||
|
|
||||||
|
#define x264_mc_copy_w16_lasx x264_template(mc_copy_w16_lasx)
|
||||||
|
void x264_mc_copy_w16_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_mc_copy_w8_lasx x264_template(mc_copy_w8_lasx)
|
||||||
|
void x264_mc_copy_w8_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
#define x264_mc_copy_w4_lasx x264_template(mc_copy_w4_lasx)
|
||||||
|
void x264_mc_copy_w4_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||||
|
|
||||||
|
#define x264_plane_copy_interleave_core_lasx x264_template(plane_copy_interleave_core_lasx)
|
||||||
|
void x264_plane_copy_interleave_core_lasx( pixel *dst, intptr_t i_dst,
|
||||||
|
pixel *srcu, intptr_t i_srcu,
|
||||||
|
pixel *srcv, intptr_t i_srcv, int w, int h );
|
||||||
|
|
||||||
|
#define x264_plane_copy_deinterleave_lasx x264_template(plane_copy_deinterleave_lasx)
|
||||||
|
void x264_plane_copy_deinterleave_lasx( pixel *dstu, intptr_t i_dstu,
|
||||||
|
pixel *dstv, intptr_t i_dstv,
|
||||||
|
pixel *src, intptr_t i_src, int w, int h );
|
||||||
|
|
||||||
|
#define x264_memzero_aligned_lasx x264_template(memzero_aligned_lasx)
|
||||||
|
void x264_memzero_aligned_lasx( void *p_dst, size_t n );
|
||||||
|
|
||||||
|
#define x264_hpel_filter_lasx x264_template(hpel_filter_lasx)
|
||||||
|
void x264_hpel_filter_lasx( pixel *, pixel *, pixel *, pixel *, intptr_t, int, int, int16_t * );
|
||||||
|
#define x264_frame_init_lowres_core_lasx x264_template(frame_init_lowres_core_lasx)
|
||||||
|
void x264_frame_init_lowres_core_lasx( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *,
|
||||||
|
intptr_t, intptr_t, int, int );
|
||||||
|
|
||||||
|
#endif
|
||||||
3548
common/loongarch/pixel-a.S
Normal file
3548
common/loongarch/pixel-a.S
Normal file
File diff suppressed because it is too large
Load Diff
259
common/loongarch/pixel-c.c
Normal file
259
common/loongarch/pixel-c.c
Normal file
@@ -0,0 +1,259 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* pixel-c.c: loongarch pixel metrics
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2023-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Hecai Yuan <yuanhecai@loongson.cn>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common/common.h"
|
||||||
|
#include "pixel.h"
|
||||||
|
#include "predict.h"
|
||||||
|
|
||||||
|
#if !HIGH_BIT_DEPTH
|
||||||
|
|
||||||
|
/* Hadamard AC metric for a single 8x8 block (LSX).
 *
 * Calls the x264_hadamard_ac_8x8_lsx kernel once and repacks its 64-bit
 * result: the upper 32-bit word is shifted right by 2 (and kept in the
 * upper word), the lower 32-bit word is shifted right by 1.
 * NOTE(review): the two words presumably hold two independent accumulated
 * sums produced by the kernel - confirm against the assembly. */
uint64_t x264_pixel_hadamard_ac_8x8_lsx( uint8_t *p_pix, intptr_t i_stride )
{
    const uint64_t packed = x264_hadamard_ac_8x8_lsx( p_pix, i_stride );
    const uint64_t hi = ( packed >> 34 ) << 32;   /* upper word / 4 */
    const uint32_t lo = ( uint32_t ) packed >> 1; /* lower word / 2 */

    return hi + lo;
}
|
||||||
|
|
||||||
|
/* Hadamard AC metric for an 8-wide, 16-tall block (LSX).
 *
 * Adds the raw kernel results of the top 8x8 block and of the 8x8 block
 * eight rows below it, then repacks the 64-bit total: upper 32-bit word
 * shifted right by 2, lower 32-bit word shifted right by 1. */
uint64_t x264_pixel_hadamard_ac_8x16_lsx( uint8_t *p_pix, intptr_t i_stride )
{
    uint8_t *p_lower = p_pix + 8 * i_stride;
    uint64_t acc;

    acc  = x264_hadamard_ac_8x8_lsx( p_pix, i_stride );
    acc += x264_hadamard_ac_8x8_lsx( p_lower, i_stride );

    const uint64_t hi = ( acc >> 34 ) << 32;
    const uint32_t lo = ( uint32_t ) acc >> 1;

    return hi + lo;
}
|
||||||
|
|
||||||
|
/* Hadamard AC metric for a 16-wide, 8-tall block (LSX).
 *
 * Adds the raw kernel results of the left 8x8 block and of the 8x8 block
 * eight bytes to its right, then repacks the 64-bit total: upper 32-bit
 * word shifted right by 2, lower 32-bit word shifted right by 1. */
uint64_t x264_pixel_hadamard_ac_16x8_lsx( uint8_t *p_pix, intptr_t i_stride )
{
    uint8_t *p_right = p_pix + 8;
    uint64_t acc;

    acc  = x264_hadamard_ac_8x8_lsx( p_pix, i_stride );
    acc += x264_hadamard_ac_8x8_lsx( p_right, i_stride );

    const uint64_t hi = ( acc >> 34 ) << 32;
    const uint32_t lo = ( uint32_t ) acc >> 1;

    return hi + lo;
}
|
||||||
|
|
||||||
|
/* Hadamard AC metric for a 16x16 block (LSX).
 *
 * Accumulates the raw kernel result of each of the four 8x8 quadrants,
 * visited in the order top-left, top-right, bottom-left, bottom-right,
 * then repacks the 64-bit total: upper 32-bit word shifted right by 2,
 * lower 32-bit word shifted right by 1. */
uint64_t x264_pixel_hadamard_ac_16x16_lsx( uint8_t *p_pix, intptr_t i_stride )
{
    uint64_t acc = 0;

    for( int y = 0; y < 2; y++ )
    {
        /* Start of the current band of eight rows. */
        uint8_t *p_row = p_pix + y * 8 * i_stride;

        for( int x = 0; x < 2; x++ )
            acc += x264_hadamard_ac_8x8_lsx( p_row + 8 * x, i_stride );
    }

    return ( ( acc >> 34 ) << 32 ) + ( ( uint32_t ) acc >> 1 );
}
|
||||||
|
|
||||||
|
/* Hadamard AC metric for a single 8x8 block (LASX).
 *
 * Calls the x264_hadamard_ac_8x8_lasx kernel once and repacks its 64-bit
 * result: the upper 32-bit word is shifted right by 2 (and kept in the
 * upper word), the lower 32-bit word is shifted right by 1.
 * NOTE(review): the two words presumably hold two independent accumulated
 * sums produced by the kernel - confirm against the assembly. */
uint64_t x264_pixel_hadamard_ac_8x8_lasx( uint8_t *p_pix, intptr_t i_stride )
{
    const uint64_t packed = x264_hadamard_ac_8x8_lasx( p_pix, i_stride );
    const uint64_t hi = ( packed >> 34 ) << 32;   /* upper word / 4 */
    const uint32_t lo = ( uint32_t ) packed >> 1; /* lower word / 2 */

    return hi + lo;
}
|
||||||
|
|
||||||
|
/* Hadamard AC metric for an 8-wide, 16-tall block (LASX).
 *
 * Adds the raw kernel results of the top 8x8 block and of the 8x8 block
 * eight rows below it, then repacks the 64-bit total: upper 32-bit word
 * shifted right by 2, lower 32-bit word shifted right by 1.
 *
 * The row offset is computed as 8 * i_stride, as in the LSX variant,
 * rather than by left-shifting the signed stride: a left shift of a
 * negative signed value is undefined behaviour in C, a multiplication
 * is not. */
uint64_t x264_pixel_hadamard_ac_8x16_lasx( uint8_t *p_pix, intptr_t i_stride )
{
    uint64_t u_sum;

    u_sum = x264_hadamard_ac_8x8_lasx( p_pix, i_stride );
    u_sum += x264_hadamard_ac_8x8_lasx( p_pix + 8 * i_stride, i_stride );

    return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 );
}
|
||||||
|
|
||||||
|
void x264_intra_sa8d_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36],
|
||||||
|
int32_t p_sad_array[3] )
|
||||||
|
{
|
||||||
|
ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] );
|
||||||
|
|
||||||
|
x264_predict_8x8_v_lsx( pix, p_edge );
|
||||||
|
p_sad_array[0] = x264_pixel_sa8d_8x8_lsx( pix, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_8x8_h_lsx( pix, p_edge );
|
||||||
|
p_sad_array[1] = x264_pixel_sa8d_8x8_lsx( pix, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_8x8_dc_lsx( pix, p_edge );
|
||||||
|
p_sad_array[2] = x264_pixel_sa8d_8x8_lsx( pix, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_sa8d_x3_8x8_lasx( uint8_t *p_enc, uint8_t p_edge[36],
|
||||||
|
int32_t p_sad_array[3] )
|
||||||
|
{
|
||||||
|
ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] );
|
||||||
|
|
||||||
|
x264_predict_8x8_v_lsx( pix, p_edge );
|
||||||
|
p_sad_array[0] = x264_pixel_sa8d_8x8_lasx( pix, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_8x8_h_lasx( pix, p_edge );
|
||||||
|
p_sad_array[1] = x264_pixel_sa8d_8x8_lasx( pix, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_8x8_dc_lsx( pix, p_edge );
|
||||||
|
p_sad_array[2] = x264_pixel_sa8d_8x8_lasx( pix, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_satd_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] )
|
||||||
|
{
|
||||||
|
x264_predict_4x4_v_lsx( p_dec );
|
||||||
|
p_sad_array[0] = x264_pixel_satd_4x4_lsx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_4x4_h_lsx( p_dec );
|
||||||
|
p_sad_array[1] = x264_pixel_satd_4x4_lsx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_4x4_dc_lsx( p_dec );
|
||||||
|
p_sad_array[2] = x264_pixel_satd_4x4_lsx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_satd_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] )
|
||||||
|
{
|
||||||
|
x264_predict_16x16_v_lsx( p_dec );
|
||||||
|
p_sad_array[0] = x264_pixel_satd_16x16_lsx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_16x16_h_lsx( p_dec );
|
||||||
|
p_sad_array[1] = x264_pixel_satd_16x16_lsx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_16x16_dc_lsx( p_dec );
|
||||||
|
p_sad_array[2] = x264_pixel_satd_16x16_lsx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_satd_x3_16x16_lasx( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] )
|
||||||
|
{
|
||||||
|
x264_predict_16x16_v_lsx( p_dec );
|
||||||
|
p_sad_array[0] = x264_pixel_satd_16x16_lasx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_16x16_h_lsx( p_dec );
|
||||||
|
p_sad_array[1] = x264_pixel_satd_16x16_lasx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_16x16_dc_lsx( p_dec );
|
||||||
|
p_sad_array[2] = x264_pixel_satd_16x16_lasx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_satd_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] )
|
||||||
|
{
|
||||||
|
x264_predict_8x8c_dc_lsx( p_dec );
|
||||||
|
p_sad_array[0] = x264_pixel_satd_8x8_lsx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_8x8c_h_lsx( p_dec );
|
||||||
|
p_sad_array[1] = x264_pixel_satd_8x8_lsx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_8x8c_v_lsx( p_dec );
|
||||||
|
p_sad_array[2] = x264_pixel_satd_8x8_lsx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_sad_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] )
|
||||||
|
{
|
||||||
|
x264_predict_4x4_v_lsx( p_dec );
|
||||||
|
p_sad_array[0] = x264_pixel_sad_4x4_lsx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_4x4_h_lsx( p_dec );
|
||||||
|
p_sad_array[1] = x264_pixel_sad_4x4_lsx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_4x4_dc_lsx( p_dec );
|
||||||
|
p_sad_array[2] = x264_pixel_sad_4x4_lsx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_sad_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] )
|
||||||
|
{
|
||||||
|
x264_predict_16x16_v_lsx( p_dec );
|
||||||
|
p_sad_array[0] = x264_pixel_sad_16x16_lsx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_16x16_h_lsx( p_dec );
|
||||||
|
p_sad_array[1] = x264_pixel_sad_16x16_lsx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_16x16_dc_lsx( p_dec );
|
||||||
|
p_sad_array[2] = x264_pixel_sad_16x16_lsx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_sad_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36],
|
||||||
|
int32_t p_sad_array[3] )
|
||||||
|
{
|
||||||
|
ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] );
|
||||||
|
|
||||||
|
x264_predict_8x8_v_lsx( pix, p_edge );
|
||||||
|
p_sad_array[0] = x264_pixel_sad_8x8_lsx( pix, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_8x8_h_lsx( pix, p_edge );
|
||||||
|
p_sad_array[1] = x264_pixel_sad_8x8_lsx( pix, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_8x8_dc_lsx( pix, p_edge );
|
||||||
|
p_sad_array[2] = x264_pixel_sad_8x8_lsx( pix, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_sad_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] )
|
||||||
|
{
|
||||||
|
x264_predict_8x8c_dc_lsx( p_dec );
|
||||||
|
p_sad_array[0] = x264_pixel_sad_8x8_lsx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_8x8c_h_lsx( p_dec );
|
||||||
|
p_sad_array[1] = x264_pixel_sad_8x8_lsx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
|
||||||
|
x264_predict_8x8c_v_lsx( p_dec );
|
||||||
|
p_sad_array[2] = x264_pixel_sad_8x8_lsx( p_dec, FDEC_STRIDE,
|
||||||
|
p_enc, FENC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
||||||
335
common/loongarch/pixel.h
Normal file
335
common/loongarch/pixel.h
Normal file
@@ -0,0 +1,335 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* pixel.h: loongarch pixel metrics
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2023-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Lu Wang <wanglu@loongson.cn>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_LOONGARCH_PIXEL_H
|
||||||
|
#define X264_LOONGARCH_PIXEL_H
|
||||||
|
|
||||||
|
#define x264_pixel_satd_4x4_lsx x264_template(pixel_satd_4x4_lsx)
|
||||||
|
int32_t x264_pixel_satd_4x4_lsx( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_4x8_lsx x264_template(pixel_satd_4x8_lsx)
|
||||||
|
int32_t x264_pixel_satd_4x8_lsx( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_4x16_lsx x264_template(pixel_satd_4x16_lsx)
|
||||||
|
int32_t x264_pixel_satd_4x16_lsx( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_8x4_lsx x264_template(pixel_satd_8x4_lsx)
|
||||||
|
int32_t x264_pixel_satd_8x4_lsx( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_8x8_lsx x264_template(pixel_satd_8x8_lsx)
|
||||||
|
int32_t x264_pixel_satd_8x8_lsx( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_8x16_lsx x264_template(pixel_satd_8x16_lsx)
|
||||||
|
int32_t x264_pixel_satd_8x16_lsx( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_16x8_lsx x264_template(pixel_satd_16x8_lsx)
|
||||||
|
int32_t x264_pixel_satd_16x8_lsx( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_16x16_lsx x264_template(pixel_satd_16x16_lsx)
|
||||||
|
int32_t x264_pixel_satd_16x16_lsx( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
|
||||||
|
#define x264_pixel_satd_4x8_lasx x264_template(pixel_satd_4x8_lasx)
|
||||||
|
int32_t x264_pixel_satd_4x8_lasx( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_4x16_lasx x264_template(pixel_satd_4x16_lasx)
|
||||||
|
int32_t x264_pixel_satd_4x16_lasx( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_8x4_lasx x264_template(pixel_satd_8x4_lasx)
|
||||||
|
int32_t x264_pixel_satd_8x4_lasx( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_8x8_lasx x264_template(pixel_satd_8x8_lasx)
|
||||||
|
int32_t x264_pixel_satd_8x8_lasx( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_8x16_lasx x264_template(pixel_satd_8x16_lasx)
|
||||||
|
int32_t x264_pixel_satd_8x16_lasx( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_16x8_lasx x264_template(pixel_satd_16x8_lasx)
|
||||||
|
int32_t x264_pixel_satd_16x8_lasx( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_16x16_lasx x264_template(pixel_satd_16x16_lasx)
|
||||||
|
int32_t x264_pixel_satd_16x16_lasx( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
|
||||||
|
#define x264_pixel_sad_x4_16x16_lsx x264_template(pixel_sad_x4_16x16_lsx)
|
||||||
|
void x264_pixel_sad_x4_16x16_lsx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[4] );
|
||||||
|
#define x264_pixel_sad_x4_16x8_lsx x264_template(pixel_sad_x4_16x8_lsx)
|
||||||
|
void x264_pixel_sad_x4_16x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[4] );
|
||||||
|
#define x264_pixel_sad_x4_8x16_lsx x264_template(pixel_sad_x4_8x16_lsx)
|
||||||
|
void x264_pixel_sad_x4_8x16_lsx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[4] );
|
||||||
|
#define x264_pixel_sad_x4_8x8_lsx x264_template(pixel_sad_x4_8x8_lsx)
|
||||||
|
void x264_pixel_sad_x4_8x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[4] );
|
||||||
|
#define x264_pixel_sad_x4_8x4_lsx x264_template(pixel_sad_x4_8x4_lsx)
|
||||||
|
void x264_pixel_sad_x4_8x4_lsx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[4] );
|
||||||
|
#define x264_pixel_sad_x4_4x8_lsx x264_template(pixel_sad_x4_4x8_lsx)
|
||||||
|
void x264_pixel_sad_x4_4x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[4] );
|
||||||
|
|
||||||
|
#define x264_pixel_sad_x4_16x16_lasx x264_template(pixel_sad_x4_16x16_lasx)
|
||||||
|
void x264_pixel_sad_x4_16x16_lasx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[4] );
|
||||||
|
#define x264_pixel_sad_x4_16x8_lasx x264_template(pixel_sad_x4_16x8_lasx)
|
||||||
|
void x264_pixel_sad_x4_16x8_lasx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[4] );
|
||||||
|
#define x264_pixel_sad_x4_8x8_lasx x264_template(pixel_sad_x4_8x8_lasx)
|
||||||
|
void x264_pixel_sad_x4_8x8_lasx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[4] );
|
||||||
|
#define x264_pixel_sad_x4_8x4_lasx x264_template(pixel_sad_x4_8x4_lasx)
|
||||||
|
void x264_pixel_sad_x4_8x4_lasx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[4] );
|
||||||
|
#define x264_pixel_sad_x4_4x4_lsx x264_template(pixel_sad_x4_4x4_lsx)
|
||||||
|
void x264_pixel_sad_x4_4x4_lsx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[4] );
|
||||||
|
|
||||||
|
#define x264_pixel_sad_x3_16x16_lsx x264_template(pixel_sad_x3_16x16_lsx)
|
||||||
|
void x264_pixel_sad_x3_16x16_lsx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_pixel_sad_x3_16x8_lsx x264_template(pixel_sad_x3_16x8_lsx)
|
||||||
|
void x264_pixel_sad_x3_16x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_pixel_sad_x3_8x16_lsx x264_template(pixel_sad_x3_8x16_lsx)
|
||||||
|
void x264_pixel_sad_x3_8x16_lsx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_pixel_sad_x3_8x8_lsx x264_template(pixel_sad_x3_8x8_lsx)
|
||||||
|
void x264_pixel_sad_x3_8x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_pixel_sad_x3_8x4_lsx x264_template(pixel_sad_x3_8x4_lsx)
|
||||||
|
void x264_pixel_sad_x3_8x4_lsx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_pixel_sad_x3_4x4_lsx x264_template(pixel_sad_x3_4x4_lsx)
|
||||||
|
void x264_pixel_sad_x3_4x4_lsx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_pixel_sad_x3_4x8_lsx x264_template(pixel_sad_x3_4x8_lsx)
|
||||||
|
void x264_pixel_sad_x3_4x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
|
||||||
|
#define x264_pixel_sad_x3_16x16_lasx x264_template(pixel_sad_x3_16x16_lasx)
|
||||||
|
void x264_pixel_sad_x3_16x16_lasx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_pixel_sad_x3_16x8_lasx x264_template(pixel_sad_x3_16x8_lasx)
|
||||||
|
void x264_pixel_sad_x3_16x8_lasx( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
|
||||||
|
#define x264_pixel_sad_16x16_lsx x264_template(pixel_sad_16x16_lsx)
|
||||||
|
int32_t x264_pixel_sad_16x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_sad_16x8_lsx x264_template(pixel_sad_16x8_lsx)
|
||||||
|
int32_t x264_pixel_sad_16x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_sad_8x16_lsx x264_template(pixel_sad_8x16_lsx)
|
||||||
|
int32_t x264_pixel_sad_8x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_sad_8x8_lsx x264_template(pixel_sad_8x8_lsx)
|
||||||
|
int32_t x264_pixel_sad_8x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_sad_8x4_lsx x264_template(pixel_sad_8x4_lsx)
|
||||||
|
int32_t x264_pixel_sad_8x4_lsx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_sad_4x16_lsx x264_template(pixel_sad_4x16_lsx)
|
||||||
|
int32_t x264_pixel_sad_4x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_sad_4x8_lsx x264_template(pixel_sad_4x8_lsx)
|
||||||
|
int32_t x264_pixel_sad_4x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_sad_4x4_lsx x264_template(pixel_sad_4x4_lsx)
|
||||||
|
int32_t x264_pixel_sad_4x4_lsx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
|
||||||
|
#define x264_pixel_sad_8x4_lasx x264_template(pixel_sad_8x4_lasx)
|
||||||
|
int32_t x264_pixel_sad_8x4_lasx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
|
||||||
|
#define x264_hadamard_ac_8x8_lsx x264_template(hadamard_ac_8x8_lsx)
|
||||||
|
uint64_t x264_hadamard_ac_8x8_lsx( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
#define x264_pixel_hadamard_ac_8x8_lsx x264_template(pixel_hadamard_ac_8x8_lsx)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_8x8_lsx( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
#define x264_pixel_hadamard_ac_8x16_lsx x264_template(pixel_hadamard_ac_8x16_lsx)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_8x16_lsx( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
#define x264_pixel_hadamard_ac_16x8_lsx x264_template(pixel_hadamard_ac_16x8_lsx)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_16x8_lsx( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
#define x264_pixel_hadamard_ac_16x16_lsx x264_template(pixel_hadamard_ac_16x16_lsx)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_16x16_lsx( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
|
||||||
|
#define x264_hadamard_ac_8x8_lasx x264_template(hadamard_ac_8x8_lasx)
|
||||||
|
uint64_t x264_hadamard_ac_8x8_lasx( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
#define x264_pixel_hadamard_ac_8x8_lasx x264_template(pixel_hadamard_ac_8x8_lasx)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_8x8_lasx( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
#define x264_pixel_hadamard_ac_8x16_lasx x264_template(pixel_hadamard_ac_8x16_lasx)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_8x16_lasx( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
#define x264_pixel_hadamard_ac_16x8_lasx x264_template(pixel_hadamard_ac_16x8_lasx)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_16x8_lasx( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
#define x264_pixel_hadamard_ac_16x16_lasx x264_template(pixel_hadamard_ac_16x16_lasx)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_16x16_lasx( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
|
||||||
|
#define x264_intra_satd_x3_16x16_lsx x264_template(intra_satd_x3_16x16_lsx)
|
||||||
|
void x264_intra_satd_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_intra_satd_x3_8x8c_lsx x264_template(intra_satd_x3_8x8c_lsx)
|
||||||
|
void x264_intra_satd_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_intra_satd_x3_4x4_lsx x264_template(intra_satd_x3_4x4_lsx)
|
||||||
|
void x264_intra_satd_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_intra_satd_x3_16x16_lasx x264_template(intra_satd_x3_16x16_lasx)
|
||||||
|
void x264_intra_satd_x3_16x16_lasx( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
|
||||||
|
#define x264_pixel_ssd_16x16_lsx x264_template(pixel_ssd_16x16_lsx)
|
||||||
|
int32_t x264_pixel_ssd_16x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_ssd_16x8_lsx x264_template(pixel_ssd_16x8_lsx)
|
||||||
|
int32_t x264_pixel_ssd_16x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_ssd_8x16_lsx x264_template(pixel_ssd_8x16_lsx)
|
||||||
|
int32_t x264_pixel_ssd_8x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_ssd_8x8_lsx x264_template(pixel_ssd_8x8_lsx)
|
||||||
|
int32_t x264_pixel_ssd_8x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_ssd_8x4_lsx x264_template(pixel_ssd_8x4_lsx)
|
||||||
|
int32_t x264_pixel_ssd_8x4_lsx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_ssd_4x16_lsx x264_template(pixel_ssd_4x16_lsx)
|
||||||
|
int32_t x264_pixel_ssd_4x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_ssd_4x8_lsx x264_template(pixel_ssd_4x8_lsx)
|
||||||
|
int32_t x264_pixel_ssd_4x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_ssd_4x4_lsx x264_template(pixel_ssd_4x4_lsx)
|
||||||
|
int32_t x264_pixel_ssd_4x4_lsx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
|
||||||
|
#define x264_pixel_ssd_16x16_lasx x264_template(pixel_ssd_16x16_lasx)
|
||||||
|
int32_t x264_pixel_ssd_16x16_lasx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_ssd_16x8_lasx x264_template(pixel_ssd_16x8_lasx)
|
||||||
|
int32_t x264_pixel_ssd_16x8_lasx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_ssd_8x16_lasx x264_template(pixel_ssd_8x16_lasx)
|
||||||
|
int32_t x264_pixel_ssd_8x16_lasx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_ssd_8x8_lasx x264_template(pixel_ssd_8x8_lasx)
|
||||||
|
int32_t x264_pixel_ssd_8x8_lasx( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
|
||||||
|
#define x264_pixel_var2_8x16_lsx x264_template(pixel_var2_8x16_lsx)
|
||||||
|
int32_t x264_pixel_var2_8x16_lsx( uint8_t *p_pix1, uint8_t *p_pix2,
|
||||||
|
int32_t ssd[2] );
|
||||||
|
#define x264_pixel_var2_8x8_lsx x264_template(pixel_var2_8x8_lsx)
|
||||||
|
int32_t x264_pixel_var2_8x8_lsx( uint8_t *p_pix1, uint8_t *p_pix2,
|
||||||
|
int32_t ssd[2] );
|
||||||
|
#define x264_pixel_var_16x16_lsx x264_template(pixel_var_16x16_lsx)
|
||||||
|
uint64_t x264_pixel_var_16x16_lsx( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
#define x264_pixel_var_8x16_lsx x264_template(pixel_var_8x16_lsx)
|
||||||
|
uint64_t x264_pixel_var_8x16_lsx( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
#define x264_pixel_var_8x8_lsx x264_template(pixel_var_8x8_lsx)
|
||||||
|
uint64_t x264_pixel_var_8x8_lsx( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
|
||||||
|
#define x264_pixel_var2_8x16_lasx x264_template(pixel_var2_8x16_lasx)
|
||||||
|
int32_t x264_pixel_var2_8x16_lasx( uint8_t *p_pix1, uint8_t *p_pix2,
|
||||||
|
int32_t ssd[2] );
|
||||||
|
#define x264_pixel_var2_8x8_lasx x264_template(pixel_var2_8x8_lasx)
|
||||||
|
int32_t x264_pixel_var2_8x8_lasx( uint8_t *p_pix1, uint8_t *p_pix2,
|
||||||
|
int32_t ssd[2] );
|
||||||
|
|
||||||
|
#define x264_pixel_sa8d_8x8_lsx x264_template(pixel_sa8d_8x8_lsx)
|
||||||
|
int32_t x264_pixel_sa8d_8x8_lsx( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_sa8d_16x16_lsx x264_template(pixel_sa8d_16x16_lsx)
|
||||||
|
int32_t x264_pixel_sa8d_16x16_lsx( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
|
||||||
|
#define x264_intra_sa8d_x3_8x8_lsx x264_template(intra_sa8d_x3_8x8_lsx)
|
||||||
|
void x264_intra_sa8d_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36],
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_intra_sa8d_x3_8x8_lasx x264_template(intra_sa8d_x3_8x8_lasx)
|
||||||
|
void x264_intra_sa8d_x3_8x8_lasx( uint8_t *p_enc, uint8_t p_edge[36],
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_pixel_sa8d_8x8_lasx x264_template(pixel_sa8d_8x8_lasx)
|
||||||
|
int32_t x264_pixel_sa8d_8x8_lasx( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_sa8d_16x16_lasx x264_template(pixel_sa8d_16x16_lasx)
|
||||||
|
int32_t x264_pixel_sa8d_16x16_lasx( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
|
||||||
|
#define x264_intra_sad_x3_16x16_lsx x264_template(intra_sad_x3_16x16_lsx)
|
||||||
|
void x264_intra_sad_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_intra_sad_x3_8x8_lsx x264_template(intra_sad_x3_8x8_lsx)
|
||||||
|
void x264_intra_sad_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36],
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_intra_sad_x3_8x8c_lsx x264_template(intra_sad_x3_8x8c_lsx)
|
||||||
|
void x264_intra_sad_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_intra_sad_x3_4x4_lsx x264_template(intra_sad_x3_4x4_lsx)
|
||||||
|
void x264_intra_sad_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
|
||||||
|
#endif
|
||||||
1383
common/loongarch/predict-a.S
Normal file
1383
common/loongarch/predict-a.S
Normal file
File diff suppressed because it is too large
Load Diff
106
common/loongarch/predict-c.c
Normal file
106
common/loongarch/predict-c.c
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* predict-c.c: loongarch intra prediction
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2023-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Xiwei Gu <guxiwei-hf@loongson.cn>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common/common.h"
|
||||||
|
#include "predict.h"
|
||||||
|
|
||||||
|
/* Fill the 16x16 intra prediction table with the LoongArch implementations
 * that the `cpu` flag mask allows.
 *
 *   cpu - bitmask tested against X264_CPU_LSX and X264_CPU_LASX
 *   pf  - predictor table, indexed by the I_PRED_16x16_* constants
 *
 * When HIGH_BIT_DEPTH is set the body is compiled out and pf[] is not
 * modified.  The LASX entry is written after the LSX ones, so it wins
 * whenever both flags are present. */
void x264_predict_16x16_init_loongarch( int cpu, x264_predict_t pf[7] )
{
#if !HIGH_BIT_DEPTH
    const int b_lsx  = !!( cpu & X264_CPU_LSX );
    const int b_lasx = !!( cpu & X264_CPU_LASX );

    if( b_lsx )
    {
        /* directional / planar predictors */
        pf[I_PRED_16x16_V]       = x264_predict_16x16_v_lsx;
        pf[I_PRED_16x16_H]       = x264_predict_16x16_h_lsx;
        pf[I_PRED_16x16_P]       = x264_predict_16x16_p_lsx;

        /* DC family */
        pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_lsx;
        pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_lsx;
        pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_lsx;
        pf[I_PRED_16x16_DC_128]  = x264_predict_16x16_dc_128_lsx;
    }

    if( b_lasx )
    {
        /* Only the planar predictor has a LASX version here. */
        pf[I_PRED_16x16_P]       = x264_predict_16x16_p_lasx;
    }
#endif
}
|
||||||
|
|
||||||
|
/* Fill the 8x8 chroma intra prediction table with the LSX implementations
 * when `cpu` has X264_CPU_LSX set.
 *
 *   cpu - bitmask tested against X264_CPU_LSX
 *   pf  - predictor table, indexed by the I_PRED_CHROMA_* constants
 *
 * When HIGH_BIT_DEPTH is set, or the LSX flag is absent, pf[] is left as is. */
void x264_predict_8x8c_init_loongarch( int cpu, x264_predict_t pf[7] )
{
#if !HIGH_BIT_DEPTH
    if( !( cpu & X264_CPU_LSX ) )
        return;

    /* directional / planar predictors */
    pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_lsx;
    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_lsx;
    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_lsx;

    /* DC family */
    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_lsx;
    pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_lsx;
    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_lsx;
    pf[I_PRED_CHROMA_DC_128]  = x264_predict_8x8c_dc_128_lsx;
#endif
}
|
||||||
|
|
||||||
|
/* Fill the 8x8 luma intra prediction table with the LoongArch
 * implementations that the `cpu` flag mask allows.
 *
 *   cpu            - bitmask tested against X264_CPU_LSX and X264_CPU_LASX
 *   pf             - predictor table, indexed by the I_PRED_8x8_* constants
 *   predict_filter - not read or written by this function
 *
 * When HIGH_BIT_DEPTH is set the body is compiled out.  LASX entries are
 * written after the LSX ones, so they win when both flags are present. */
void x264_predict_8x8_init_loongarch( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
{
#if !HIGH_BIT_DEPTH
    const int b_lsx  = !!( cpu & X264_CPU_LSX );
    const int b_lasx = !!( cpu & X264_CPU_LASX );

    if( b_lsx )
    {
        /* DC family and vertical: LSX only */
        pf[I_PRED_8x8_V]       = x264_predict_8x8_v_lsx;
        pf[I_PRED_8x8_DC]      = x264_predict_8x8_dc_lsx;
        pf[I_PRED_8x8_DC_LEFT] = x264_predict_8x8_dc_left_lsx;
        pf[I_PRED_8x8_DC_TOP]  = x264_predict_8x8_dc_top_lsx;
        pf[I_PRED_8x8_DC_128]  = x264_predict_8x8_dc_128_lsx;

        /* these five may be replaced by LASX versions below */
        pf[I_PRED_8x8_H]       = x264_predict_8x8_h_lsx;
        pf[I_PRED_8x8_DDL]     = x264_predict_8x8_ddl_lsx;
        pf[I_PRED_8x8_DDR]     = x264_predict_8x8_ddr_lsx;
        pf[I_PRED_8x8_VR]      = x264_predict_8x8_vr_lsx;
        pf[I_PRED_8x8_VL]      = x264_predict_8x8_vl_lsx;
    }

    if( b_lasx )
    {
        pf[I_PRED_8x8_H]       = x264_predict_8x8_h_lasx;
        pf[I_PRED_8x8_DDL]     = x264_predict_8x8_ddl_lasx;
        pf[I_PRED_8x8_DDR]     = x264_predict_8x8_ddr_lasx;
        pf[I_PRED_8x8_VR]      = x264_predict_8x8_vr_lasx;
        pf[I_PRED_8x8_VL]      = x264_predict_8x8_vl_lasx;
    }
#endif
}
|
||||||
|
|
||||||
|
/* Fill the 4x4 intra prediction table with the LSX implementations when
 * `cpu` has X264_CPU_LSX set.
 *
 *   cpu - bitmask tested against X264_CPU_LSX
 *   pf  - predictor table, indexed by the I_PRED_4x4_* constants
 *
 * Only the V, H, DC, DDL and DC_LEFT/DC_TOP/DC_128 slots are written; all
 * other slots keep whatever the caller installed.  When HIGH_BIT_DEPTH is
 * set, or the LSX flag is absent, pf[] is left as is. */
void x264_predict_4x4_init_loongarch( int cpu, x264_predict_t pf[12] )
{
#if !HIGH_BIT_DEPTH
    if( !( cpu & X264_CPU_LSX ) )
        return;

    /* directional predictors */
    pf[I_PRED_4x4_V]       = x264_predict_4x4_v_lsx;
    pf[I_PRED_4x4_H]       = x264_predict_4x4_h_lsx;
    pf[I_PRED_4x4_DDL]     = x264_predict_4x4_ddl_lsx;

    /* DC family */
    pf[I_PRED_4x4_DC]      = x264_predict_4x4_dc_lsx;
    pf[I_PRED_4x4_DC_LEFT] = x264_predict_4x4_dc_left_lsx;
    pf[I_PRED_4x4_DC_TOP]  = x264_predict_4x4_dc_top_lsx;
    pf[I_PRED_4x4_DC_128]  = x264_predict_4x4_dc_128_lsx;
#endif
}
|
||||||
150
common/loongarch/predict.h
Normal file
150
common/loongarch/predict.h
Normal file
@@ -0,0 +1,150 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* predict.h: loongarch intra prediction
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2023-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Xiwei Gu <guxiwei-hf@loongson.cn>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_LOONGARCH_PREDICT_H
|
||||||
|
#define X264_LOONGARCH_PREDICT_H
|
||||||
|
|
||||||
|
#define x264_predict_8x8c_p_lsx x264_template(predict_8x8c_p_lsx)
|
||||||
|
void x264_predict_8x8c_p_lsx(uint8_t *p_src);
|
||||||
|
|
||||||
|
#define x264_predict_8x8c_v_lsx x264_template(predict_8x8c_v_lsx)
|
||||||
|
void x264_predict_8x8c_v_lsx(uint8_t *p_src);
|
||||||
|
|
||||||
|
#define x264_predict_8x8c_h_lsx x264_template(predict_8x8c_h_lsx)
|
||||||
|
void x264_predict_8x8c_h_lsx(uint8_t *p_src);
|
||||||
|
|
||||||
|
#define x264_predict_8x8c_dc_lsx x264_template(predict_8x8c_dc_lsx)
|
||||||
|
void x264_predict_8x8c_dc_lsx(pixel *src);
|
||||||
|
|
||||||
|
#define x264_predict_8x8c_dc_128_lsx x264_template(predict_8x8c_dc_128_lsx)
|
||||||
|
void x264_predict_8x8c_dc_128_lsx(pixel *src);
|
||||||
|
|
||||||
|
#define x264_predict_8x8c_dc_top_lsx x264_template(predict_8x8c_dc_top_lsx)
|
||||||
|
void x264_predict_8x8c_dc_top_lsx(pixel *src);
|
||||||
|
|
||||||
|
#define x264_predict_8x8c_dc_left_lsx x264_template(predict_8x8c_dc_left_lsx)
|
||||||
|
void x264_predict_8x8c_dc_left_lsx(pixel *src);
|
||||||
|
|
||||||
|
#define x264_predict_16x16_dc_lsx x264_template(predict_16x16_dc_lsx)
|
||||||
|
void x264_predict_16x16_dc_lsx( pixel *src );
|
||||||
|
|
||||||
|
#define x264_predict_16x16_dc_left_lsx x264_template(predict_16x16_dc_left_lsx)
|
||||||
|
void x264_predict_16x16_dc_left_lsx( pixel *src );
|
||||||
|
|
||||||
|
#define x264_predict_16x16_dc_top_lsx x264_template(predict_16x16_dc_top_lsx)
|
||||||
|
void x264_predict_16x16_dc_top_lsx( pixel *src );
|
||||||
|
|
||||||
|
#define x264_predict_16x16_dc_128_lsx x264_template(predict_16x16_dc_128_lsx)
|
||||||
|
void x264_predict_16x16_dc_128_lsx( pixel *src );
|
||||||
|
|
||||||
|
#define x264_predict_16x16_h_lsx x264_template(predict_16x16_h_lsx)
|
||||||
|
void x264_predict_16x16_h_lsx( pixel *src );
|
||||||
|
|
||||||
|
#define x264_predict_16x16_v_lsx x264_template(predict_16x16_v_lsx)
|
||||||
|
void x264_predict_16x16_v_lsx( pixel *src );
|
||||||
|
|
||||||
|
#define x264_predict_16x16_p_lasx x264_template(predict_16x16_p_lasx)
|
||||||
|
void x264_predict_16x16_p_lasx( pixel *src );
|
||||||
|
|
||||||
|
#define x264_predict_16x16_p_lsx x264_template(predict_16x16_p_lsx)
|
||||||
|
void x264_predict_16x16_p_lsx( pixel *src );
|
||||||
|
|
||||||
|
#define x264_predict_8x8_v_lsx x264_template(predict_8x8_v_lsx)
|
||||||
|
void x264_predict_8x8_v_lsx( pixel *src, pixel edge[36] );
|
||||||
|
|
||||||
|
#define x264_predict_8x8_h_lasx x264_template(predict_8x8_h_lasx)
|
||||||
|
void x264_predict_8x8_h_lasx( pixel *src, pixel edge[36] );
|
||||||
|
|
||||||
|
#define x264_predict_8x8_h_lsx x264_template(predict_8x8_h_lsx)
|
||||||
|
void x264_predict_8x8_h_lsx( pixel *src, pixel edge[36] );
|
||||||
|
|
||||||
|
#define x264_predict_8x8_dc_lsx x264_template(predict_8x8_dc_lsx)
|
||||||
|
void x264_predict_8x8_dc_lsx( pixel *src, pixel edge[36] );
|
||||||
|
|
||||||
|
#define x264_predict_8x8_dc_left_lsx x264_template(predict_8x8_dc_left_lsx)
|
||||||
|
void x264_predict_8x8_dc_left_lsx( pixel *src, pixel edge[36] );
|
||||||
|
|
||||||
|
#define x264_predict_8x8_dc_top_lsx x264_template(predict_8x8_dc_top_lsx)
|
||||||
|
void x264_predict_8x8_dc_top_lsx( pixel *src, pixel edge[36] );
|
||||||
|
|
||||||
|
#define x264_predict_8x8_dc_128_lsx x264_template(predict_8x8_dc_128_lsx)
|
||||||
|
void x264_predict_8x8_dc_128_lsx( pixel *src, pixel edge[36] );
|
||||||
|
|
||||||
|
#define x264_predict_8x8_ddl_lasx x264_template(predict_8x8_ddl_lasx)
|
||||||
|
void x264_predict_8x8_ddl_lasx( pixel *src, pixel edge[36] );
|
||||||
|
|
||||||
|
#define x264_predict_8x8_ddl_lsx x264_template(predict_8x8_ddl_lsx)
|
||||||
|
void x264_predict_8x8_ddl_lsx( pixel *src, pixel edge[36] );
|
||||||
|
|
||||||
|
#define x264_predict_8x8_ddr_lasx x264_template(predict_8x8_ddr_lasx)
|
||||||
|
void x264_predict_8x8_ddr_lasx( pixel *src, pixel edge[36] );
|
||||||
|
|
||||||
|
#define x264_predict_8x8_ddr_lsx x264_template(predict_8x8_ddr_lsx)
|
||||||
|
void x264_predict_8x8_ddr_lsx( pixel *src, pixel edge[36] );
|
||||||
|
|
||||||
|
#define x264_predict_8x8_vr_lasx x264_template(predict_8x8_vr_lasx)
|
||||||
|
void x264_predict_8x8_vr_lasx( pixel *src, pixel edge[36] );
|
||||||
|
|
||||||
|
#define x264_predict_8x8_vr_lsx x264_template(predict_8x8_vr_lsx)
|
||||||
|
void x264_predict_8x8_vr_lsx( pixel *src, pixel edge[36] );
|
||||||
|
|
||||||
|
#define x264_predict_8x8_vl_lasx x264_template(predict_8x8_vl_lasx)
|
||||||
|
void x264_predict_8x8_vl_lasx( pixel *src, pixel edge[36] );
|
||||||
|
|
||||||
|
#define x264_predict_8x8_vl_lsx x264_template(predict_8x8_vl_lsx)
|
||||||
|
void x264_predict_8x8_vl_lsx( pixel *src, pixel edge[36] );
|
||||||
|
|
||||||
|
#define x264_predict_4x4_v_lsx x264_template(predict_4x4_v_lsx)
|
||||||
|
void x264_predict_4x4_v_lsx( pixel *p_src );
|
||||||
|
|
||||||
|
#define x264_predict_4x4_h_lsx x264_template(predict_4x4_h_lsx)
|
||||||
|
void x264_predict_4x4_h_lsx( pixel *p_src );
|
||||||
|
|
||||||
|
#define x264_predict_4x4_dc_lsx x264_template(predict_4x4_dc_lsx)
|
||||||
|
void x264_predict_4x4_dc_lsx( pixel *p_src );
|
||||||
|
|
||||||
|
#define x264_predict_4x4_ddl_lsx x264_template(predict_4x4_ddl_lsx)
|
||||||
|
void x264_predict_4x4_ddl_lsx( pixel *p_src );
|
||||||
|
|
||||||
|
#define x264_predict_4x4_dc_top_lsx x264_template(predict_4x4_dc_top_lsx)
|
||||||
|
void x264_predict_4x4_dc_top_lsx( pixel *p_src );
|
||||||
|
|
||||||
|
#define x264_predict_4x4_dc_left_lsx x264_template(predict_4x4_dc_left_lsx)
|
||||||
|
void x264_predict_4x4_dc_left_lsx( pixel *p_src );
|
||||||
|
|
||||||
|
#define x264_predict_4x4_dc_128_lsx x264_template(predict_4x4_dc_128_lsx)
|
||||||
|
void x264_predict_4x4_dc_128_lsx( pixel *p_src );
|
||||||
|
|
||||||
|
#define x264_predict_4x4_init_loongarch x264_template(predict_4x4_init_loongarch)
|
||||||
|
void x264_predict_4x4_init_loongarch( int cpu, x264_predict_t pf[12] );
|
||||||
|
#define x264_predict_8x8_init_loongarch x264_template(predict_8x8_init_loongarch)
|
||||||
|
void x264_predict_8x8_init_loongarch( int cpu, x264_predict8x8_t pf[12],
|
||||||
|
x264_predict_8x8_filter_t *predict_filter );
|
||||||
|
#define x264_predict_8x8c_init_loongarch x264_template(predict_8x8c_init_loongarch)
|
||||||
|
void x264_predict_8x8c_init_loongarch( int cpu, x264_predict_t pf[7] );
|
||||||
|
#define x264_predict_16x16_init_loongarch x264_template(predict_16x16_init_loongarch)
|
||||||
|
void x264_predict_16x16_init_loongarch( int cpu, x264_predict_t pf[7] );
|
||||||
|
|
||||||
|
#endif
|
||||||
1231
common/loongarch/quant-a.S
Normal file
1231
common/loongarch/quant-a.S
Normal file
File diff suppressed because it is too large
Load Diff
96
common/loongarch/quant.h
Normal file
96
common/loongarch/quant.h
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* quant.h: loongarch quantization and level-run
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2023-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Shiyou Yin <yinshiyou-hf@loongson.cn>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_LOONGARCH_QUANT_H
|
||||||
|
#define X264_LOONGARCH_QUANT_H
|
||||||
|
|
||||||
|
#define x264_coeff_last64_lsx x264_template(coeff_last64_lsx)
|
||||||
|
int32_t x264_coeff_last64_lsx( int16_t *p_src );
|
||||||
|
#define x264_coeff_last16_lsx x264_template(coeff_last16_lsx)
|
||||||
|
int32_t x264_coeff_last16_lsx( int16_t *p_src );
|
||||||
|
#define x264_coeff_last15_lsx x264_template(coeff_last15_lsx)
|
||||||
|
int32_t x264_coeff_last15_lsx( int16_t *p_src );
|
||||||
|
#define x264_coeff_last8_lsx x264_template(coeff_last8_lsx)
|
||||||
|
int32_t x264_coeff_last8_lsx( int16_t *p_src );
|
||||||
|
#define x264_coeff_last4_lsx x264_template(coeff_last4_lsx)
|
||||||
|
int32_t x264_coeff_last4_lsx( int16_t *p_src );
|
||||||
|
|
||||||
|
#define x264_quant_4x4_lsx x264_template(quant_4x4_lsx)
|
||||||
|
int32_t x264_quant_4x4_lsx( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias );
|
||||||
|
#define x264_quant_4x4x4_lsx x264_template(quant_4x4x4_lsx)
|
||||||
|
int32_t x264_quant_4x4x4_lsx( int16_t p_dct[4][16],
|
||||||
|
uint16_t pu_mf[16], uint16_t pu_bias[16] );
|
||||||
|
#define x264_quant_8x8_lsx x264_template(quant_8x8_lsx)
|
||||||
|
int32_t x264_quant_8x8_lsx( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias );
|
||||||
|
#define x264_quant_4x4_dc_lsx x264_template(quant_4x4_dc_lsx)
|
||||||
|
int32_t x264_quant_4x4_dc_lsx( dctcoef dct[16], int32_t mf, int32_t bias );
|
||||||
|
#define x264_quant_2x2_dc_lsx x264_template(quant_2x2_dc_lsx)
|
||||||
|
int32_t x264_quant_2x2_dc_lsx( dctcoef dct[4], int32_t mf, int32_t bias );
|
||||||
|
|
||||||
|
#define x264_dequant_4x4_lsx x264_template(dequant_4x4_lsx)
|
||||||
|
void x264_dequant_4x4_lsx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
|
||||||
|
#define x264_dequant_8x8_lsx x264_template(dequant_8x8_lsx)
|
||||||
|
void x264_dequant_8x8_lsx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
|
||||||
|
#define x264_dequant_4x4_dc_lsx x264_template(dequant_4x4_dc_lsx)
|
||||||
|
void x264_dequant_4x4_dc_lsx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
|
||||||
|
|
||||||
|
#define x264_decimate_score15_lsx x264_template(decimate_score15_lsx)
|
||||||
|
int x264_decimate_score15_lsx( dctcoef *dct );
|
||||||
|
#define x264_decimate_score16_lsx x264_template(decimate_score16_lsx)
|
||||||
|
int x264_decimate_score16_lsx( dctcoef *dct );
|
||||||
|
#define x264_decimate_score64_lsx x264_template(decimate_score64_lsx)
|
||||||
|
int x264_decimate_score64_lsx( dctcoef *dct );
|
||||||
|
|
||||||
|
#define x264_coeff_last64_lasx x264_template(coeff_last64_lasx)
|
||||||
|
int32_t x264_coeff_last64_lasx( int16_t *p_src );
|
||||||
|
#define x264_coeff_last16_lasx x264_template(coeff_last16_lasx)
|
||||||
|
int32_t x264_coeff_last16_lasx( int16_t *p_src );
|
||||||
|
#define x264_coeff_last15_lasx x264_template(coeff_last15_lasx)
|
||||||
|
int32_t x264_coeff_last15_lasx( int16_t *p_src );
|
||||||
|
|
||||||
|
#define x264_quant_4x4x4_lasx x264_template(quant_4x4x4_lasx)
|
||||||
|
int32_t x264_quant_4x4x4_lasx( int16_t p_dct[4][16],
|
||||||
|
uint16_t pu_mf[16], uint16_t pu_bias[16] );
|
||||||
|
|
||||||
|
#define x264_dequant_4x4_lasx x264_template(dequant_4x4_lasx)
|
||||||
|
void x264_dequant_4x4_lasx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
|
||||||
|
#define x264_dequant_8x8_lasx x264_template(dequant_8x8_lasx)
|
||||||
|
void x264_dequant_8x8_lasx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
|
||||||
|
#define x264_dequant_4x4_dc_lasx x264_template(dequant_4x4_dc_lasx)
|
||||||
|
void x264_dequant_4x4_dc_lasx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
|
||||||
|
|
||||||
|
#define x264_coeff_level_run16_lasx x264_template(coeff_level_run16_lasx)
|
||||||
|
int x264_coeff_level_run16_lasx( dctcoef *, x264_run_level_t * );
|
||||||
|
#define x264_coeff_level_run15_lasx x264_template(coeff_level_run15_lasx)
|
||||||
|
int x264_coeff_level_run15_lasx( dctcoef *, x264_run_level_t * );
|
||||||
|
|
||||||
|
#define x264_coeff_level_run16_lsx x264_template(coeff_level_run16_lsx)
|
||||||
|
int x264_coeff_level_run16_lsx( dctcoef *, x264_run_level_t * );
|
||||||
|
#define x264_coeff_level_run15_lsx x264_template(coeff_level_run15_lsx)
|
||||||
|
int x264_coeff_level_run15_lsx( dctcoef *, x264_run_level_t * );
|
||||||
|
#define x264_coeff_level_run8_lsx x264_template(coeff_level_run8_lsx)
|
||||||
|
int x264_coeff_level_run8_lsx( dctcoef *, x264_run_level_t * );
|
||||||
|
|
||||||
|
#endif/* X264_LOONGARCH_QUANT_H */
|
||||||
2585
common/loongarch/sad-a.S
Normal file
2585
common/loongarch/sad-a.S
Normal file
File diff suppressed because it is too large
Load Diff
1926
common/macroblock.c
Normal file
1926
common/macroblock.c
Normal file
File diff suppressed because it is too large
Load Diff
463
common/macroblock.h
Normal file
463
common/macroblock.h
Normal file
@@ -0,0 +1,463 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* macroblock.h: macroblock common functions
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2005-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||||
|
* Laurent Aimar <fenrir@via.ecp.fr>
|
||||||
|
* Fiona Glaser <fiona@x264.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_MACROBLOCK_H
|
||||||
|
#define X264_MACROBLOCK_H
|
||||||
|
|
||||||
|
/* Bit flags naming positions relative to a macroblock.  The four neighbor
 * flags occupy bits 0-3 and ALL_NEIGHBORS is their union; MB_PRIVATE is a
 * separate bit (bit 4) that is not part of ALL_NEIGHBORS. */
enum macroblock_position_e
{
    MB_LEFT       = 1 << 0,
    MB_TOP        = 1 << 1,
    MB_TOPRIGHT   = 1 << 2,
    MB_TOPLEFT    = 1 << 3,

    MB_PRIVATE    = 1 << 4,

    ALL_NEIGHBORS = MB_LEFT | MB_TOP | MB_TOPRIGHT | MB_TOPLEFT,
};
|
||||||
|
|
||||||
|
static const uint8_t x264_pred_i4x4_neighbors[12] =
|
||||||
|
{
|
||||||
|
MB_TOP, // I_PRED_4x4_V
|
||||||
|
MB_LEFT, // I_PRED_4x4_H
|
||||||
|
MB_LEFT | MB_TOP, // I_PRED_4x4_DC
|
||||||
|
MB_TOP | MB_TOPRIGHT, // I_PRED_4x4_DDL
|
||||||
|
MB_LEFT | MB_TOPLEFT | MB_TOP, // I_PRED_4x4_DDR
|
||||||
|
MB_LEFT | MB_TOPLEFT | MB_TOP, // I_PRED_4x4_VR
|
||||||
|
MB_LEFT | MB_TOPLEFT | MB_TOP, // I_PRED_4x4_HD
|
||||||
|
MB_TOP | MB_TOPRIGHT, // I_PRED_4x4_VL
|
||||||
|
MB_LEFT, // I_PRED_4x4_HU
|
||||||
|
MB_LEFT, // I_PRED_4x4_DC_LEFT
|
||||||
|
MB_TOP, // I_PRED_4x4_DC_TOP
|
||||||
|
0 // I_PRED_4x4_DC_128
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/* XXX mb_type isn't the one written in the bitstream -> only internal usage */
#define IS_INTRA(type)  ( (type) == I_4x4 || (type) == I_8x8 || (type) == I_16x16 || (type) == I_PCM )
#define IS_SKIP(type)   ( (type) == P_SKIP || (type) == B_SKIP )
#define IS_DIRECT(type) ( (type) == B_DIRECT )

/* Internal macroblock types. The numeric values are consecutive and are
 * used to index the X264_MBTYPE_MAX-sized tables in this header. */
enum mb_class_e
{
    /* intra */
    I_4x4 = 0,          /*  0 */
    I_8x8,              /*  1 */
    I_16x16,            /*  2 */
    I_PCM,              /*  3 */

    /* P */
    P_L0,               /*  4 */
    P_8x8,              /*  5 */
    P_SKIP,             /*  6 */

    /* B */
    B_DIRECT,           /*  7 */
    B_L0_L0,            /*  8 */
    B_L0_L1,            /*  9 */
    B_L0_BI,            /* 10 */
    B_L1_L0,            /* 11 */
    B_L1_L1,            /* 12 */
    B_L1_BI,            /* 13 */
    B_BI_L0,            /* 14 */
    B_BI_L1,            /* 15 */
    B_BI_BI,            /* 16 */
    B_8x8,              /* 17 */
    B_SKIP,             /* 18 */

    X264_MBTYPE_MAX     /* 19: number of types */
};
|
||||||
|
static const uint8_t x264_mb_type_fix[X264_MBTYPE_MAX] =
|
||||||
|
{
|
||||||
|
I_4x4, I_4x4, I_16x16, I_PCM,
|
||||||
|
P_L0, P_8x8, P_SKIP,
|
||||||
|
B_DIRECT, B_L0_L0, B_L0_L1, B_L0_BI, B_L1_L0, B_L1_L1,
|
||||||
|
B_L1_BI, B_BI_L0, B_BI_L1, B_BI_BI, B_8x8, B_SKIP
|
||||||
|
};
|
||||||
|
/* 2x2 flag matrix per macroblock type, indexed by mb_class_e.
 * NOTE(review): the inner dimensions look like [list][partition half];
 * confirm against the users of this table. */
static const uint8_t x264_mb_type_list_table[X264_MBTYPE_MAX][2][2] =
{
    /* I_4x4    */ {{0,0},{0,0}},
    /* I_8x8    */ {{0,0},{0,0}},
    /* I_16x16  */ {{0,0},{0,0}},
    /* I_PCM    */ {{0,0},{0,0}},
    /* P_L0     */ {{1,1},{0,0}},
    /* P_8x8    */ {{0,0},{0,0}},
    /* P_SKIP   */ {{1,1},{0,0}},
    /* B_DIRECT */ {{0,0},{0,0}},
    /* B_L0_L0  */ {{1,1},{0,0}},
    /* B_L0_L1  */ {{1,0},{0,1}},
    /* B_L0_BI  */ {{1,1},{0,1}},
    /* B_L1_L0  */ {{0,1},{1,0}},
    /* B_L1_L1  */ {{0,0},{1,1}},
    /* B_L1_BI  */ {{0,1},{1,1}},
    /* B_BI_L0  */ {{1,1},{1,0}},
    /* B_BI_L1  */ {{1,0},{1,1}},
    /* B_BI_BI  */ {{1,1},{1,1}},
    /* B_8x8    */ {{0,0},{0,0}},
    /* B_SKIP   */ {{0,0},{0,0}}
};
|
||||||
|
|
||||||
|
/* Sub-partition shape predicates: true when `type` is any of the L0/L1/BI
 * variants of the given size (IS_SUB8x8 also accepts D_DIRECT_8x8).
 * The argument is parenthesized on every use so that expression arguments
 * such as `cond ? a : b` or `x & mask` are compared as a whole; `type` is
 * evaluated more than once, so it must not have side effects. */
#define IS_SUB4x4(type) ( ((type) == D_L0_4x4) || ((type) == D_L1_4x4) || ((type) == D_BI_4x4) )
#define IS_SUB4x8(type) ( ((type) == D_L0_4x8) || ((type) == D_L1_4x8) || ((type) == D_BI_4x8) )
#define IS_SUB8x4(type) ( ((type) == D_L0_8x4) || ((type) == D_L1_8x4) || ((type) == D_BI_8x4) )
#define IS_SUB8x8(type) ( ((type) == D_L0_8x8) || ((type) == D_L1_8x8) || ((type) == D_BI_8x8) || ((type) == D_DIRECT_8x8) )

/* Partition and sub-partition types. The values index the 17-entry
 * x264_mb_partition_* tables that follow in this header. */
enum mb_partition_e
{
    /* sub partition type for P_8x8 and B_8x8 */
    D_L0_4x4          = 0,
    D_L0_8x4          = 1,
    D_L0_4x8          = 2,
    D_L0_8x8          = 3,

    /* sub partition type for B_8x8 only */
    D_L1_4x4          = 4,
    D_L1_8x4          = 5,
    D_L1_4x8          = 6,
    D_L1_8x8          = 7,

    D_BI_4x4          = 8,
    D_BI_8x4          = 9,
    D_BI_4x8          = 10,
    D_BI_8x8          = 11,
    D_DIRECT_8x8      = 12,

    /* partition */
    D_8x8             = 13,
    D_16x8            = 14,
    D_8x16            = 15,
    D_16x16           = 16,
    X264_PARTTYPE_MAX = 17,
};
|
||||||
|
|
||||||
|
/* Two rows of per-partition flags, indexed [row][mb_partition_e].
 * Row 0 is set for the D_L0_* and D_BI_* types, row 1 for the D_L1_* and
 * D_BI_* types. The second dimension (17) equals X264_PARTTYPE_MAX. */
static const uint8_t x264_mb_partition_listX_table[2][17] =
{
    {
        /* D_L0_*        */ 1, 1, 1, 1,
        /* D_L1_*        */ 0, 0, 0, 0,
        /* D_BI_*        */ 1, 1, 1, 1,
        /* D_DIRECT_8x8  */ 0,
        /* 8x8 .. 16x16  */ 0, 0, 0, 0
    },
    {
        /* D_L0_*        */ 0, 0, 0, 0,
        /* D_L1_*        */ 1, 1, 1, 1,
        /* D_BI_*        */ 1, 1, 1, 1,
        /* D_DIRECT_8x8  */ 0,
        /* 8x8 .. 16x16  */ 0, 0, 0, 0
    }
};
|
||||||
|
/* Count value for each mb_partition_e entry (17 == X264_PARTTYPE_MAX). */
static const uint8_t x264_mb_partition_count_table[17] =
{
    4, 2, 2, 1,     /* sub L0 */
    4, 2, 2, 1,     /* sub L1 */
    4, 2, 2, 1,     /* sub BI */
    1,              /* Direct */
    4, 2, 2, 1      /* Partition */
};
|
||||||
|
static const uint8_t x264_mb_partition_pixel_table[17] =
|
||||||
|
{
|
||||||
|
PIXEL_4x4, PIXEL_8x4, PIXEL_4x8, PIXEL_8x8, /* D_L0_* */
|
||||||
|
PIXEL_4x4, PIXEL_8x4, PIXEL_4x8, PIXEL_8x8, /* D_L1_* */
|
||||||
|
PIXEL_4x4, PIXEL_8x4, PIXEL_4x8, PIXEL_8x8, /* D_BI_* */
|
||||||
|
PIXEL_8x8, /* D_DIRECT_8x8 */
|
||||||
|
PIXEL_8x8, PIXEL_16x8, PIXEL_8x16, PIXEL_16x16, /* 8x8 .. 16x16 */
|
||||||
|
};
|
||||||
|
|
||||||
|
/* zigzags are transposed with respect to the tables in the standard */
/* 4x4 scan order: row [0] is the frame scan, row [1] the field scan. */
static const uint8_t x264_zigzag_scan4[2][16] =
{
    {   /* frame */
         0,  4,  1,  2,
         5,  8, 12,  9,
         6,  3,  7, 10,
        13, 14, 11, 15
    },
    {   /* field */
         0,  1,  4,  2,
         3,  5,  6,  7,
         8,  9, 10, 11,
        12, 13, 14, 15
    }
};
|
||||||
|
/* 8x8 scan order. The first row sits in the same position as the frame
 * scan of x264_zigzag_scan4, the second as its field scan. */
static const uint8_t x264_zigzag_scan8[2][64] =
{
    {
         0,  8,  1,  2,  9, 16, 24, 17,
        10,  3,  4, 11, 18, 25, 32, 40,
        33, 26, 19, 12,  5,  6, 13, 20,
        27, 34, 41, 48, 56, 49, 42, 35,
        28, 21, 14,  7, 15, 22, 29, 36,
        43, 50, 57, 58, 51, 44, 37, 30,
        23, 31, 38, 45, 52, 59, 60, 53,
        46, 39, 47, 54, 61, 62, 55, 63
    },
    {
         0,  1,  2,  8,  9,  3,  4, 10,
        16, 11,  5,  6,  7, 12, 17, 24,
        18, 13, 14, 15, 19, 25, 32, 26,
        20, 21, 22, 23, 27, 33, 40, 34,
        28, 29, 30, 31, 35, 41, 48, 42,
        36, 37, 38, 39, 43, 49, 50, 44,
        45, 46, 47, 51, 56, 57, 52, 53,
        54, 55, 58, 59, 60, 61, 62, 63
    }
};
|
||||||
|
|
||||||
|
/* x coordinate (0..3) for each of the 16 block indices. */
static const uint8_t block_idx_x[16] =
{
    0, 1, 0, 1,
    2, 3, 2, 3,
    0, 1, 0, 1,
    2, 3, 2, 3
};
|
||||||
|
/* y coordinate (0..3) for each of the 16 block indices. */
static const uint8_t block_idx_y[16] =
{
    0, 0, 1, 1,
    0, 0, 1, 1,
    2, 2, 3, 3,
    2, 2, 3, 3
};
|
||||||
|
/* Block index for a coordinate pair: block_idx_xy[a][b] is the index i
 * for which block_idx_x[i] == a and block_idx_y[i] == b. */
static const uint8_t block_idx_xy[4][4] =
{
    {  0,  2,  8, 10 },
    {  1,  3,  9, 11 },
    {  4,  6, 12, 14 },
    {  5,  7, 13, 15 }
};
|
||||||
|
/* 16-entry permutation of block indices, shown four entries per line. */
static const uint8_t block_idx_xy_1d[16] =
{
     0,  1,  4,  5,
     2,  3,  6,  7,
     8,  9, 12, 13,
    10, 11, 14, 15
};
|
||||||
|
/* 16-entry permutation of block indices, shown four entries per line. */
static const uint8_t block_idx_yx_1d[16] =
{
     0,  4,  1,  5,
     8, 12,  9, 13,
     2,  6,  3,  7,
    10, 14, 11, 15
};
|
||||||
|
static const uint8_t block_idx_xy_fenc[16] =
|
||||||
|
{
|
||||||
|
0*4 + 0*4*FENC_STRIDE, 1*4 + 0*4*FENC_STRIDE,
|
||||||
|
0*4 + 1*4*FENC_STRIDE, 1*4 + 1*4*FENC_STRIDE,
|
||||||
|
2*4 + 0*4*FENC_STRIDE, 3*4 + 0*4*FENC_STRIDE,
|
||||||
|
2*4 + 1*4*FENC_STRIDE, 3*4 + 1*4*FENC_STRIDE,
|
||||||
|
0*4 + 2*4*FENC_STRIDE, 1*4 + 2*4*FENC_STRIDE,
|
||||||
|
0*4 + 3*4*FENC_STRIDE, 1*4 + 3*4*FENC_STRIDE,
|
||||||
|
2*4 + 2*4*FENC_STRIDE, 3*4 + 2*4*FENC_STRIDE,
|
||||||
|
2*4 + 3*4*FENC_STRIDE, 3*4 + 3*4*FENC_STRIDE
|
||||||
|
};
|
||||||
|
static const uint16_t block_idx_xy_fdec[16] =
|
||||||
|
{
|
||||||
|
0*4 + 0*4*FDEC_STRIDE, 1*4 + 0*4*FDEC_STRIDE,
|
||||||
|
0*4 + 1*4*FDEC_STRIDE, 1*4 + 1*4*FDEC_STRIDE,
|
||||||
|
2*4 + 0*4*FDEC_STRIDE, 3*4 + 0*4*FDEC_STRIDE,
|
||||||
|
2*4 + 1*4*FDEC_STRIDE, 3*4 + 1*4*FDEC_STRIDE,
|
||||||
|
0*4 + 2*4*FDEC_STRIDE, 1*4 + 2*4*FDEC_STRIDE,
|
||||||
|
0*4 + 3*4*FDEC_STRIDE, 1*4 + 3*4*FDEC_STRIDE,
|
||||||
|
2*4 + 2*4*FDEC_STRIDE, 3*4 + 2*4*FDEC_STRIDE,
|
||||||
|
2*4 + 3*4*FDEC_STRIDE, 3*4 + 3*4*FDEC_STRIDE
|
||||||
|
};
|
||||||
|
|
||||||
|
#define QP(qP) ( (qP)+QP_BD_OFFSET )
|
||||||
|
static const uint8_t i_chroma_qp_table[QP_MAX+1+12*2] =
|
||||||
|
{
|
||||||
|
0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0,
|
||||||
|
#if BIT_DEPTH > 9
|
||||||
|
QP(-12),QP(-11),QP(-10), QP(-9), QP(-8), QP(-7),
|
||||||
|
#endif
|
||||||
|
#if BIT_DEPTH > 8
|
||||||
|
QP(-6), QP(-5), QP(-4), QP(-3), QP(-2), QP(-1),
|
||||||
|
#endif
|
||||||
|
QP(0), QP(1), QP(2), QP(3), QP(4), QP(5),
|
||||||
|
QP(6), QP(7), QP(8), QP(9), QP(10), QP(11),
|
||||||
|
QP(12), QP(13), QP(14), QP(15), QP(16), QP(17),
|
||||||
|
QP(18), QP(19), QP(20), QP(21), QP(22), QP(23),
|
||||||
|
QP(24), QP(25), QP(26), QP(27), QP(28), QP(29),
|
||||||
|
QP(29), QP(30), QP(31), QP(32), QP(32), QP(33),
|
||||||
|
QP(34), QP(34), QP(35), QP(35), QP(36), QP(36),
|
||||||
|
QP(37), QP(37), QP(37), QP(38), QP(38), QP(38),
|
||||||
|
QP(39), QP(39), QP(39), QP(39),
|
||||||
|
QP(39), QP(39), QP(39), QP(39), QP(39), QP(39),
|
||||||
|
QP(39), QP(39), QP(39), QP(39), QP(39), QP(39),
|
||||||
|
};
|
||||||
|
#undef QP
|
||||||
|
|
||||||
|
/* Block categories; values are consecutive from 0 to 13. */
enum cabac_ctx_block_cat_e
{
    DCT_LUMA_DC = 0,        /*  0 */
    DCT_LUMA_AC,            /*  1 */
    DCT_LUMA_4x4,           /*  2 */
    DCT_CHROMA_DC,          /*  3 */
    DCT_CHROMA_AC,          /*  4 */
    DCT_LUMA_8x8,           /*  5 */

    DCT_CHROMAU_DC = 6,     /*  6 */
    DCT_CHROMAU_AC,         /*  7 */
    DCT_CHROMAU_4x4,        /*  8 */
    DCT_CHROMAU_8x8,        /*  9 */

    DCT_CHROMAV_DC = 10,    /* 10 */
    DCT_CHROMAV_AC,         /* 11 */
    DCT_CHROMAV_4x4,        /* 12 */
    DCT_CHROMAV_8x8,        /* 13 */
};
|
||||||
|
|
||||||
|
static const uint8_t ctx_cat_plane[6][3] =
|
||||||
|
{
|
||||||
|
{ DCT_LUMA_DC, DCT_CHROMAU_DC, DCT_CHROMAV_DC},
|
||||||
|
{ DCT_LUMA_AC, DCT_CHROMAU_AC, DCT_CHROMAV_AC},
|
||||||
|
{DCT_LUMA_4x4, DCT_CHROMAU_4x4, DCT_CHROMAV_4x4},
|
||||||
|
{0},
|
||||||
|
{0},
|
||||||
|
{DCT_LUMA_8x8, DCT_CHROMAU_8x8, DCT_CHROMAV_8x8}
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Per-frame allocation: is allocated per-thread only in frame-threads mode. */
|
||||||
|
#define x264_macroblock_cache_allocate x264_template(macroblock_cache_allocate)
|
||||||
|
int x264_macroblock_cache_allocate( x264_t *h );
|
||||||
|
#define x264_macroblock_cache_free x264_template(macroblock_cache_free)
|
||||||
|
void x264_macroblock_cache_free( x264_t *h );
|
||||||
|
|
||||||
|
/* Per-thread allocation: is allocated per-thread even in sliced-threads mode. */
|
||||||
|
#define x264_macroblock_thread_allocate x264_template(macroblock_thread_allocate)
|
||||||
|
int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead );
|
||||||
|
#define x264_macroblock_thread_free x264_template(macroblock_thread_free)
|
||||||
|
void x264_macroblock_thread_free( x264_t *h, int b_lookahead );
|
||||||
|
|
||||||
|
#define x264_macroblock_slice_init x264_template(macroblock_slice_init)
|
||||||
|
void x264_macroblock_slice_init( x264_t *h );
|
||||||
|
#define x264_macroblock_thread_init x264_template(macroblock_thread_init)
|
||||||
|
void x264_macroblock_thread_init( x264_t *h );
|
||||||
|
#define x264_macroblock_cache_load_interlaced x264_template(macroblock_cache_load_interlaced)
|
||||||
|
void x264_macroblock_cache_load_progressive( x264_t *h, int mb_x, int mb_y );
|
||||||
|
#define x264_macroblock_cache_load_progressive x264_template(macroblock_cache_load_progressive)
|
||||||
|
void x264_macroblock_cache_load_interlaced( x264_t *h, int mb_x, int mb_y );
|
||||||
|
#define x264_macroblock_deblock_strength x264_template(macroblock_deblock_strength)
|
||||||
|
void x264_macroblock_deblock_strength( x264_t *h );
|
||||||
|
#define x264_macroblock_cache_save x264_template(macroblock_cache_save)
|
||||||
|
void x264_macroblock_cache_save( x264_t *h );
|
||||||
|
|
||||||
|
#define x264_macroblock_bipred_init x264_template(macroblock_bipred_init)
|
||||||
|
void x264_macroblock_bipred_init( x264_t *h );
|
||||||
|
|
||||||
|
#define x264_prefetch_fenc x264_template(prefetch_fenc)
|
||||||
|
void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y );
|
||||||
|
|
||||||
|
#define x264_copy_column8 x264_template(copy_column8)
|
||||||
|
void x264_copy_column8( pixel *dst, pixel *src );
|
||||||
|
|
||||||
|
/* x264_mb_predict_mv_16x16:
|
||||||
|
* set mvp with predicted mv for D_16x16 block
|
||||||
|
* h->mb. need only valid values from other blocks */
|
||||||
|
#define x264_mb_predict_mv_16x16 x264_template(mb_predict_mv_16x16)
|
||||||
|
void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2] );
|
||||||
|
/* x264_mb_predict_mv_pskip:
|
||||||
|
* set mvp with predicted mv for P_SKIP
|
||||||
|
* h->mb. need only valid values from other blocks */
|
||||||
|
#define x264_mb_predict_mv_pskip x264_template(mb_predict_mv_pskip)
|
||||||
|
void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] );
|
||||||
|
/* x264_mb_predict_mv:
|
||||||
|
* set mvp with predicted mv for all blocks except SKIP and DIRECT
|
||||||
|
* h->mb. need valid ref/partition/sub of current block to be valid
|
||||||
|
* and valid mv/ref from other blocks. */
|
||||||
|
#define x264_mb_predict_mv x264_template(mb_predict_mv)
|
||||||
|
void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] );
|
||||||
|
/* x264_mb_predict_mv_direct16x16:
|
||||||
|
* set h->mb.cache.mv and h->mb.cache.ref for B_SKIP or B_DIRECT
|
||||||
|
* h->mb. need only valid values from other blocks.
|
||||||
|
* return 1 on success, 0 on failure.
|
||||||
|
* if b_changed != NULL, set it to whether refs or mvs differ from
|
||||||
|
* before this function call. */
|
||||||
|
#define x264_mb_predict_mv_direct16x16 x264_template(mb_predict_mv_direct16x16)
|
||||||
|
int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed );
|
||||||
|
/* x264_mb_predict_mv_ref16x16:
|
||||||
|
* set mvc with D_16x16 prediction.
|
||||||
|
* uses all neighbors, even those that didn't end up using this ref.
|
||||||
|
* h->mb. need only valid values from other blocks */
|
||||||
|
#define x264_mb_predict_mv_ref16x16 x264_template(mb_predict_mv_ref16x16)
|
||||||
|
void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t (*mvc)[2], int *i_mvc );
|
||||||
|
|
||||||
|
#define x264_mb_mc x264_template(mb_mc)
|
||||||
|
void x264_mb_mc( x264_t *h );
|
||||||
|
#define x264_mb_mc_8x8 x264_template(mb_mc_8x8)
|
||||||
|
void x264_mb_mc_8x8( x264_t *h, int i8 );
|
||||||
|
|
||||||
|
/* Combine two values into one 32-bit word: one operand is added as-is,
 * the other shifted left by 16. WORDS_BIGENDIAN selects which of a/b is
 * the shifted one. Inputs are not masked. */
static ALWAYS_INLINE uint32_t pack16to32( uint32_t a, uint32_t b )
{
#if WORDS_BIGENDIAN
    const uint32_t lo = b, hi = a;
#else
    const uint32_t lo = a, hi = b;
#endif
    return lo + (hi<<16);
}
|
||||||
|
/* Combine two values into one word: one operand is added as-is, the other
 * shifted left by 8. WORDS_BIGENDIAN selects which of a/b is the shifted
 * one. Inputs are not masked. */
static ALWAYS_INLINE uint32_t pack8to16( uint32_t a, uint32_t b )
{
#if WORDS_BIGENDIAN
    const uint32_t lo = b, hi = a;
#else
    const uint32_t lo = a, hi = b;
#endif
    return lo + (hi<<8);
}
|
||||||
|
/* Combine four values into one 32-bit word using shifts of 0, 8, 16 and
 * 24. With WORDS_BIGENDIAN the order is d,c,b,a (a gets the largest
 * shift); otherwise it is a,b,c,d. Inputs are not masked. */
static ALWAYS_INLINE uint32_t pack8to32( uint32_t a, uint32_t b, uint32_t c, uint32_t d )
{
#if WORDS_BIGENDIAN
    const uint32_t s0 = d, s8 = c, s16 = b, s24 = a;
#else
    const uint32_t s0 = a, s8 = b, s16 = c, s24 = d;
#endif
    return s0 + (s8<<8) + (s16<<16) + (s24<<24);
}
|
||||||
|
/* Like pack16to32 but takes signed ints: the unshifted operand is masked
 * to its low 16 bits, the other is converted to uint32_t and shifted left
 * by 16. WORDS_BIGENDIAN selects which of a/b is the shifted one. */
static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
{
#if WORDS_BIGENDIAN
    const int lo = b, hi = a;
#else
    const int lo = a, hi = b;
#endif
    return (lo&0xFFFF) + ((uint32_t)hi<<16);
}
|
||||||
|
/* Combine two 32-bit values into one 64-bit word: one operand is added
 * as-is, the other widened and shifted left by 32. WORDS_BIGENDIAN
 * selects which of a/b is the shifted one. */
static ALWAYS_INLINE uint64_t pack32to64( uint32_t a, uint32_t b )
{
#if WORDS_BIGENDIAN
    const uint32_t lo = b, hi = a;
#else
    const uint32_t lo = a, hi = b;
#endif
    return lo + ((uint64_t)hi<<32);
}
|
||||||
|
|
||||||
|
/* Pixel-packing aliases: pick the pack helper whose operand width matches
 * the HIGH_BIT_DEPTH setting. */
#if !HIGH_BIT_DEPTH
#   define pack_pixel_1to2 pack8to16
#   define pack_pixel_2to4 pack16to32
#else
#   define pack_pixel_1to2 pack16to32
#   define pack_pixel_2to4 pack32to64
#endif
|
||||||
|
|
||||||
|
/* Predict the intra 4x4 mode of block idx from two cached neighbours.
 * Reads the cache entries at x264_scan8[idx]-1 and x264_scan8[idx]-8,
 * passes both through x264_mb_pred_mode4x4_fix and returns the smaller
 * result, or I_PRED_4x4_DC when that minimum is negative. */
static ALWAYS_INLINE int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
{
    const int mode_a = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 1];
    const int mode_b = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 8];
    const int pred   = X264_MIN( x264_mb_pred_mode4x4_fix(mode_a),
                                 x264_mb_pred_mode4x4_fix(mode_b) );

    return pred < 0 ? I_PRED_4x4_DC : pred;
}
|
||||||
|
/* Predict the non-zero count of block idx from two cached neighbours
 * (cache entries at x264_scan8[idx]-1 and x264_scan8[idx]-8).
 * When the sum of the two entries is below 0x80 the result is their
 * rounded-up average; otherwise the sum is used directly. The result is
 * masked to 7 bits in both cases. */
static ALWAYS_INLINE int x264_mb_predict_non_zero_code( x264_t *h, int idx )
{
    const int nz_a = h->mb.cache.non_zero_count[x264_scan8[idx] - 1];
    const int nz_b = h->mb.cache.non_zero_count[x264_scan8[idx] - 8];
    const int sum  = nz_a + nz_b;
    const int pred = sum < 0x80 ? ( sum + 1 ) >> 1 : sum;

    return pred & 0x7f;
}
|
||||||
|
|
||||||
|
/* intra and skip are disallowed, p8x8 is conditional. */
|
||||||
|
static const uint8_t x264_transform_allowed[X264_MBTYPE_MAX] =
|
||||||
|
{
|
||||||
|
0,0,0,0,1,2,0,1,1,1,1,1,1,1,1,1,1,1,0
|
||||||
|
};
|
||||||
|
|
||||||
|
/* x264_mb_transform_8x8_allowed:
|
||||||
|
* check whether any partition is smaller than 8x8 (or at least
|
||||||
|
* might be, according to just partition type.)
|
||||||
|
* doesn't check for cbp */
|
||||||
|
static ALWAYS_INLINE int x264_mb_transform_8x8_allowed( x264_t *h )
|
||||||
|
{
|
||||||
|
if( !h->pps->b_transform_8x8_mode )
|
||||||
|
return 0;
|
||||||
|
if( h->mb.i_type != P_8x8 )
|
||||||
|
return x264_transform_allowed[h->mb.i_type];
|
||||||
|
return M32( h->mb.i_sub_partition ) == D_L0_8x8*0x01010101;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
784
common/mc.c
Normal file
784
common/mc.c
Normal file
@@ -0,0 +1,784 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* mc.c: motion compensation
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2003-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
|
||||||
|
* Loren Merritt <lorenm@u.washington.edu>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#if HAVE_MMX
|
||||||
|
#include "x86/mc.h"
|
||||||
|
#endif
|
||||||
|
#if HAVE_ALTIVEC
|
||||||
|
#include "ppc/mc.h"
|
||||||
|
#endif
|
||||||
|
#if HAVE_ARMV6
|
||||||
|
#include "arm/mc.h"
|
||||||
|
#endif
|
||||||
|
#if HAVE_AARCH64
|
||||||
|
#include "aarch64/mc.h"
|
||||||
|
#endif
|
||||||
|
#if HAVE_MSA
|
||||||
|
#include "mips/mc.h"
|
||||||
|
#endif
|
||||||
|
#if HAVE_LSX
|
||||||
|
# include "loongarch/mc.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/* Rounded average of two pixel blocks:
 * dst[x] = (src1[x] + src2[x] + 1) >> 1 over i_width x i_height samples.
 * Each buffer advances by its own stride after every row. */
static inline void pixel_avg( pixel *dst,  intptr_t i_dst_stride,
                              pixel *src1, intptr_t i_src1_stride,
                              pixel *src2, intptr_t i_src2_stride, int i_width, int i_height )
{
    for( int row = 0; row < i_height; row++,
         dst += i_dst_stride, src1 += i_src1_stride, src2 += i_src2_stride )
    {
        for( int col = 0; col < i_width; col++ )
            dst[col] = ( src1[col] + src2[col] + 1 ) >> 1;
    }
}
|
||||||
|
|
||||||
|
/* Unweighted rounded average of two width x height pixel blocks:
 * dst[x] = (src1[x] + src2[x] + 1) >> 1. */
static inline void pixel_avg_wxh( pixel *dst,  intptr_t i_dst,
                                  pixel *src1, intptr_t i_src1,
                                  pixel *src2, intptr_t i_src2, int width, int height )
{
    for( int rows_left = height; rows_left > 0; rows_left-- )
    {
        for( int col = 0; col < width; col++ )
        {
            const int sum = src1[col] + src2[col] + 1;
            dst[col] = sum >> 1;
        }
        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src2;
    }
}
|
||||||
|
|
||||||
|
/* Implicit weighted bipred only:
|
||||||
|
* assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */
|
||||||
|
static inline void pixel_avg_weight_wxh( pixel *dst, intptr_t i_dst,
|
||||||
|
pixel *src1, intptr_t i_src1,
|
||||||
|
pixel *src2, intptr_t i_src2, int width, int height, int i_weight1 )
|
||||||
|
{
|
||||||
|
int i_weight2 = 64 - i_weight1;
|
||||||
|
for( int y = 0; y<height; y++, dst += i_dst, src1 += i_src1, src2 += i_src2 )
|
||||||
|
for( int x = 0; x<width; x++ )
|
||||||
|
dst[x] = x264_clip_pixel( (src1[x]*i_weight1 + src2[x]*i_weight2 + (1<<5)) >> 6 );
|
||||||
|
}
|
||||||
|
#undef op_scale2
|
||||||
|
|
||||||
|
#define PIXEL_AVG_C( name, width, height ) \
|
||||||
|
static void name( pixel *pix1, intptr_t i_stride_pix1, \
|
||||||
|
pixel *pix2, intptr_t i_stride_pix2, \
|
||||||
|
pixel *pix3, intptr_t i_stride_pix3, int weight ) \
|
||||||
|
{ \
|
||||||
|
if( weight == 32 ) \
|
||||||
|
pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
|
||||||
|
else \
|
||||||
|
pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, weight ); \
|
||||||
|
}
|
||||||
|
PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
|
||||||
|
PIXEL_AVG_C( pixel_avg_16x8, 16, 8 )
|
||||||
|
PIXEL_AVG_C( pixel_avg_8x16, 8, 16 )
|
||||||
|
PIXEL_AVG_C( pixel_avg_8x8, 8, 8 )
|
||||||
|
PIXEL_AVG_C( pixel_avg_8x4, 8, 4 )
|
||||||
|
PIXEL_AVG_C( pixel_avg_4x16, 4, 16 )
|
||||||
|
PIXEL_AVG_C( pixel_avg_4x8, 4, 8 )
|
||||||
|
PIXEL_AVG_C( pixel_avg_4x4, 4, 4 )
|
||||||
|
PIXEL_AVG_C( pixel_avg_4x2, 4, 2 )
|
||||||
|
PIXEL_AVG_C( pixel_avg_2x8, 2, 8 )
|
||||||
|
PIXEL_AVG_C( pixel_avg_2x4, 2, 4 )
|
||||||
|
PIXEL_AVG_C( pixel_avg_2x2, 2, 2 )
|
||||||
|
|
||||||
|
/* Point the weight descriptor's weightfn at the weight function table
 * currently held in h->mc. */
static void weight_cache( x264_t *h, x264_weight_t *w )
{
    w->weightfn = h->mc.weight;
}
|
||||||
|
#define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * scale + (1<<(denom - 1))) >> denom) + offset )
|
||||||
|
#define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * scale + offset )
|
||||||
|
static void mc_weight( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
|
||||||
|
const x264_weight_t *weight, int i_width, int i_height )
|
||||||
|
{
|
||||||
|
int offset = weight->i_offset * (1 << (BIT_DEPTH-8));
|
||||||
|
int scale = weight->i_scale;
|
||||||
|
int denom = weight->i_denom;
|
||||||
|
if( denom >= 1 )
|
||||||
|
{
|
||||||
|
for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
|
||||||
|
for( int x = 0; x < i_width; x++ )
|
||||||
|
opscale( x );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
|
||||||
|
for( int x = 0; x < i_width; x++ )
|
||||||
|
opscale_noden( x );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#define MC_WEIGHT_C( name, width ) \
|
||||||
|
static void name( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, const x264_weight_t *weight, int height ) \
|
||||||
|
{ \
|
||||||
|
mc_weight( dst, i_dst_stride, src, i_src_stride, weight, width, height );\
|
||||||
|
}
|
||||||
|
|
||||||
|
MC_WEIGHT_C( mc_weight_w20, 20 )
|
||||||
|
MC_WEIGHT_C( mc_weight_w16, 16 )
|
||||||
|
MC_WEIGHT_C( mc_weight_w12, 12 )
|
||||||
|
MC_WEIGHT_C( mc_weight_w8, 8 )
|
||||||
|
MC_WEIGHT_C( mc_weight_w4, 4 )
|
||||||
|
MC_WEIGHT_C( mc_weight_w2, 2 )
|
||||||
|
|
||||||
|
static weight_fn_t mc_weight_wtab[6] =
|
||||||
|
{
|
||||||
|
mc_weight_w2,
|
||||||
|
mc_weight_w4,
|
||||||
|
mc_weight_w8,
|
||||||
|
mc_weight_w12,
|
||||||
|
mc_weight_w16,
|
||||||
|
mc_weight_w20,
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Copy an i_width x i_height block of pixels row by row.
 * Note the argument order: the source comes first, the destination second. */
static void mc_copy( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, int i_width, int i_height )
{
    for( int rows_left = i_height; rows_left > 0; rows_left-- )
    {
        memcpy( dst, src, i_width * SIZEOF_PIXEL );
        dst += i_dst_stride;
        src += i_src_stride;
    }
}
|
||||||
|
|
||||||
|
#define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d]))
|
||||||
|
static void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
|
||||||
|
intptr_t stride, int width, int height, int16_t *buf )
|
||||||
|
{
|
||||||
|
const int pad = (BIT_DEPTH > 9) ? (-10 * PIXEL_MAX) : 0;
|
||||||
|
for( int y = 0; y < height; y++ )
|
||||||
|
{
|
||||||
|
for( int x = -2; x < width+3; x++ )
|
||||||
|
{
|
||||||
|
int v = TAPFILTER(src,stride);
|
||||||
|
dstv[x] = x264_clip_pixel( (v + 16) >> 5 );
|
||||||
|
/* transform v for storage in a 16-bit integer */
|
||||||
|
buf[x+2] = v + pad;
|
||||||
|
}
|
||||||
|
for( int x = 0; x < width; x++ )
|
||||||
|
dstc[x] = x264_clip_pixel( (TAPFILTER(buf+2,1) - 32*pad + 512) >> 10 );
|
||||||
|
for( int x = 0; x < width; x++ )
|
||||||
|
dsth[x] = x264_clip_pixel( (TAPFILTER(src,1) + 16) >> 5 );
|
||||||
|
dsth += stride;
|
||||||
|
dstv += stride;
|
||||||
|
dstc += stride;
|
||||||
|
src += stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Luma motion compensation into dst.
 * The low two bits of mvx/mvy form an index into x264_hpel_ref0/ref1 that
 * selects which of the four src planes to read; the remaining bits give
 * the integer offset. When (index & 5) is non-zero two planes are
 * averaged, otherwise a single plane is used. The optional weight is
 * applied whenever weight->weightfn is set. */
static void mc_luma( pixel *dst,    intptr_t i_dst_stride,
                     pixel *src[4], intptr_t i_src_stride,
                     int mvx, int mvy,
                     int i_width, int i_height, const x264_weight_t *weight )
{
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
    pixel *ref_a = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;

    if( !(qpel_idx & 5) )
    {
        /* single-plane position: weight straight from the source, or copy */
        if( weight->weightfn )
            mc_weight( dst, i_dst_stride, ref_a, i_src_stride, weight, i_width, i_height );
        else
            mc_copy( ref_a, i_src_stride, dst, i_dst_stride, i_width, i_height );
        return;
    }

    /* qpel interpolation needed */
    pixel *ref_b = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
    pixel_avg( dst, i_dst_stride, ref_a, i_src_stride,
               ref_b, i_src_stride, i_width, i_height );
    if( weight->weightfn )
        mc_weight( dst, i_dst_stride, dst, i_dst_stride, weight, i_width, i_height );
}
|
||||||
|
|
||||||
|
/* Like mc_luma, but avoids the copy when nothing has to be computed.
 * Returns dst (filled with the averaged and/or weighted block) when
 * interpolation or weighting is needed. Otherwise it returns a pointer
 * straight into the selected src plane and overwrites *i_dst_stride with
 * i_src_stride, so the caller must use the returned pointer together
 * with the updated stride. */
static pixel *get_ref( pixel *dst,    intptr_t *i_dst_stride,
                       pixel *src[4], intptr_t i_src_stride,
                       int mvx, int mvy,
                       int i_width, int i_height, const x264_weight_t *weight )
{
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
    pixel *ref_a = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;

    if( qpel_idx & 5 )
    {
        /* qpel interpolation needed */
        pixel *ref_b = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
        pixel_avg( dst, *i_dst_stride, ref_a, i_src_stride,
                   ref_b, i_src_stride, i_width, i_height );
        if( weight->weightfn )
            mc_weight( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_width, i_height );
        return dst;
    }

    if( weight->weightfn )
    {
        mc_weight( dst, *i_dst_stride, ref_a, i_src_stride, weight, i_width, i_height );
        return dst;
    }

    /* nothing to compute: hand back the reference plane itself */
    *i_dst_stride = i_src_stride;
    return ref_a;
}
|
||||||
|
|
||||||
|
/* full chroma mc (ie until 1/8 pixel)*/
|
||||||
|
static void mc_chroma( pixel *dstu, pixel *dstv, intptr_t i_dst_stride,
|
||||||
|
pixel *src, intptr_t i_src_stride,
|
||||||
|
int mvx, int mvy,
|
||||||
|
int i_width, int i_height )
|
||||||
|
{
|
||||||
|
pixel *srcp;
|
||||||
|
|
||||||
|
int d8x = mvx&0x07;
|
||||||
|
int d8y = mvy&0x07;
|
||||||
|
int cA = (8-d8x)*(8-d8y);
|
||||||
|
int cB = d8x *(8-d8y);
|
||||||
|
int cC = (8-d8x)*d8y;
|
||||||
|
int cD = d8x *d8y;
|
||||||
|
|
||||||
|
src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
|
||||||
|
srcp = &src[i_src_stride];
|
||||||
|
|
||||||
|
for( int y = 0; y < i_height; y++ )
|
||||||
|
{
|
||||||
|
for( int x = 0; x < i_width; x++ )
|
||||||
|
{
|
||||||
|
dstu[x] = ( cA*src[2*x] + cB*src[2*x+2] +
|
||||||
|
cC*srcp[2*x] + cD*srcp[2*x+2] + 32 ) >> 6;
|
||||||
|
dstv[x] = ( cA*src[2*x+1] + cB*src[2*x+3] +
|
||||||
|
cC*srcp[2*x+1] + cD*srcp[2*x+3] + 32 ) >> 6;
|
||||||
|
}
|
||||||
|
dstu += i_dst_stride;
|
||||||
|
dstv += i_dst_stride;
|
||||||
|
src = srcp;
|
||||||
|
srcp += i_src_stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#define MC_COPY(W) \
|
||||||
|
static void mc_copy_w##W( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int i_height ) \
|
||||||
|
{ \
|
||||||
|
mc_copy( src, i_src, dst, i_dst, W, i_height ); \
|
||||||
|
}
|
||||||
|
MC_COPY( 16 )
|
||||||
|
MC_COPY( 8 )
|
||||||
|
MC_COPY( 4 )
|
||||||
|
|
||||||
|
/* Copy h rows of w pixels from src to dst; each buffer advances by its
 * own stride after every row. */
void x264_plane_copy_c( pixel *dst, intptr_t i_dst,
                        pixel *src, intptr_t i_src, int w, int h )
{
    for( ; h != 0; h-- )
    {
        memcpy( dst, src, w * SIZEOF_PIXEL );
        dst += i_dst;
        src += i_src;
    }
}
|
||||||
|
|
||||||
|
/* Copy h rows of w sample pairs from src to dst, swapping the two samples
 * of every pair. */
void x264_plane_copy_swap_c( pixel *dst, intptr_t i_dst,
                             pixel *src, intptr_t i_src, int w, int h )
{
    for( int row = 0; row < h; row++ )
    {
        for( int x = 0; x < 2*w; x += 2 )
        {
            pixel *out = &dst[x];
            const pixel *in = &src[x];
            out[0] = in[1];
            out[1] = in[0];
        }
        dst += i_dst;
        src += i_src;
    }
}
|
||||||
|
|
||||||
|
/* Interleave two w x h planes into one: srcu supplies the even samples of
 * each dst row, srcv the odd ones. */
void x264_plane_copy_interleave_c( pixel *dst,  intptr_t i_dst,
                                   pixel *srcu, intptr_t i_srcu,
                                   pixel *srcv, intptr_t i_srcv, int w, int h )
{
    for( int row = 0; row < h; row++ )
    {
        for( int x = 0; x < w; x++ )
        {
            dst[2*x]   = srcu[x];
            dst[2*x+1] = srcv[x];
        }
        dst  += i_dst;
        srcu += i_srcu;
        srcv += i_srcv;
    }
}
|
||||||
|
|
||||||
|
/* Split an interleaved plane into two w x h planes: even samples of each
 * src row go to dsta, odd samples to dstb. */
void x264_plane_copy_deinterleave_c( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
                                     pixel *src,  intptr_t i_src, int w, int h )
{
    for( int row = 0; row < h; row++ )
    {
        for( int x = 0; x < w; x++ )
        {
            dsta[x] = src[2*x];
            dstb[x] = src[2*x+1];
        }
        dsta += i_dsta;
        dstb += i_dstb;
        src  += i_src;
    }
}
|
||||||
|
|
||||||
|
/* Split packed samples into three w x h planes. pw is the distance in
 * samples between consecutive packed pixels in src; the first three
 * samples of each packed pixel go to dsta, dstb and dstc respectively. */
static void plane_copy_deinterleave_rgb_c( pixel *dsta, intptr_t i_dsta,
                                           pixel *dstb, intptr_t i_dstb,
                                           pixel *dstc, intptr_t i_dstc,
                                           pixel *src,  intptr_t i_src, int pw, int w, int h )
{
    for( int row = 0; row < h; row++ )
    {
        for( int x = 0; x < w; x++ )
        {
            dsta[x] = src[x*pw];
            dstb[x] = src[x*pw+1];
            dstc[x] = src[x*pw+2];
        }
        dsta += i_dsta;
        dstb += i_dstb;
        dstc += i_dstc;
        src  += i_src;
    }
}
|
||||||
|
|
||||||
|
/* v210_endian_fix32: reverses the byte order of a 32-bit word when
 * WORDS_BIGENDIAN is set and is the identity otherwise. */
#if !WORDS_BIGENDIAN
#define v210_endian_fix32(x) (x)
#else
static ALWAYS_INLINE uint32_t v210_endian_fix32( uint32_t x )
{
    const uint32_t b0_to_b3 = x<<24;
    const uint32_t b1_to_b2 = (x<<8)&0xff0000;
    const uint32_t b2_to_b1 = (x>>8)&0xff00;
    const uint32_t b3_to_b0 = x>>24;
    return b0_to_b3 + b1_to_b2 + b2_to_b1 + b3_to_b0;
}
#endif
|
||||||
|
|
||||||
|
/* Unpack 32-bit words that each carry three 10-bit samples (bits 0-9,
 * 10-19, 20-29) into dsty and dstc. Every pair of words yields
 * three dsty samples and three dstc samples, in the order
 * c, y, c | y, c, y. The inner loop advances n by 3 per word pair. */
static void plane_copy_deinterleave_v210_c( pixel *dsty, intptr_t i_dsty,
                                            pixel *dstc, intptr_t i_dstc,
                                            uint32_t *src, intptr_t i_src, int w, int h )
{
    for( int l = 0; l < h; l++, dsty += i_dsty, dstc += i_dstc, src += i_src )
    {
        pixel *y_out = dsty;
        pixel *c_out = dstc;
        uint32_t *in = src;

        for( int n = 0; n < w; n += 3 )
        {
            uint32_t s = v210_endian_fix32( *in++ );
            *c_out++ = s & 0x03FF;
            *y_out++ = (s >> 10) & 0x03FF;
            *c_out++ = (s >> 20) & 0x03FF;

            s = v210_endian_fix32( *in++ );
            *y_out++ = s & 0x03FF;
            *c_out++ = (s >> 10) & 0x03FF;
            *y_out++ = (s >> 20) & 0x03FF;
        }
    }
}
|
||||||
|
|
||||||
|
/* Interleave two 8-pixel-wide blocks (both with stride FDEC_STRIDE) into dst
 * as srcu[0], srcv[0], srcu[1], srcv[1], ... for `height` rows. */
static void store_interleave_chroma( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height )
{
    for( int row = 0; row < height; row++ )
    {
        pixel *out = dst;
        for( int col = 0; col < 8; col++ )
        {
            *out++ = srcu[col];
            *out++ = srcv[col];
        }
        dst  += i_dst;
        srcu += FDEC_STRIDE;
        srcv += FDEC_STRIDE;
    }
}
|
||||||
|
|
||||||
|
/* Deinterleave an 8-pair-wide block from src into two halves of a
 * FENC_STRIDE-wide buffer: even samples at dst, odd samples at dst+FENC_STRIDE/2. */
static void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
{
    pixel *dst_a = dst;
    pixel *dst_b = dst + FENC_STRIDE/2;
    x264_plane_copy_deinterleave_c( dst_a, FENC_STRIDE, dst_b, FENC_STRIDE, src, i_src, 8, height );
}
|
||||||
|
|
||||||
|
/* Deinterleave an 8-pair-wide block from src into two halves of a
 * FDEC_STRIDE-wide buffer: even samples at dst, odd samples at dst+FDEC_STRIDE/2. */
static void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
{
    pixel *dst_a = dst;
    pixel *dst_b = dst + FDEC_STRIDE/2;
    x264_plane_copy_deinterleave_c( dst_a, FDEC_STRIDE, dst_b, FDEC_STRIDE, src, i_src, 8, height );
}
|
||||||
|
|
||||||
|
/* No-op stand-in for the fenc prefetch hooks: all arguments are ignored. */
static void prefetch_fenc_null( pixel *pix_y, intptr_t stride_y,
                                pixel *pix_uv, intptr_t stride_uv, int mb_x )
{
    /* intentionally empty */
}
|
||||||
|
|
||||||
|
/* No-op stand-in for the reference prefetch hook: all arguments are ignored. */
static void prefetch_ref_null( pixel *pix, intptr_t stride, int parity )
{
    /* intentionally empty */
}
|
||||||
|
|
||||||
|
/* Portable implementation of the memzero hook: clears n bytes starting at dst.
 * The result of memset is deliberately discarded. */
static void memzero_aligned( void * dst, size_t n )
{
    (void)memset( dst, 0, n );
}
|
||||||
|
|
||||||
|
/* Horizontal pass of the integral image with a 4-pixel window:
 * sum[x] = (pix[x] + ... + pix[x+3]) + sum[x-stride], truncated to 16 bits,
 * for x in [0, stride-4). The window sum is updated incrementally. */
static void integral_init4h( uint16_t *sum, pixel *pix, intptr_t stride )
{
    int window = 0;
    for( int k = 0; k < 4; k++ )
        window += pix[k];

    for( int x = 0; x < stride-4; x++ )
    {
        sum[x] = (uint16_t)(window + sum[x-stride]);
        /* slide the window one pixel to the right */
        window += pix[x+4] - pix[x];
    }
}
|
||||||
|
|
||||||
|
/* Horizontal pass of the integral image with an 8-pixel window:
 * sum[x] = (pix[x] + ... + pix[x+7]) + sum[x-stride], truncated to 16 bits,
 * for x in [0, stride-8). The window sum is updated incrementally. */
static void integral_init8h( uint16_t *sum, pixel *pix, intptr_t stride )
{
    int window = 0;
    for( int k = 0; k < 8; k++ )
        window += pix[k];

    for( int x = 0; x < stride-8; x++ )
    {
        sum[x] = (uint16_t)(window + sum[x-stride]);
        /* slide the window one pixel to the right */
        window += pix[x+8] - pix[x];
    }
}
|
||||||
|
|
||||||
|
/* Vertical pass for the 4-wide horizontal sums held in sum8:
 *  - sum4[x] receives the difference between the rows 4*stride apart;
 *  - sum8[x] is then overwritten with the difference between rows 8*stride
 *    apart, combining columns x and x+4.
 * Both passes cover x in [0, stride-8) and results wrap to 16 bits.
 * The first pass must finish before the second starts overwriting sum8. */
static void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
{
    for( int x = 0; x < stride-8; x++ )
    {
        int upper = sum8[x];
        int lower = sum8[x+4*stride];
        sum4[x] = (uint16_t)(lower - upper);
    }

    for( int x = 0; x < stride-8; x++ )
    {
        int lower = sum8[x+8*stride] + sum8[x+8*stride+4];
        int upper = sum8[x] + sum8[x+4];
        sum8[x] = (uint16_t)(lower - upper);
    }
}
|
||||||
|
|
||||||
|
/* Vertical pass for 8-wide sums: replace sum8[x] with the difference between
 * the entry 8*stride further on and itself, wrapped to 16 bits,
 * for x in [0, stride-8). */
static void integral_init8v( uint16_t *sum8, intptr_t stride )
{
    for( int x = 0; x < stride-8; x++ )
    {
        int upper = sum8[x];
        int lower = sum8[x+8*stride];
        sum8[x] = (uint16_t)(lower - upper);
    }
}
|
||||||
|
|
||||||
|
/* Build the four lowres planes of a frame from its first plane and reset the
 * per-frame lookahead caches (cost estimates, row SATDs, lowres MVs) to
 * their "not computed" markers. */
void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
{
    pixel *luma      = frame->plane[0];
    const int stride = frame->i_stride[0];
    const int lines  = frame->i_lines[0];
    const int width  = frame->i_width[0];
    const int bframes = h->param.i_bframe;

    // duplicate last row and column so that their interpolation doesn't have to be special-cased
    for( int y = 0; y < lines; y++ )
    {
        pixel *row = luma + y*stride;
        row[width] = row[width-1];
    }
    memcpy( luma+stride*lines, luma+stride*(lines-1), (width+1) * SIZEOF_PIXEL );

    h->mc.frame_init_lowres_core( luma, frame->lowres[0], frame->lowres[1], frame->lowres[2], frame->lowres[3],
                                  stride, frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres );
    x264_frame_expand_border_lowres( frame );

    /* every byte set to 0xFF, i.e. -1 for each cost entry */
    memset( frame->i_cost_est, -1, sizeof(frame->i_cost_est) );

    for( int y = 0; y < bframes + 2; y++ )
        for( int x = 0; x < bframes + 2; x++ )
            frame->i_row_satds[y][x][0] = -1;

    /* only list 0 is touched when B-frames are disabled */
    const int last_list = bframes != 0;
    for( int y = 0; y <= last_list; y++ )
        for( int x = 0; x <= bframes; x++ )
            frame->lowres_mvs[y][x][0][0] = 0x7FFF;
}
|
||||||
|
|
||||||
|
/* Downsample src0 by two in each direction into four planes:
 *   dst0 - aligned with the source grid,
 *   dsth - shifted one source pixel to the right,
 *   dstv - shifted one source row down,
 *   dstc - shifted both right and down.
 * Each output is a rounded average of a 2x2 source neighbourhood, computed as
 * two vertical averages followed by a horizontal one. */
// slower than naive bilinear, but matches asm
#define LOWRES_AVG(a,b,c,d) (((((a)+(b)+1)>>1)+(((c)+(d)+1)>>1)+1)>>1)
static void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
                                    intptr_t src_stride, intptr_t dst_stride, int width, int height )
{
    for( int y = 0; y < height; y++ )
    {
        pixel *src1 = src0 + src_stride;
        pixel *src2 = src1 + src_stride;

        for( int x = 0; x < width; x++ )
        {
            const int c = 2*x;   /* left column of the 2x2 source window */
            dst0[x] = LOWRES_AVG( src0[c  ], src1[c  ], src0[c+1], src1[c+1] );
            dsth[x] = LOWRES_AVG( src0[c+1], src1[c+1], src0[c+2], src1[c+2] );
            dstv[x] = LOWRES_AVG( src1[c  ], src2[c  ], src1[c+1], src2[c+1] );
            dstc[x] = LOWRES_AVG( src1[c+1], src2[c+1], src1[c+2], src2[c+2] );
        }

        src0 += src_stride*2;
        dst0 += dst_stride;
        dsth += dst_stride;
        dstv += dst_stride;
        dstc += dst_stride;
    }
}
#undef LOWRES_AVG
|
||||||
|
|
||||||
|
/* Estimate the total amount of influence on future quality that could be had if we
|
||||||
|
* were to improve the reference samples used to inter predict any given macroblock. */
|
||||||
|
static void mbtree_propagate_cost( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
|
||||||
|
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
|
||||||
|
{
|
||||||
|
float fps = *fps_factor;
|
||||||
|
for( int i = 0; i < len; i++ )
|
||||||
|
{
|
||||||
|
int intra_cost = intra_costs[i];
|
||||||
|
int inter_cost = X264_MIN(intra_costs[i], inter_costs[i] & LOWRES_COST_MASK);
|
||||||
|
float propagate_intra = intra_cost * inv_qscales[i];
|
||||||
|
float propagate_amount = propagate_in[i] + propagate_intra*fps;
|
||||||
|
float propagate_num = intra_cost - inter_cost;
|
||||||
|
float propagate_denom = intra_cost;
|
||||||
|
dst[i] = X264_MIN((int)(propagate_amount * propagate_num / propagate_denom + 0.5f), 32767);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Distribute the propagate amount of every macroblock in one row onto the up
 * to four reference macroblocks its motion vector overlaps, using bilinear
 * weights derived from the low 5 bits of each MV component. Entries whose
 * lowres cost does not flag `list` are skipped. Additions saturate through
 * MC_CLIP_ADD. */
static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
                                   int16_t *propagate_amount, uint16_t *lowres_costs,
                                   int bipred_weight, int mb_y, int len, int list )
{
    unsigned stride = h->mb.i_mb_stride;
    unsigned width  = h->mb.i_mb_width;
    unsigned height = h->mb.i_mb_height;

    for( int i = 0; i < len; i++ )
    {
        int lists_used = lowres_costs[i]>>LOWRES_COST_SHIFT;
        if( !(lists_used & (1 << list)) )
            continue;

        int amount = propagate_amount[i];
        /* Apply bipred weighting. */
        if( lists_used == 3 )
            amount = (amount * bipred_weight + 32) >> 6;

        /* Early termination for simple case of mv0. */
        if( !M32( mvs[i] ) )
        {
            MC_CLIP_ADD( ref_costs[mb_y*stride + i], amount );
            continue;
        }

        int mvx = mvs[i][0];
        int mvy = mvs[i][1];

        /* integer part selects the top-left target macroblock */
        unsigned mbx  = (unsigned)((mvx>>5)+i);
        unsigned mby  = (unsigned)((mvy>>5)+mb_y);
        unsigned idx0 = mbx + mby * stride;
        unsigned idx2 = idx0 + stride;

        /* fractional part (0..31) gives the bilinear split */
        int fx = mvx & 31;
        int fy = mvy & 31;
        int w_tl = ((32-fy)*(32-fx) * amount + 512) >> 10;
        int w_tr = ((32-fy)*fx      * amount + 512) >> 10;
        int w_bl = (fy*(32-fx)      * amount + 512) >> 10;
        int w_br = (fy*fx           * amount + 512) >> 10;

        if( mbx < width-1 && mby < height-1 )
        {
            MC_CLIP_ADD( ref_costs[idx0+0], w_tl );
            MC_CLIP_ADD( ref_costs[idx0+1], w_tr );
            MC_CLIP_ADD( ref_costs[idx2+0], w_bl );
            MC_CLIP_ADD( ref_costs[idx2+1], w_br );
            continue;
        }

        /* Note: this takes advantage of unsigned representation to
         * catch negative mbx/mby. */
        int top_ok    = mby   < height;
        int bottom_ok = mby+1 < height;
        int left_ok   = mbx   < width;
        int right_ok  = mbx+1 < width;

        if( top_ok && left_ok )
            MC_CLIP_ADD( ref_costs[idx0+0], w_tl );
        if( top_ok && right_ok )
            MC_CLIP_ADD( ref_costs[idx0+1], w_tr );
        if( bottom_ok && left_ok )
            MC_CLIP_ADD( ref_costs[idx2+0], w_bl );
        if( bottom_ok && right_ok )
            MC_CLIP_ADD( ref_costs[idx2+1], w_br );
    }
}
|
||||||
|
|
||||||
|
/* Conversion between float and Q8.8 fixed point (big-endian) for storage */
/* Pack: scale each float by 256, truncate to int16_t and pass it through
 * endian_fix16 before storing. */
static void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
{
    for( int i = 0; i < count; i++ )
    {
        int16_t fixed = (int16_t)(src[i] * 256.0f);
        dst[i] = endian_fix16( fixed );
    }
}
|
||||||
|
|
||||||
|
/* Unpack: pass each stored word through endian_fix16, reinterpret it as a
 * signed 16-bit value and scale by 1/256 to recover the float. */
static void mbtree_fix8_unpack( float *dst, uint16_t *src, int count )
{
    for( int i = 0; i < count; i++ )
    {
        int16_t fixed = (int16_t)endian_fix16( src[i] );
        dst[i] = fixed * (1.0f/256.0f);
    }
}
|
||||||
|
|
||||||
|
/* Populate the motion-compensation function table.
 *
 * 1. Every entry assigned here is first pointed at its portable C implementation.
 * 2. Each architecture back-end compiled in (HAVE_*) may then override entries.
 * 3. If cpu_independent is set, the two mbtree propagate entries are forced
 *    back to the C versions after the back-ends have run. */
void x264_mc_init( uint32_t cpu, x264_mc_functions_t *pf, int cpu_independent )
{
    /* --- motion compensation --- */
    pf->mc_luma   = mc_luma;
    pf->get_ref   = get_ref;
    pf->mc_chroma = mc_chroma;

    /* --- bi-directional averaging, one entry per partition size --- */
    pf->avg[PIXEL_16x16] = pixel_avg_16x16;
    pf->avg[PIXEL_16x8]  = pixel_avg_16x8;
    pf->avg[PIXEL_8x16]  = pixel_avg_8x16;
    pf->avg[PIXEL_8x8]   = pixel_avg_8x8;
    pf->avg[PIXEL_8x4]   = pixel_avg_8x4;
    pf->avg[PIXEL_4x16]  = pixel_avg_4x16;
    pf->avg[PIXEL_4x8]   = pixel_avg_4x8;
    pf->avg[PIXEL_4x4]   = pixel_avg_4x4;
    pf->avg[PIXEL_4x2]   = pixel_avg_4x2;
    pf->avg[PIXEL_2x8]   = pixel_avg_2x8;
    pf->avg[PIXEL_2x4]   = pixel_avg_2x4;
    pf->avg[PIXEL_2x2]   = pixel_avg_2x2;

    /* --- weighted prediction: the C build shares one table for all three --- */
    pf->weight       = mc_weight_wtab;
    pf->offsetadd    = mc_weight_wtab;
    pf->offsetsub    = mc_weight_wtab;
    pf->weight_cache = weight_cache;

    /* --- block copies --- */
    pf->copy_16x16_unaligned = mc_copy_w16;
    pf->copy[PIXEL_16x16]    = mc_copy_w16;
    pf->copy[PIXEL_8x8]      = mc_copy_w8;
    pf->copy[PIXEL_4x4]      = mc_copy_w4;

    /* --- chroma (de)interleaving for macroblock buffers --- */
    pf->store_interleave_chroma       = store_interleave_chroma;
    pf->load_deinterleave_chroma_fenc = load_deinterleave_chroma_fenc;
    pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec;

    /* --- whole-plane copies --- */
    pf->plane_copy            = x264_plane_copy_c;
    pf->plane_copy_swap       = x264_plane_copy_swap_c;
    pf->plane_copy_interleave = x264_plane_copy_interleave_c;

    pf->plane_copy_deinterleave      = x264_plane_copy_deinterleave_c;
    pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_c;
    pf->plane_copy_deinterleave_rgb  = plane_copy_deinterleave_rgb_c;
    pf->plane_copy_deinterleave_v210 = plane_copy_deinterleave_v210_c;

    /* --- half-pel interpolation --- */
    pf->hpel_filter = hpel_filter;

    /* --- prefetch hooks are no-ops in C --- */
    pf->prefetch_fenc_400 = prefetch_fenc_null;
    pf->prefetch_fenc_420 = prefetch_fenc_null;
    pf->prefetch_fenc_422 = prefetch_fenc_null;
    pf->prefetch_ref      = prefetch_ref_null;

    /* --- memory helpers and lowres generation --- */
    pf->memcpy_aligned         = memcpy;
    pf->memzero_aligned        = memzero_aligned;
    pf->frame_init_lowres_core = frame_init_lowres_core;

    /* --- integral image --- */
    pf->integral_init4h = integral_init4h;
    pf->integral_init8h = integral_init8h;
    pf->integral_init4v = integral_init4v;
    pf->integral_init8v = integral_init8v;

    /* --- mbtree --- */
    pf->mbtree_propagate_cost = mbtree_propagate_cost;
    pf->mbtree_propagate_list = mbtree_propagate_list;
    pf->mbtree_fix8_pack      = mbtree_fix8_pack;
    pf->mbtree_fix8_unpack    = mbtree_fix8_unpack;

    /* --- architecture-specific overrides --- */
#if HAVE_MMX
    x264_mc_init_mmx( cpu, pf );
#endif
#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
        x264_mc_init_altivec( pf );
#endif
#if HAVE_ARMV6
    x264_mc_init_arm( cpu, pf );
#endif
#if HAVE_AARCH64
    x264_mc_init_aarch64( cpu, pf );
#endif
#if HAVE_MSA
    if( cpu&X264_CPU_MSA )
        x264_mc_init_mips( cpu, pf );
#endif
#if HAVE_LSX
    x264_mc_init_loongarch( cpu, pf );
#endif

    /* undo any back-end override of the propagate functions */
    if( cpu_independent )
    {
        pf->mbtree_propagate_cost = mbtree_propagate_cost;
        pf->mbtree_propagate_list = mbtree_propagate_list;
    }
}
|
||||||
|
|
||||||
|
/* Run the half-pel filter over a band of rows of `frame` and, when the frame
 * carries an integral image, extend that image over the same band.
 *
 * mb_y  - macroblock row the band is derived from
 * b_end - non-zero when the band runs to the bottom of the frame
 *
 * NOTE: `start` is deliberately shared between the plane loop and the
 * integral-image section; the interlaced branch reassigns it and later code
 * observes the new value. */
void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
{
    const int b_interlaced = PARAM_INTERLACED;
    int start = mb_y*16 - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
    int height = (b_end ? frame->i_lines[0] + 16*PARAM_INTERLACED : (mb_y+b_interlaced)*16) + 8;

    /* in interlaced mode odd macroblock rows do nothing */
    if( mb_y & b_interlaced )
        return;

    for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
    {
        int stride = frame->i_stride[p];
        const int width = frame->i_width[p];
        int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd

        /* frame (progressive) filtering */
        if( !b_interlaced || h->mb.b_adaptive_mbaff )
        {
            h->mc.hpel_filter( frame->filtered[p][1] + offs,
                               frame->filtered[p][2] + offs,
                               frame->filtered[p][3] + offs,
                               frame->plane[p] + offs,
                               stride, width + 16, height - start,
                               h->scratch_buffer );
        }

        if( !b_interlaced )
            continue;

        /* MC must happen between pixels in the same field. */
        stride = frame->i_stride[p] << 1;
        start = (mb_y*16 >> 1) - 8;
        int height_fld = ((b_end ? frame->i_lines[p] : mb_y*16) >> 1) + 8;
        offs = start*stride - 8;

        /* one pass per field; the second field starts one frame row lower */
        for( int i = 0; i < 2; i++, offs += frame->i_stride[p] )
        {
            h->mc.hpel_filter( frame->filtered_fld[p][1] + offs,
                               frame->filtered_fld[p][2] + offs,
                               frame->filtered_fld[p][3] + offs,
                               frame->plane_fld[p] + offs,
                               stride, width + 16, height_fld - start,
                               h->scratch_buffer );
        }
    }

    /* generate integral image:
     * frame->integral contains 2 planes. in the upper plane, each element is
     * the sum of an 8x8 pixel region with top-left corner on that point.
     * in the lower plane, 4x4 sums (needed only with --partitions p4x4). */
    if( !frame->integral )
        return;

    int stride = frame->i_stride[0];

    if( start < 0 )
    {
        /* clear the row above the first one that gets accumulated */
        memset( frame->integral - PADV * stride - PADH_ALIGN, 0, stride * sizeof(uint16_t) );
        start = -PADV;
    }
    if( b_end )
        height += PADV-9;

    for( int y = start; y < height; y++ )
    {
        pixel    *pix  = frame->plane[0] + y * stride - PADH_ALIGN;
        uint16_t *sum8 = frame->integral + (y+1) * stride - PADH_ALIGN;
        uint16_t *sum4;

        if( h->frames.b_have_sub8x8_esa )
        {
            h->mc.integral_init4h( sum8, pix, stride );
            sum8 -= 8*stride;
            sum4 = sum8 + stride * (frame->i_lines[0] + PADV*2);
            /* vertical pass needs 8 completed rows above */
            if( y >= 8-PADV )
                h->mc.integral_init4v( sum8, sum4, stride );
        }
        else
        {
            h->mc.integral_init8h( sum8, pix, stride );
            /* vertical pass needs 8 completed rows above */
            if( y >= 8-PADV )
                h->mc.integral_init8v( sum8-8*stride, stride );
        }
    }
}
|
||||||
345
common/mc.h
Normal file
345
common/mc.h
Normal file
@@ -0,0 +1,345 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* mc.h: motion compensation
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2004-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_MC_H
|
||||||
|
#define X264_MC_H
|
||||||
|
|
||||||
|
/* Saturating accumulate: s += x, clamped to at most 32767.
 * Note that s is evaluated twice. */
#define MC_CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)

/* Apply MC_CLIP_ADD to the first two elements of s using the first two of x. */
#define MC_CLIP_ADD2(s,x)\
do\
{\
    MC_CLIP_ADD((s)[0], (x)[0]);\
    MC_CLIP_ADD((s)[1], (x)[1]);\
} while( 0 )
|
||||||
|
|
||||||
|
#define x264_mbtree_propagate_list_internal_neon x264_template(mbtree_propagate_list_internal_neon)

/* PROPAGATE_LIST(cpu) declares the external routine
 * x264_mbtree_propagate_list_internal_<cpu>() and defines the static wrapper
 * mbtree_propagate_list_<cpu>() around it.
 *
 * The internal routine fills h->scratch_buffer2; the wrapper then reads it in
 * groups of 8 macroblocks, 48 int16_t per group:
 *   [ 0..15]  (mbx, mby) pair per macroblock
 *   [16..31]  the two weights applied at idx0 and idx0+1
 *   [32..47]  the two weights applied at idx2 and idx2+1
 * so macroblock i lives at offset (i>>3)*48 + (i&7)*2. */
#define PROPAGATE_LIST(cpu)\
void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2], int16_t *propagate_amount,\
                                                uint16_t *lowres_costs, int16_t *output,\
                                                int bipred_weight, int mb_y, int len );\
\
static void mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],\
                                         int16_t *propagate_amount, uint16_t *lowres_costs,\
                                         int bipred_weight, int mb_y, int len, int list )\
{\
    int16_t *scratch = h->scratch_buffer2;\
\
    x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount, lowres_costs,\
                                               scratch, bipred_weight, mb_y, len );\
\
    unsigned stride = h->mb.i_mb_stride;\
    unsigned width = h->mb.i_mb_width;\
    unsigned height = h->mb.i_mb_height;\
\
    for( int i = 0; i < len; i++ )\
    {\
        int16_t *cur = scratch + (i>>3)*48 + (i&7)*2;\
\
        if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )\
            continue;\
\
        unsigned mbx = (unsigned)cur[0];\
        unsigned mby = (unsigned)cur[1];\
        unsigned idx0 = mbx + mby * stride;\
        unsigned idx2 = idx0 + stride;\
\
        /* Shortcut for the simple/common case of zero MV */\
        if( !M32( mvs[i] ) )\
        {\
            MC_CLIP_ADD( ref_costs[idx0], cur[16] );\
            continue;\
        }\
\
        if( mbx < width-1 && mby < height-1 )\
        {\
            MC_CLIP_ADD2( ref_costs+idx0, cur+16 );\
            MC_CLIP_ADD2( ref_costs+idx2, cur+32 );\
            continue;\
        }\
\
        /* Note: this takes advantage of unsigned representation to\
         * catch negative mbx/mby. */\
        int top_ok    = mby   < height;\
        int bottom_ok = mby+1 < height;\
        int left_ok   = mbx   < width;\
        int right_ok  = mbx+1 < width;\
\
        if( top_ok && left_ok )\
            MC_CLIP_ADD( ref_costs[idx0+0], cur[16] );\
        if( top_ok && right_ok )\
            MC_CLIP_ADD( ref_costs[idx0+1], cur[17] );\
        if( bottom_ok && left_ok )\
            MC_CLIP_ADD( ref_costs[idx2+0], cur[32] );\
        if( bottom_ok && right_ok )\
            MC_CLIP_ADD( ref_costs[idx2+1], cur[33] );\
    }\
}
|
||||||
|
|
||||||
|
#define x264_plane_copy_c x264_template(plane_copy_c)
|
||||||
|
void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
|
||||||
|
|
||||||
|
#define PLANE_COPY(align, cpu)\
|
||||||
|
static void plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
|
||||||
|
{\
|
||||||
|
int c_w = (align) / SIZEOF_PIXEL - 1;\
|
||||||
|
if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\
|
||||||
|
x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\
|
||||||
|
else if( !(w&c_w) )\
|
||||||
|
x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\
|
||||||
|
else\
|
||||||
|
{\
|
||||||
|
if( --h > 0 )\
|
||||||
|
{\
|
||||||
|
if( i_src > 0 )\
|
||||||
|
{\
|
||||||
|
x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
|
||||||
|
dst += i_dst * h;\
|
||||||
|
src += i_src * h;\
|
||||||
|
}\
|
||||||
|
else\
|
||||||
|
x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
|
||||||
|
}\
|
||||||
|
/* use plain memcpy on the last line (in memory order) to avoid overreading src. */\
|
||||||
|
memcpy( dst, src, w*SIZEOF_PIXEL );\
|
||||||
|
}\
|
||||||
|
}
|
||||||
|
|
||||||
|
#define x264_plane_copy_swap_c x264_template(plane_copy_swap_c)
|
||||||
|
void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
|
||||||
|
|
||||||
|
#define PLANE_COPY_SWAP(align, cpu)\
|
||||||
|
static void plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
|
||||||
|
{\
|
||||||
|
int c_w = (align>>1) / SIZEOF_PIXEL - 1;\
|
||||||
|
if( !(w&c_w) )\
|
||||||
|
x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\
|
||||||
|
else if( w > c_w )\
|
||||||
|
{\
|
||||||
|
if( --h > 0 )\
|
||||||
|
{\
|
||||||
|
if( i_src > 0 )\
|
||||||
|
{\
|
||||||
|
x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
|
||||||
|
dst += i_dst * h;\
|
||||||
|
src += i_src * h;\
|
||||||
|
}\
|
||||||
|
else\
|
||||||
|
x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
|
||||||
|
}\
|
||||||
|
x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\
|
||||||
|
for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\
|
||||||
|
{\
|
||||||
|
dst[x] = src[x+1];\
|
||||||
|
dst[x+1] = src[x];\
|
||||||
|
}\
|
||||||
|
}\
|
||||||
|
else\
|
||||||
|
x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\
|
||||||
|
}
|
||||||
|
|
||||||
|
#define x264_plane_copy_deinterleave_c x264_template(plane_copy_deinterleave_c)
|
||||||
|
void x264_plane_copy_deinterleave_c( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
|
||||||
|
pixel *src, intptr_t i_src, int w, int h );
|
||||||
|
|
||||||
|
/* We can utilize existing plane_copy_deinterleave() functions for YUYV/UYUV
|
||||||
|
* input with the additional constraint that we cannot overread src. */
|
||||||
|
#define PLANE_COPY_YUYV(align, cpu)\
|
||||||
|
static void plane_copy_deinterleave_yuyv_##cpu( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,\
|
||||||
|
pixel *src, intptr_t i_src, int w, int h )\
|
||||||
|
{\
|
||||||
|
int c_w = (align>>1) / SIZEOF_PIXEL - 1;\
|
||||||
|
if( !(w&c_w) )\
|
||||||
|
x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
|
||||||
|
else if( w > c_w )\
|
||||||
|
{\
|
||||||
|
if( --h > 0 )\
|
||||||
|
{\
|
||||||
|
if( i_src > 0 )\
|
||||||
|
{\
|
||||||
|
x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
|
||||||
|
dsta += i_dsta * h;\
|
||||||
|
dstb += i_dstb * h;\
|
||||||
|
src += i_src * h;\
|
||||||
|
}\
|
||||||
|
else\
|
||||||
|
x264_plane_copy_deinterleave_##cpu( dsta+i_dsta, i_dsta, dstb+i_dstb, i_dstb,\
|
||||||
|
src+i_src, i_src, w, h );\
|
||||||
|
}\
|
||||||
|
x264_plane_copy_deinterleave_c( dsta, 0, dstb, 0, src, 0, w, 1 );\
|
||||||
|
}\
|
||||||
|
else\
|
||||||
|
x264_plane_copy_deinterleave_c( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
|
||||||
|
}
|
||||||
|
|
||||||
|
#define x264_plane_copy_interleave_c x264_template(plane_copy_interleave_c)
|
||||||
|
void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst,
|
||||||
|
pixel *srcu, intptr_t i_srcu,
|
||||||
|
pixel *srcv, intptr_t i_srcv, int w, int h );
|
||||||
|
|
||||||
|
#define PLANE_INTERLEAVE(cpu) \
|
||||||
|
static void plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\
|
||||||
|
pixel *srcu, intptr_t i_srcu,\
|
||||||
|
pixel *srcv, intptr_t i_srcv, int w, int h )\
|
||||||
|
{\
|
||||||
|
int c_w = 16 / SIZEOF_PIXEL - 1;\
|
||||||
|
if( !(w&c_w) )\
|
||||||
|
x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
|
||||||
|
else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\
|
||||||
|
{\
|
||||||
|
if( --h > 0 )\
|
||||||
|
{\
|
||||||
|
if( i_srcu > 0 )\
|
||||||
|
{\
|
||||||
|
x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\
|
||||||
|
dst += i_dst * h;\
|
||||||
|
srcu += i_srcu * h;\
|
||||||
|
srcv += i_srcv * h;\
|
||||||
|
}\
|
||||||
|
else\
|
||||||
|
x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\
|
||||||
|
}\
|
||||||
|
x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\
|
||||||
|
}\
|
||||||
|
else\
|
||||||
|
x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
|
||||||
|
}
|
||||||
|
|
||||||
|
struct x264_weight_t;
|
||||||
|
typedef void (* weight_fn_t)( pixel *, intptr_t, pixel *,intptr_t, const struct x264_weight_t *, int );
|
||||||
|
typedef struct x264_weight_t
|
||||||
|
{
|
||||||
|
/* aligning the first member is a gcc hack to force the struct to be
|
||||||
|
* 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */
|
||||||
|
ALIGNED_16( int16_t cachea[8] );
|
||||||
|
int16_t cacheb[8];
|
||||||
|
int32_t i_denom;
|
||||||
|
int32_t i_scale;
|
||||||
|
int32_t i_offset;
|
||||||
|
weight_fn_t *weightfn;
|
||||||
|
} ALIGNED_16( x264_weight_t );
|
||||||
|
|
||||||
|
#define x264_weight_none ((const x264_weight_t*)x264_zero)
|
||||||
|
|
||||||
|
/* Fill in the weight structure w with scale s, denominator d and offset o.
 * If b is non-zero the weight cache hook is called on w, otherwise
 * w's weightfn is cleared to NULL.
 *
 * Requires a variable `h` in scope whose mc.weight_cache member is callable.
 * Every use of w is parenthesized so that arguments such as `*ptr`
 * expand correctly. */
#define SET_WEIGHT( w, b, s, d, o )\
{\
    (w).i_scale = (s);\
    (w).i_denom = (d);\
    (w).i_offset = (o);\
    if( b )\
        h->mc.weight_cache( h, &(w) );\
    else\
        (w).weightfn = NULL;\
}
|
||||||
|
|
||||||
|
/* Do the MC
|
||||||
|
* XXX: Only width = 4, 8 or 16 are valid
|
||||||
|
* width == 4 -> height == 4 or 8
|
||||||
|
* width == 8 -> height == 4 or 8 or 16
|
||||||
|
* width == 16-> height == 8 or 16
|
||||||
|
* */
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
void (*mc_luma)( pixel *dst, intptr_t i_dst, pixel **src, intptr_t i_src,
|
||||||
|
int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight );
|
||||||
|
|
||||||
|
/* may round up the dimensions if they're not a power of 2 */
|
||||||
|
pixel* (*get_ref)( pixel *dst, intptr_t *i_dst, pixel **src, intptr_t i_src,
|
||||||
|
int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight );
|
||||||
|
|
||||||
|
/* mc_chroma may write up to 2 bytes of garbage to the right of dst,
|
||||||
|
* so it must be run from left to right. */
|
||||||
|
void (*mc_chroma)( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,
|
||||||
|
int mvx, int mvy, int i_width, int i_height );
|
||||||
|
|
||||||
|
void (*avg[12])( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
|
||||||
|
pixel *src2, intptr_t src2_stride, int i_weight );
|
||||||
|
|
||||||
|
/* only 16x16, 8x8, and 4x4 defined */
|
||||||
|
void (*copy[7])( pixel *dst, intptr_t dst_stride, pixel *src, intptr_t src_stride, int i_height );
|
||||||
|
void (*copy_16x16_unaligned)( pixel *dst, intptr_t dst_stride, pixel *src, intptr_t src_stride, int i_height );
|
||||||
|
|
||||||
|
void (*store_interleave_chroma)( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
|
||||||
|
void (*load_deinterleave_chroma_fenc)( pixel *dst, pixel *src, intptr_t i_src, int height );
|
||||||
|
void (*load_deinterleave_chroma_fdec)( pixel *dst, pixel *src, intptr_t i_src, int height );
|
||||||
|
|
||||||
|
void (*plane_copy)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h );
|
||||||
|
void (*plane_copy_swap)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h );
|
||||||
|
void (*plane_copy_interleave)( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu,
|
||||||
|
pixel *srcv, intptr_t i_srcv, int w, int h );
|
||||||
|
/* may write up to 15 pixels off the end of each plane */
|
||||||
|
void (*plane_copy_deinterleave)( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv,
|
||||||
|
pixel *src, intptr_t i_src, int w, int h );
|
||||||
|
void (*plane_copy_deinterleave_yuyv)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
|
||||||
|
pixel *src, intptr_t i_src, int w, int h );
|
||||||
|
void (*plane_copy_deinterleave_rgb)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
|
||||||
|
pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h );
|
||||||
|
void (*plane_copy_deinterleave_v210)( pixel *dsty, intptr_t i_dsty,
|
||||||
|
pixel *dstc, intptr_t i_dstc,
|
||||||
|
uint32_t *src, intptr_t i_src, int w, int h );
|
||||||
|
void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
|
||||||
|
intptr_t i_stride, int i_width, int i_height, int16_t *buf );
|
||||||
|
|
||||||
|
/* prefetch the next few macroblocks of fenc or fdec */
|
||||||
|
void (*prefetch_fenc) ( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
|
||||||
|
void (*prefetch_fenc_400)( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
|
||||||
|
void (*prefetch_fenc_420)( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
|
||||||
|
void (*prefetch_fenc_422)( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
|
||||||
|
/* prefetch the next few macroblocks of a hpel reference frame */
|
||||||
|
void (*prefetch_ref)( pixel *pix, intptr_t stride, int parity );
|
||||||
|
|
||||||
|
void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
|
||||||
|
void (*memzero_aligned)( void *dst, size_t n );
|
||||||
|
|
||||||
|
/* successive elimination prefilter */
|
||||||
|
void (*integral_init4h)( uint16_t *sum, pixel *pix, intptr_t stride );
|
||||||
|
void (*integral_init8h)( uint16_t *sum, pixel *pix, intptr_t stride );
|
||||||
|
void (*integral_init4v)( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
|
||||||
|
void (*integral_init8v)( uint16_t *sum8, intptr_t stride );
|
||||||
|
|
||||||
|
void (*frame_init_lowres_core)( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
|
||||||
|
intptr_t src_stride, intptr_t dst_stride, int width, int height );
|
||||||
|
weight_fn_t *weight;
|
||||||
|
weight_fn_t *offsetadd;
|
||||||
|
weight_fn_t *offsetsub;
|
||||||
|
void (*weight_cache)( x264_t *, x264_weight_t * );
|
||||||
|
|
||||||
|
void (*mbtree_propagate_cost)( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
|
||||||
|
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
|
||||||
|
void (*mbtree_propagate_list)( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
|
||||||
|
int16_t *propagate_amount, uint16_t *lowres_costs,
|
||||||
|
int bipred_weight, int mb_y, int len, int list );
|
||||||
|
void (*mbtree_fix8_pack)( uint16_t *dst, float *src, int count );
|
||||||
|
void (*mbtree_fix8_unpack)( float *dst, uint16_t *src, int count );
|
||||||
|
} x264_mc_functions_t;
|
||||||
|
|
||||||
|
#define x264_mc_init x264_template(mc_init)
|
||||||
|
void x264_mc_init( uint32_t cpu, x264_mc_functions_t *pf, int cpu_independent );
|
||||||
|
|
||||||
|
#endif
|
||||||
526
common/mips/dct-c.c
Normal file
526
common/mips/dct-c.c
Normal file
@@ -0,0 +1,526 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* dct-c.c: msa transform and zigzag
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2015-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Rishikesh More <rishikesh.more@imgtec.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common/common.h"
|
||||||
|
#include "macros.h"
|
||||||
|
#include "dct.h"
|
||||||
|
|
||||||
|
#if !HIGH_BIT_DEPTH
|
||||||
|
/* One 1-D pass of the 4-point inverse transform on four v8i16 vectors.
 * Even part: in0 + in2 and in0 - in2.
 * Odd part:  (in1 >> 1) - in3 and in1 + (in3 >> 1).
 * The four intermediate terms are then combined by BUTTERFLY_4 into
 * out0..out3.  Arguments are expanded unparenthesised, so pass plain
 * variable names only. */
#define AVC_ITRANS_H( in0, in1, in2, in3, out0, out1, out2, out3 )          \
{                                                                           \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
                                                                            \
    tmp0_m = in0 + in2;                                                     \
    tmp1_m = in0 - in2;                                                     \
    tmp2_m = ( in1 >> 1 ) - in3;                                            \
    tmp3_m = in1 + ( in3 >> 1 );                                            \
                                                                            \
    BUTTERFLY_4( tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3 );  \
}
|
||||||
|
|
||||||
|
static void avc_dct4x4dc_msa( int16_t *p_src, int16_t *p_dst,
|
||||||
|
int32_t i_src_stride )
|
||||||
|
{
|
||||||
|
v8i16 src0, src1, src2, src3, ver_res0, ver_res1, ver_res2, ver_res3;
|
||||||
|
v4i32 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
|
||||||
|
v4i32 hor_res0, hor_res1, hor_res2, hor_res3;
|
||||||
|
v4i32 ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r;
|
||||||
|
|
||||||
|
LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
|
||||||
|
UNPCK_R_SH_SW( src0, src0_r );
|
||||||
|
UNPCK_R_SH_SW( src1, src1_r );
|
||||||
|
UNPCK_R_SH_SW( src2, src2_r );
|
||||||
|
UNPCK_R_SH_SW( src3, src3_r );
|
||||||
|
BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r,
|
||||||
|
tmp0, tmp3, tmp2, tmp1 );
|
||||||
|
BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
|
||||||
|
hor_res0, hor_res3, hor_res2, hor_res1 );
|
||||||
|
TRANSPOSE4x4_SW_SW( hor_res0, hor_res1, hor_res2, hor_res3,
|
||||||
|
hor_res0, hor_res1, hor_res2, hor_res3 );
|
||||||
|
BUTTERFLY_4( hor_res0, hor_res2, hor_res3, hor_res1,
|
||||||
|
tmp0, tmp3, tmp2, tmp1 );
|
||||||
|
BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
|
||||||
|
ver_res0_r, ver_res3_r, ver_res2_r, ver_res1_r );
|
||||||
|
SRARI_W4_SW( ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r, 1 );
|
||||||
|
PCKEV_H4_SH( ver_res0_r, ver_res0_r, ver_res1_r, ver_res1_r,
|
||||||
|
ver_res2_r, ver_res2_r, ver_res3_r, ver_res3_r,
|
||||||
|
ver_res0, ver_res1, ver_res2, ver_res3 );
|
||||||
|
PCKOD_D2_SH( ver_res1, ver_res0, ver_res3, ver_res2, ver_res0, ver_res2 );
|
||||||
|
ST_SH2( ver_res0, ver_res2, p_dst, 8 );
|
||||||
|
}
|
||||||
|
|
||||||
|
static void avc_sub4x4_dct_msa( uint8_t *p_src, int32_t i_src_stride,
|
||||||
|
uint8_t *p_ref, int32_t i_dst_stride,
|
||||||
|
int16_t *p_dst )
|
||||||
|
{
|
||||||
|
uint32_t i_src0, i_src1, i_src2, i_src3;
|
||||||
|
uint32_t i_ref0, i_ref1, i_ref2, i_ref3;
|
||||||
|
v16i8 src = { 0 };
|
||||||
|
v16i8 ref = { 0 };
|
||||||
|
v16u8 inp0, inp1;
|
||||||
|
v8i16 diff0, diff1, diff2, diff3;
|
||||||
|
v8i16 temp0, temp1, temp2, temp3;
|
||||||
|
|
||||||
|
LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 );
|
||||||
|
LW4( p_ref, i_dst_stride, i_ref0, i_ref1, i_ref2, i_ref3 );
|
||||||
|
|
||||||
|
INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src );
|
||||||
|
INSERT_W4_SB( i_ref0, i_ref1, i_ref2, i_ref3, ref );
|
||||||
|
|
||||||
|
ILVRL_B2_UB( src, ref, inp0, inp1 );
|
||||||
|
|
||||||
|
HSUB_UB2_SH( inp0, inp1, diff0, diff2 );
|
||||||
|
|
||||||
|
diff1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff0, ( v2i64 ) diff0 );
|
||||||
|
diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff2, ( v2i64 ) diff2 );
|
||||||
|
|
||||||
|
BUTTERFLY_4( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 );
|
||||||
|
|
||||||
|
diff0 = temp0 + temp1;
|
||||||
|
diff1 = ( temp3 << 1 ) + temp2;
|
||||||
|
diff2 = temp0 - temp1;
|
||||||
|
diff3 = temp3 - ( temp2 << 1 );
|
||||||
|
|
||||||
|
TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3,
|
||||||
|
temp0, temp1, temp2, temp3 );
|
||||||
|
BUTTERFLY_4( temp0, temp1, temp2, temp3, diff0, diff1, diff2, diff3 );
|
||||||
|
|
||||||
|
temp0 = diff0 + diff1;
|
||||||
|
temp1 = ( diff3 << 1 ) + diff2;
|
||||||
|
temp2 = diff0 - diff1;
|
||||||
|
temp3 = diff3 - ( diff2 << 1 );
|
||||||
|
|
||||||
|
ILVR_D2_UB( temp1, temp0, temp3, temp2, inp0, inp1 );
|
||||||
|
ST_UB2( inp0, inp1, p_dst, 8 );
|
||||||
|
}
|
||||||
|
|
||||||
|
static void avc_zigzag_scan_4x4_frame_msa( int16_t pi_dct[16],
|
||||||
|
int16_t pi_level[16] )
|
||||||
|
{
|
||||||
|
v8i16 src0, src1;
|
||||||
|
v8i16 mask0 = { 0, 4, 1, 2, 5, 8, 12, 9 };
|
||||||
|
v8i16 mask1 = { 6, 3, 7, 10, 13, 14, 11, 15 };
|
||||||
|
|
||||||
|
LD_SH2( pi_dct, 8, src0, src1 );
|
||||||
|
VSHF_H2_SH( src0, src1, src0, src1, mask0, mask1, mask0, mask1 );
|
||||||
|
ST_SH2( mask0, mask1, pi_level, 8 );
|
||||||
|
}
|
||||||
|
|
||||||
|
static void avc_idct4x4_addblk_msa( uint8_t *p_dst, int16_t *p_src,
|
||||||
|
int32_t i_dst_stride )
|
||||||
|
{
|
||||||
|
v8i16 src0, src1, src2, src3;
|
||||||
|
v8i16 hres0, hres1, hres2, hres3;
|
||||||
|
v8i16 vres0, vres1, vres2, vres3;
|
||||||
|
v8i16 zeros = { 0 };
|
||||||
|
|
||||||
|
LD4x4_SH( p_src, src0, src1, src2, src3 );
|
||||||
|
AVC_ITRANS_H( src0, src1, src2, src3, hres0, hres1, hres2, hres3 );
|
||||||
|
TRANSPOSE4x4_SH_SH( hres0, hres1, hres2, hres3,
|
||||||
|
hres0, hres1, hres2, hres3 );
|
||||||
|
AVC_ITRANS_H( hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3 );
|
||||||
|
SRARI_H4_SH( vres0, vres1, vres2, vres3, 6 );
|
||||||
|
ADDBLK_ST4x4_UB( vres0, vres1, vres2, vres3, p_dst, i_dst_stride );
|
||||||
|
ST_SH2( zeros, zeros, p_src, 8 );
|
||||||
|
}
|
||||||
|
|
||||||
|
static void avc_idct4x4_addblk_dc_msa( uint8_t *p_dst, int16_t *p_src,
|
||||||
|
int32_t i_dst_stride )
|
||||||
|
{
|
||||||
|
int16_t i_dc;
|
||||||
|
uint32_t i_src0, i_src1, i_src2, i_src3;
|
||||||
|
v16u8 pred = { 0 };
|
||||||
|
v16i8 out;
|
||||||
|
v8i16 input_dc, pred_r, pred_l;
|
||||||
|
|
||||||
|
i_dc = ( p_src[0] + 32 ) >> 6;
|
||||||
|
input_dc = __msa_fill_h( i_dc );
|
||||||
|
p_src[ 0 ] = 0;
|
||||||
|
|
||||||
|
LW4( p_dst, i_dst_stride, i_src0, i_src1, i_src2, i_src3 );
|
||||||
|
INSERT_W4_UB( i_src0, i_src1, i_src2, i_src3, pred );
|
||||||
|
UNPCK_UB_SH( pred, pred_r, pred_l );
|
||||||
|
|
||||||
|
pred_r += input_dc;
|
||||||
|
pred_l += input_dc;
|
||||||
|
|
||||||
|
CLIP_SH2_0_255( pred_r, pred_l );
|
||||||
|
out = __msa_pckev_b( ( v16i8 ) pred_l, ( v16i8 ) pred_r );
|
||||||
|
ST4x4_UB( out, out, 0, 1, 2, 3, p_dst, i_dst_stride );
|
||||||
|
}
|
||||||
|
|
||||||
|
static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src,
|
||||||
|
int32_t i_dst_stride )
|
||||||
|
{
|
||||||
|
v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
|
||||||
|
v8i16 vec0, vec1, vec2, vec3;
|
||||||
|
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||||
|
v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
|
||||||
|
v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
|
||||||
|
v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
|
||||||
|
v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
|
||||||
|
v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
|
||||||
|
v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
|
||||||
|
v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
|
||||||
|
v16i8 zeros = { 0 };
|
||||||
|
|
||||||
|
p_src[ 0 ] += 32;
|
||||||
|
|
||||||
|
LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );
|
||||||
|
|
||||||
|
vec0 = src0 + src4;
|
||||||
|
vec1 = src0 - src4;
|
||||||
|
vec2 = src2 >> 1;
|
||||||
|
vec2 = vec2 - src6;
|
||||||
|
vec3 = src6 >> 1;
|
||||||
|
vec3 = src2 + vec3;
|
||||||
|
|
||||||
|
BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 );
|
||||||
|
|
||||||
|
vec0 = src7 >> 1;
|
||||||
|
vec0 = src5 - vec0 - src3 - src7;
|
||||||
|
vec1 = src3 >> 1;
|
||||||
|
vec1 = src1 - vec1 + src7 - src3;
|
||||||
|
vec2 = src5 >> 1;
|
||||||
|
vec2 = vec2 - src1 + src7 + src5;
|
||||||
|
vec3 = src1 >> 1;
|
||||||
|
vec3 = vec3 + src3 + src5 + src1;
|
||||||
|
tmp4 = vec3 >> 2;
|
||||||
|
tmp4 += vec0;
|
||||||
|
tmp5 = vec2 >> 2;
|
||||||
|
tmp5 += vec1;
|
||||||
|
tmp6 = vec1 >> 2;
|
||||||
|
tmp6 -= vec2;
|
||||||
|
tmp7 = vec0 >> 2;
|
||||||
|
tmp7 = vec3 - tmp7;
|
||||||
|
|
||||||
|
BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
|
||||||
|
res0, res1, res2, res3, res4, res5, res6, res7 );
|
||||||
|
TRANSPOSE8x8_SH_SH( res0, res1, res2, res3, res4, res5, res6, res7,
|
||||||
|
res0, res1, res2, res3, res4, res5, res6, res7 );
|
||||||
|
UNPCK_SH_SW( res0, tmp0_r, tmp0_l );
|
||||||
|
UNPCK_SH_SW( res1, tmp1_r, tmp1_l );
|
||||||
|
UNPCK_SH_SW( res2, tmp2_r, tmp2_l );
|
||||||
|
UNPCK_SH_SW( res3, tmp3_r, tmp3_l );
|
||||||
|
UNPCK_SH_SW( res4, tmp4_r, tmp4_l );
|
||||||
|
UNPCK_SH_SW( res5, tmp5_r, tmp5_l );
|
||||||
|
UNPCK_SH_SW( res6, tmp6_r, tmp6_l );
|
||||||
|
UNPCK_SH_SW( res7, tmp7_r, tmp7_l );
|
||||||
|
BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r,
|
||||||
|
vec0_r, vec0_l, vec1_l, vec1_r );
|
||||||
|
|
||||||
|
vec2_r = tmp2_r >> 1;
|
||||||
|
vec2_l = tmp2_l >> 1;
|
||||||
|
vec2_r -= tmp6_r;
|
||||||
|
vec2_l -= tmp6_l;
|
||||||
|
vec3_r = tmp6_r >> 1;
|
||||||
|
vec3_l = tmp6_l >> 1;
|
||||||
|
vec3_r += tmp2_r;
|
||||||
|
vec3_l += tmp2_l;
|
||||||
|
|
||||||
|
BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r,
|
||||||
|
tmp0_r, tmp2_r, tmp4_r, tmp6_r );
|
||||||
|
BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l,
|
||||||
|
tmp0_l, tmp2_l, tmp4_l, tmp6_l );
|
||||||
|
|
||||||
|
vec0_r = tmp7_r >> 1;
|
||||||
|
vec0_l = tmp7_l >> 1;
|
||||||
|
vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
|
||||||
|
vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
|
||||||
|
vec1_r = tmp3_r >> 1;
|
||||||
|
vec1_l = tmp3_l >> 1;
|
||||||
|
vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
|
||||||
|
vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
|
||||||
|
vec2_r = tmp5_r >> 1;
|
||||||
|
vec2_l = tmp5_l >> 1;
|
||||||
|
vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
|
||||||
|
vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
|
||||||
|
vec3_r = tmp1_r >> 1;
|
||||||
|
vec3_l = tmp1_l >> 1;
|
||||||
|
vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
|
||||||
|
vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
|
||||||
|
tmp1_r = vec3_r >> 2;
|
||||||
|
tmp1_l = vec3_l >> 2;
|
||||||
|
tmp1_r += vec0_r;
|
||||||
|
tmp1_l += vec0_l;
|
||||||
|
tmp3_r = vec2_r >> 2;
|
||||||
|
tmp3_l = vec2_l >> 2;
|
||||||
|
tmp3_r += vec1_r;
|
||||||
|
tmp3_l += vec1_l;
|
||||||
|
tmp5_r = vec1_r >> 2;
|
||||||
|
tmp5_l = vec1_l >> 2;
|
||||||
|
tmp5_r -= vec2_r;
|
||||||
|
tmp5_l -= vec2_l;
|
||||||
|
tmp7_r = vec0_r >> 2;
|
||||||
|
tmp7_l = vec0_l >> 2;
|
||||||
|
tmp7_r = vec3_r - tmp7_r;
|
||||||
|
tmp7_l = vec3_l - tmp7_l;
|
||||||
|
|
||||||
|
BUTTERFLY_4( tmp0_r, tmp0_l, tmp7_l, tmp7_r,
|
||||||
|
res0_r, res0_l, res7_l, res7_r );
|
||||||
|
BUTTERFLY_4( tmp2_r, tmp2_l, tmp5_l, tmp5_r,
|
||||||
|
res1_r, res1_l, res6_l, res6_r );
|
||||||
|
BUTTERFLY_4( tmp4_r, tmp4_l, tmp3_l, tmp3_r,
|
||||||
|
res2_r, res2_l, res5_l, res5_r );
|
||||||
|
BUTTERFLY_4( tmp6_r, tmp6_l, tmp1_l, tmp1_r,
|
||||||
|
res3_r, res3_l, res4_l, res4_r );
|
||||||
|
SRA_4V( res0_r, res0_l, res1_r, res1_l, 6 );
|
||||||
|
SRA_4V( res2_r, res2_l, res3_r, res3_l, 6 );
|
||||||
|
SRA_4V( res4_r, res4_l, res5_r, res5_l, 6 );
|
||||||
|
SRA_4V( res6_r, res6_l, res7_r, res7_l, 6 );
|
||||||
|
PCKEV_H4_SH( res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r,
|
||||||
|
res0, res1, res2, res3 );
|
||||||
|
PCKEV_H4_SH( res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r,
|
||||||
|
res4, res5, res6, res7 );
|
||||||
|
LD_SB8( p_dst, i_dst_stride,
|
||||||
|
dst0, dst1, dst2, dst3,
|
||||||
|
dst4, dst5, dst6, dst7 );
|
||||||
|
ILVR_B4_SH( zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
|
||||||
|
tmp0, tmp1, tmp2, tmp3 );
|
||||||
|
ILVR_B4_SH( zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
|
||||||
|
tmp4, tmp5, tmp6, tmp7 );
|
||||||
|
ADD4( res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
|
||||||
|
res0, res1, res2, res3 );
|
||||||
|
ADD4( res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
|
||||||
|
res4, res5, res6, res7 );
|
||||||
|
CLIP_SH4_0_255( res0, res1, res2, res3 );
|
||||||
|
CLIP_SH4_0_255( res4, res5, res6, res7 );
|
||||||
|
PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6,
|
||||||
|
dst0, dst1, dst2, dst3 );
|
||||||
|
ST8x4_UB( dst0, dst1, p_dst, i_dst_stride );
|
||||||
|
p_dst += ( 4 * i_dst_stride );
|
||||||
|
ST8x4_UB( dst2, dst3, p_dst, i_dst_stride );
|
||||||
|
}
|
||||||
|
|
||||||
|
static void avc_idct4x4dc_msa( int16_t *p_src, int32_t i_src_stride,
|
||||||
|
int16_t *p_dst, int32_t i_dst_stride )
|
||||||
|
{
|
||||||
|
v8i16 src0, src1, src2, src3;
|
||||||
|
v4i32 src0_r, src1_r, src2_r, src3_r;
|
||||||
|
v4i32 hres0, hres1, hres2, hres3;
|
||||||
|
v8i16 vres0, vres1, vres2, vres3;
|
||||||
|
v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
|
||||||
|
v2i64 res0, res1;
|
||||||
|
|
||||||
|
LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
|
||||||
|
UNPCK_R_SH_SW( src0, src0_r );
|
||||||
|
UNPCK_R_SH_SW( src1, src1_r );
|
||||||
|
UNPCK_R_SH_SW( src2, src2_r );
|
||||||
|
UNPCK_R_SH_SW( src3, src3_r );
|
||||||
|
BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, vec0, vec3, vec2, vec1 );
|
||||||
|
BUTTERFLY_4( vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1 );
|
||||||
|
TRANSPOSE4x4_SW_SW( hres0, hres1, hres2, hres3,
|
||||||
|
hres0, hres1, hres2, hres3 );
|
||||||
|
BUTTERFLY_4( hres0, hres2, hres3, hres1, vec0, vec3, vec2, vec1 );
|
||||||
|
BUTTERFLY_4( vec0, vec1, vec2, vec3, vec4, vec7, vec6, vec5 );
|
||||||
|
PCKEV_H4_SH( vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
|
||||||
|
vres0, vres1, vres2, vres3 );
|
||||||
|
PCKOD_D2_SD( vres1, vres0, vres3, vres2, res0, res1 );
|
||||||
|
ST8x4_UB( res0, res1, p_dst, i_dst_stride * 2 );
|
||||||
|
}
|
||||||
|
|
||||||
|
static int32_t subtract_sum4x4_msa( uint8_t *p_src, int32_t i_src_stride,
|
||||||
|
uint8_t *pred_ptr, int32_t i_pred_stride )
|
||||||
|
{
|
||||||
|
int16_t i_sum;
|
||||||
|
uint32_t i_src0, i_src1, i_src2, i_src3;
|
||||||
|
uint32_t i_pred0, i_pred1, i_pred2, i_pred3;
|
||||||
|
v16i8 src = { 0 };
|
||||||
|
v16i8 pred = { 0 };
|
||||||
|
v16u8 src_l0, src_l1;
|
||||||
|
v8i16 diff0, diff1;
|
||||||
|
|
||||||
|
LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 );
|
||||||
|
LW4( pred_ptr, i_pred_stride, i_pred0, i_pred1, i_pred2, i_pred3 );
|
||||||
|
INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src );
|
||||||
|
INSERT_W4_SB( i_pred0, i_pred1, i_pred2, i_pred3, pred );
|
||||||
|
ILVRL_B2_UB( src, pred, src_l0, src_l1 );
|
||||||
|
HSUB_UB2_SH( src_l0, src_l1, diff0, diff1 );
|
||||||
|
i_sum = HADD_UH_U32( diff0 + diff1 );
|
||||||
|
|
||||||
|
return i_sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* In-place 4x4 DC transform of d: source and destination are the same
 * buffer, with rows 4 elements apart. */
void x264_dct4x4dc_msa( int16_t d[16] )
{
    int16_t *p_coef = d;

    avc_dct4x4dc_msa( p_coef, p_coef, 4 );
}
|
||||||
|
|
||||||
|
/* In-place inverse 4x4 DC transform of d: both strides are 4 elements. */
void x264_idct4x4dc_msa( int16_t d[16] )
{
    int16_t *p_coef = d;

    avc_idct4x4dc_msa( p_coef, 4, p_coef, 4 );
}
|
||||||
|
|
||||||
|
void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] )
|
||||||
|
{
|
||||||
|
avc_idct4x4_addblk_msa( p_dst, pi_dct, FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] )
|
||||||
|
{
|
||||||
|
avc_idct4x4_addblk_msa( &p_dst[0], &pi_dct[0][0], FDEC_STRIDE );
|
||||||
|
avc_idct4x4_addblk_msa( &p_dst[4], &pi_dct[1][0], FDEC_STRIDE );
|
||||||
|
avc_idct4x4_addblk_msa( &p_dst[4 * FDEC_STRIDE + 0],
|
||||||
|
&pi_dct[2][0], FDEC_STRIDE );
|
||||||
|
avc_idct4x4_addblk_msa( &p_dst[4 * FDEC_STRIDE + 4],
|
||||||
|
&pi_dct[3][0], FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] )
|
||||||
|
{
|
||||||
|
x264_add8x8_idct_msa( &p_dst[0], &pi_dct[0] );
|
||||||
|
x264_add8x8_idct_msa( &p_dst[8], &pi_dct[4] );
|
||||||
|
x264_add8x8_idct_msa( &p_dst[8 * FDEC_STRIDE + 0], &pi_dct[8] );
|
||||||
|
x264_add8x8_idct_msa( &p_dst[8 * FDEC_STRIDE + 8], &pi_dct[12] );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] )
|
||||||
|
{
|
||||||
|
avc_idct8_addblk_msa( p_dst, pi_dct, FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] )
|
||||||
|
{
|
||||||
|
avc_idct8_addblk_msa( &p_dst[0], &pi_dct[0][0], FDEC_STRIDE );
|
||||||
|
avc_idct8_addblk_msa( &p_dst[8], &pi_dct[1][0], FDEC_STRIDE );
|
||||||
|
avc_idct8_addblk_msa( &p_dst[8 * FDEC_STRIDE + 0],
|
||||||
|
&pi_dct[2][0], FDEC_STRIDE );
|
||||||
|
avc_idct8_addblk_msa( &p_dst[8 * FDEC_STRIDE + 8],
|
||||||
|
&pi_dct[3][0], FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] )
|
||||||
|
{
|
||||||
|
avc_idct4x4_addblk_dc_msa( &p_dst[0], &pi_dct[0], FDEC_STRIDE );
|
||||||
|
avc_idct4x4_addblk_dc_msa( &p_dst[4], &pi_dct[1], FDEC_STRIDE );
|
||||||
|
avc_idct4x4_addblk_dc_msa( &p_dst[4 * FDEC_STRIDE + 0],
|
||||||
|
&pi_dct[2], FDEC_STRIDE );
|
||||||
|
avc_idct4x4_addblk_dc_msa( &p_dst[4 * FDEC_STRIDE + 4],
|
||||||
|
&pi_dct[3], FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] )
|
||||||
|
{
|
||||||
|
for( int32_t i = 0; i < 4; i++, pi_dct += 4, p_dst += 4 * FDEC_STRIDE )
|
||||||
|
{
|
||||||
|
avc_idct4x4_addblk_dc_msa( &p_dst[ 0], &pi_dct[0], FDEC_STRIDE );
|
||||||
|
avc_idct4x4_addblk_dc_msa( &p_dst[ 4], &pi_dct[1], FDEC_STRIDE );
|
||||||
|
avc_idct4x4_addblk_dc_msa( &p_dst[ 8], &pi_dct[2], FDEC_STRIDE );
|
||||||
|
avc_idct4x4_addblk_dc_msa( &p_dst[12], &pi_dct[3], FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src,
|
||||||
|
uint8_t *p_ref )
|
||||||
|
{
|
||||||
|
avc_sub4x4_dct_msa( p_src, FENC_STRIDE, p_ref, FDEC_STRIDE, p_dst );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src,
|
||||||
|
uint8_t *p_ref )
|
||||||
|
{
|
||||||
|
avc_sub4x4_dct_msa( &p_src[0], FENC_STRIDE,
|
||||||
|
&p_ref[0], FDEC_STRIDE, p_dst[0] );
|
||||||
|
avc_sub4x4_dct_msa( &p_src[4], FENC_STRIDE, &p_ref[4],
|
||||||
|
FDEC_STRIDE, p_dst[1] );
|
||||||
|
avc_sub4x4_dct_msa( &p_src[4 * FENC_STRIDE + 0],
|
||||||
|
FENC_STRIDE, &p_ref[4 * FDEC_STRIDE + 0],
|
||||||
|
FDEC_STRIDE, p_dst[2] );
|
||||||
|
avc_sub4x4_dct_msa( &p_src[4 * FENC_STRIDE + 4],
|
||||||
|
FENC_STRIDE, &p_ref[4 * FDEC_STRIDE + 4],
|
||||||
|
FDEC_STRIDE, p_dst[3] );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_sub16x16_dct_msa( int16_t p_dst[16][16],
|
||||||
|
uint8_t *p_src,
|
||||||
|
uint8_t *p_ref )
|
||||||
|
{
|
||||||
|
x264_sub8x8_dct_msa( &p_dst[ 0], &p_src[0], &p_ref[0] );
|
||||||
|
x264_sub8x8_dct_msa( &p_dst[ 4], &p_src[8], &p_ref[8] );
|
||||||
|
x264_sub8x8_dct_msa( &p_dst[ 8], &p_src[8 * FENC_STRIDE + 0],
|
||||||
|
&p_ref[8*FDEC_STRIDE+0] );
|
||||||
|
x264_sub8x8_dct_msa( &p_dst[12], &p_src[8 * FENC_STRIDE + 8],
|
||||||
|
&p_ref[8*FDEC_STRIDE+8] );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4],
|
||||||
|
uint8_t *p_pix1, uint8_t *p_pix2 )
|
||||||
|
{
|
||||||
|
int32_t d0, d1, d2, d3;
|
||||||
|
|
||||||
|
pi_dct[0] = subtract_sum4x4_msa( &p_pix1[0], FENC_STRIDE,
|
||||||
|
&p_pix2[0], FDEC_STRIDE );
|
||||||
|
pi_dct[1] = subtract_sum4x4_msa( &p_pix1[4], FENC_STRIDE,
|
||||||
|
&p_pix2[4], FDEC_STRIDE );
|
||||||
|
pi_dct[2] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 0], FENC_STRIDE,
|
||||||
|
&p_pix2[4 * FDEC_STRIDE + 0],
|
||||||
|
FDEC_STRIDE );
|
||||||
|
pi_dct[3] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 4], FENC_STRIDE,
|
||||||
|
&p_pix2[4 * FDEC_STRIDE + 4],
|
||||||
|
FDEC_STRIDE );
|
||||||
|
|
||||||
|
BUTTERFLY_4( pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1], d0, d1, d3, d2 );
|
||||||
|
BUTTERFLY_4( d0, d2, d3, d1, pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1] );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8],
|
||||||
|
uint8_t *p_pix1, uint8_t *p_pix2 )
|
||||||
|
{
|
||||||
|
int32_t a0, a1, a2, a3, a4, a5, a6, a7;
|
||||||
|
int32_t b0, b1, b2, b3, b4, b5, b6, b7;
|
||||||
|
|
||||||
|
a0 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 0], FENC_STRIDE,
|
||||||
|
&p_pix2[ 0 * FDEC_STRIDE + 0], FDEC_STRIDE );
|
||||||
|
a1 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 4], FENC_STRIDE,
|
||||||
|
&p_pix2[ 0 * FDEC_STRIDE + 4], FDEC_STRIDE );
|
||||||
|
a2 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 0], FENC_STRIDE,
|
||||||
|
&p_pix2[ 4 * FDEC_STRIDE + 0], FDEC_STRIDE );
|
||||||
|
a3 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 4], FENC_STRIDE,
|
||||||
|
&p_pix2[ 4 * FDEC_STRIDE + 4], FDEC_STRIDE );
|
||||||
|
a4 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 0], FENC_STRIDE,
|
||||||
|
&p_pix2[ 8 * FDEC_STRIDE + 0], FDEC_STRIDE );
|
||||||
|
a5 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 4], FENC_STRIDE,
|
||||||
|
&p_pix2[ 8 * FDEC_STRIDE + 4], FDEC_STRIDE );
|
||||||
|
a6 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 0], FENC_STRIDE,
|
||||||
|
&p_pix2[12 * FDEC_STRIDE + 0], FDEC_STRIDE );
|
||||||
|
a7 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 4], FENC_STRIDE,
|
||||||
|
&p_pix2[12 * FDEC_STRIDE + 4], FDEC_STRIDE );
|
||||||
|
|
||||||
|
BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1,
|
||||||
|
b0, b1, b2, b3, b7, b6, b5, b4 );
|
||||||
|
BUTTERFLY_8( b0, b2, b4, b6, b7, b5, b3, b1,
|
||||||
|
a0, a1, a2, a3, a7, a6, a5, a4 );
|
||||||
|
BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1,
|
||||||
|
pi_dct[0], pi_dct[1], pi_dct[6], pi_dct[7],
|
||||||
|
pi_dct[5], pi_dct[4], pi_dct[3], pi_dct[2] );
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Public entry point: output first, input second.  The static helper takes
 * its arguments in the opposite order (input, output). */
void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] )
{
    int16_t *p_in = pi_dct;
    int16_t *p_out = pi_level;

    avc_zigzag_scan_4x4_frame_msa( p_in, p_out );
}
|
||||||
|
#endif
|
||||||
64
common/mips/dct.h
Normal file
64
common/mips/dct.h
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* dct.h: msa transform and zigzag
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2015-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Rishikesh More <rishikesh.more@imgtec.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_MIPS_DCT_H
|
||||||
|
#define X264_MIPS_DCT_H
|
||||||
|
|
||||||
|
#define x264_dct4x4dc_msa x264_template(dct4x4dc_msa)
|
||||||
|
void x264_dct4x4dc_msa( int16_t d[16] );
|
||||||
|
#define x264_idct4x4dc_msa x264_template(idct4x4dc_msa)
|
||||||
|
void x264_idct4x4dc_msa( int16_t d[16] );
|
||||||
|
#define x264_add4x4_idct_msa x264_template(add4x4_idct_msa)
|
||||||
|
void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] );
|
||||||
|
#define x264_add8x8_idct_msa x264_template(add8x8_idct_msa)
|
||||||
|
void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] );
|
||||||
|
#define x264_add16x16_idct_msa x264_template(add16x16_idct_msa)
|
||||||
|
void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] );
|
||||||
|
#define x264_add8x8_idct8_msa x264_template(add8x8_idct8_msa)
|
||||||
|
void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] );
|
||||||
|
#define x264_add16x16_idct8_msa x264_template(add16x16_idct8_msa)
|
||||||
|
void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] );
|
||||||
|
#define x264_add8x8_idct_dc_msa x264_template(add8x8_idct_dc_msa)
|
||||||
|
void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] );
|
||||||
|
#define x264_add16x16_idct_dc_msa x264_template(add16x16_idct_dc_msa)
|
||||||
|
void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] );
|
||||||
|
#define x264_sub4x4_dct_msa x264_template(sub4x4_dct_msa)
|
||||||
|
void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src, uint8_t *p_ref );
|
||||||
|
#define x264_sub8x8_dct_msa x264_template(sub8x8_dct_msa)
|
||||||
|
void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src,
|
||||||
|
uint8_t *p_ref );
|
||||||
|
#define x264_sub16x16_dct_msa x264_template(sub16x16_dct_msa)
|
||||||
|
void x264_sub16x16_dct_msa( int16_t p_dst[16][16], uint8_t *p_src,
|
||||||
|
uint8_t *p_ref );
|
||||||
|
#define x264_sub8x8_dct_dc_msa x264_template(sub8x8_dct_dc_msa)
|
||||||
|
void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4], uint8_t *p_pix1,
|
||||||
|
uint8_t *p_pix2 );
|
||||||
|
#define x264_sub8x16_dct_dc_msa x264_template(sub8x16_dct_dc_msa)
|
||||||
|
void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8], uint8_t *p_pix1,
|
||||||
|
uint8_t *p_pix2 );
|
||||||
|
#define x264_zigzag_scan_4x4_frame_msa x264_template(zigzag_scan_4x4_frame_msa)
|
||||||
|
void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] );
|
||||||
|
|
||||||
|
#endif
|
||||||
2011
common/mips/deblock-c.c
Normal file
2011
common/mips/deblock-c.c
Normal file
File diff suppressed because it is too large
Load Diff
52
common/mips/deblock.h
Normal file
52
common/mips/deblock.h
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* deblock.h: msa deblocking
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2017-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_MIPS_DEBLOCK_H
|
||||||
|
#define X264_MIPS_DEBLOCK_H
|
||||||
|
|
||||||
|
#if !HIGH_BIT_DEPTH
|
||||||
|
#define x264_deblock_v_luma_msa x264_template(deblock_v_luma_msa)
|
||||||
|
void x264_deblock_v_luma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
#define x264_deblock_h_luma_msa x264_template(deblock_h_luma_msa)
|
||||||
|
void x264_deblock_h_luma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
#define x264_deblock_v_chroma_msa x264_template(deblock_v_chroma_msa)
|
||||||
|
void x264_deblock_v_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
#define x264_deblock_h_chroma_msa x264_template(deblock_h_chroma_msa)
|
||||||
|
void x264_deblock_h_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||||
|
#define x264_deblock_v_luma_intra_msa x264_template(deblock_v_luma_intra_msa)
|
||||||
|
void x264_deblock_v_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
#define x264_deblock_h_luma_intra_msa x264_template(deblock_h_luma_intra_msa)
|
||||||
|
void x264_deblock_h_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
#define x264_deblock_v_chroma_intra_msa x264_template(deblock_v_chroma_intra_msa)
|
||||||
|
void x264_deblock_v_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
#define x264_deblock_h_chroma_intra_msa x264_template(deblock_h_chroma_intra_msa)
|
||||||
|
void x264_deblock_h_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||||
|
#define x264_deblock_strength_msa x264_template(deblock_strength_msa)
|
||||||
|
void x264_deblock_strength_msa( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
|
||||||
|
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit,
|
||||||
|
int bframe );
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
1952
common/mips/macros.h
Normal file
1952
common/mips/macros.h
Normal file
File diff suppressed because it is too large
Load Diff
3696
common/mips/mc-c.c
Normal file
3696
common/mips/mc-c.c
Normal file
File diff suppressed because it is too large
Load Diff
32
common/mips/mc.h
Normal file
32
common/mips/mc.h
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* mc.h: msa motion compensation
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2015-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Neha Rana <neha.rana@imgtec.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_MIPS_MC_H
|
||||||
|
#define X264_MIPS_MC_H
|
||||||
|
|
||||||
|
#define x264_mc_init_mips x264_template(mc_init_mips)
|
||||||
|
void x264_mc_init_mips( uint32_t cpu, x264_mc_functions_t *pf );
|
||||||
|
|
||||||
|
#endif
|
||||||
1491
common/mips/pixel-c.c
Normal file
1491
common/mips/pixel-c.c
Normal file
File diff suppressed because it is too large
Load Diff
228
common/mips/pixel.h
Normal file
228
common/mips/pixel.h
Normal file
@@ -0,0 +1,228 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* pixel.h: msa pixel metrics
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2015-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef X264_MIPS_PIXEL_H
|
||||||
|
#define X264_MIPS_PIXEL_H
|
||||||
|
|
||||||
|
#define x264_pixel_sad_16x16_msa x264_template(pixel_sad_16x16_msa)
|
||||||
|
int32_t x264_pixel_sad_16x16_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_sad_16x8_msa x264_template(pixel_sad_16x8_msa)
|
||||||
|
int32_t x264_pixel_sad_16x8_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_sad_8x16_msa x264_template(pixel_sad_8x16_msa)
|
||||||
|
int32_t x264_pixel_sad_8x16_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_sad_8x8_msa x264_template(pixel_sad_8x8_msa)
|
||||||
|
int32_t x264_pixel_sad_8x8_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_sad_8x4_msa x264_template(pixel_sad_8x4_msa)
|
||||||
|
int32_t x264_pixel_sad_8x4_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_sad_4x16_msa x264_template(pixel_sad_4x16_msa)
|
||||||
|
int32_t x264_pixel_sad_4x16_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_sad_4x8_msa x264_template(pixel_sad_4x8_msa)
|
||||||
|
int32_t x264_pixel_sad_4x8_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_sad_4x4_msa x264_template(pixel_sad_4x4_msa)
|
||||||
|
int32_t x264_pixel_sad_4x4_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_sad_x4_16x16_msa x264_template(pixel_sad_x4_16x16_msa)
|
||||||
|
void x264_pixel_sad_x4_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[4] );
|
||||||
|
#define x264_pixel_sad_x4_16x8_msa x264_template(pixel_sad_x4_16x8_msa)
|
||||||
|
void x264_pixel_sad_x4_16x8_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[4] );
|
||||||
|
#define x264_pixel_sad_x4_8x16_msa x264_template(pixel_sad_x4_8x16_msa)
|
||||||
|
void x264_pixel_sad_x4_8x16_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[4] );
|
||||||
|
#define x264_pixel_sad_x4_8x8_msa x264_template(pixel_sad_x4_8x8_msa)
|
||||||
|
void x264_pixel_sad_x4_8x8_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[4] );
|
||||||
|
#define x264_pixel_sad_x4_8x4_msa x264_template(pixel_sad_x4_8x4_msa)
|
||||||
|
void x264_pixel_sad_x4_8x4_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[4] );
|
||||||
|
#define x264_pixel_sad_x4_4x8_msa x264_template(pixel_sad_x4_4x8_msa)
|
||||||
|
void x264_pixel_sad_x4_4x8_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[4] );
|
||||||
|
#define x264_pixel_sad_x4_4x4_msa x264_template(pixel_sad_x4_4x4_msa)
|
||||||
|
void x264_pixel_sad_x4_4x4_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[4] );
|
||||||
|
#define x264_pixel_sad_x3_16x16_msa x264_template(pixel_sad_x3_16x16_msa)
|
||||||
|
void x264_pixel_sad_x3_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_pixel_sad_x3_16x8_msa x264_template(pixel_sad_x3_16x8_msa)
|
||||||
|
void x264_pixel_sad_x3_16x8_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_pixel_sad_x3_8x16_msa x264_template(pixel_sad_x3_8x16_msa)
|
||||||
|
void x264_pixel_sad_x3_8x16_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_pixel_sad_x3_8x8_msa x264_template(pixel_sad_x3_8x8_msa)
|
||||||
|
void x264_pixel_sad_x3_8x8_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_pixel_sad_x3_8x4_msa x264_template(pixel_sad_x3_8x4_msa)
|
||||||
|
void x264_pixel_sad_x3_8x4_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_pixel_sad_x3_4x8_msa x264_template(pixel_sad_x3_4x8_msa)
|
||||||
|
void x264_pixel_sad_x3_4x8_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_pixel_sad_x3_4x4_msa x264_template(pixel_sad_x3_4x4_msa)
|
||||||
|
void x264_pixel_sad_x3_4x4_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||||
|
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||||
|
intptr_t i_ref_stride,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_pixel_ssd_16x16_msa x264_template(pixel_ssd_16x16_msa)
|
||||||
|
int32_t x264_pixel_ssd_16x16_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_ssd_16x8_msa x264_template(pixel_ssd_16x8_msa)
|
||||||
|
int32_t x264_pixel_ssd_16x8_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_ssd_8x16_msa x264_template(pixel_ssd_8x16_msa)
|
||||||
|
int32_t x264_pixel_ssd_8x16_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_ssd_8x8_msa x264_template(pixel_ssd_8x8_msa)
|
||||||
|
int32_t x264_pixel_ssd_8x8_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_ssd_8x4_msa x264_template(pixel_ssd_8x4_msa)
|
||||||
|
int32_t x264_pixel_ssd_8x4_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_ssd_4x16_msa x264_template(pixel_ssd_4x16_msa)
|
||||||
|
int32_t x264_pixel_ssd_4x16_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_ssd_4x8_msa x264_template(pixel_ssd_4x8_msa)
|
||||||
|
int32_t x264_pixel_ssd_4x8_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_pixel_ssd_4x4_msa x264_template(pixel_ssd_4x4_msa)
|
||||||
|
int32_t x264_pixel_ssd_4x4_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||||
|
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||||
|
#define x264_intra_sad_x3_4x4_msa x264_template(intra_sad_x3_4x4_msa)
|
||||||
|
void x264_intra_sad_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_intra_sad_x3_16x16_msa x264_template(intra_sad_x3_16x16_msa)
|
||||||
|
void x264_intra_sad_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_intra_sad_x3_8x8_msa x264_template(intra_sad_x3_8x8_msa)
|
||||||
|
void x264_intra_sad_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36],
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_intra_sad_x3_8x8c_msa x264_template(intra_sad_x3_8x8c_msa)
|
||||||
|
void x264_intra_sad_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_ssim_4x4x2_core_msa x264_template(ssim_4x4x2_core_msa)
|
||||||
|
void x264_ssim_4x4x2_core_msa( const uint8_t *p_pix1, intptr_t i_stride1,
|
||||||
|
const uint8_t *p_pix2, intptr_t i_stride2,
|
||||||
|
int32_t i_sums[2][4] );
|
||||||
|
#define x264_pixel_hadamard_ac_8x8_msa x264_template(pixel_hadamard_ac_8x8_msa)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
#define x264_pixel_hadamard_ac_8x16_msa x264_template(pixel_hadamard_ac_8x16_msa)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_8x16_msa( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
#define x264_pixel_hadamard_ac_16x8_msa x264_template(pixel_hadamard_ac_16x8_msa)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_16x8_msa( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
#define x264_pixel_hadamard_ac_16x16_msa x264_template(pixel_hadamard_ac_16x16_msa)
|
||||||
|
uint64_t x264_pixel_hadamard_ac_16x16_msa( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
#define x264_pixel_satd_4x4_msa x264_template(pixel_satd_4x4_msa)
|
||||||
|
int32_t x264_pixel_satd_4x4_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_4x8_msa x264_template(pixel_satd_4x8_msa)
|
||||||
|
int32_t x264_pixel_satd_4x8_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_4x16_msa x264_template(pixel_satd_4x16_msa)
|
||||||
|
int32_t x264_pixel_satd_4x16_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_8x4_msa x264_template(pixel_satd_8x4_msa)
|
||||||
|
int32_t x264_pixel_satd_8x4_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_8x8_msa x264_template(pixel_satd_8x8_msa)
|
||||||
|
int32_t x264_pixel_satd_8x8_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_8x16_msa x264_template(pixel_satd_8x16_msa)
|
||||||
|
int32_t x264_pixel_satd_8x16_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_16x8_msa x264_template(pixel_satd_16x8_msa)
|
||||||
|
int32_t x264_pixel_satd_16x8_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_satd_16x16_msa x264_template(pixel_satd_16x16_msa)
|
||||||
|
int32_t x264_pixel_satd_16x16_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_sa8d_8x8_msa x264_template(pixel_sa8d_8x8_msa)
|
||||||
|
int32_t x264_pixel_sa8d_8x8_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_pixel_sa8d_16x16_msa x264_template(pixel_sa8d_16x16_msa)
|
||||||
|
int32_t x264_pixel_sa8d_16x16_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||||
|
#define x264_intra_satd_x3_4x4_msa x264_template(intra_satd_x3_4x4_msa)
|
||||||
|
void x264_intra_satd_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_intra_satd_x3_16x16_msa x264_template(intra_satd_x3_16x16_msa)
|
||||||
|
void x264_intra_satd_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_intra_sa8d_x3_8x8_msa x264_template(intra_sa8d_x3_8x8_msa)
|
||||||
|
void x264_intra_sa8d_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36],
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_intra_satd_x3_8x8c_msa x264_template(intra_satd_x3_8x8c_msa)
|
||||||
|
void x264_intra_satd_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec,
|
||||||
|
int32_t p_sad_array[3] );
|
||||||
|
#define x264_pixel_var_16x16_msa x264_template(pixel_var_16x16_msa)
|
||||||
|
uint64_t x264_pixel_var_16x16_msa( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
#define x264_pixel_var_8x16_msa x264_template(pixel_var_8x16_msa)
|
||||||
|
uint64_t x264_pixel_var_8x16_msa( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
#define x264_pixel_var_8x8_msa x264_template(pixel_var_8x8_msa)
|
||||||
|
uint64_t x264_pixel_var_8x8_msa( uint8_t *p_pix, intptr_t i_stride );
|
||||||
|
#define x264_pixel_var2_8x16_msa x264_template(pixel_var2_8x16_msa)
|
||||||
|
int32_t x264_pixel_var2_8x16_msa( uint8_t *p_pix1, intptr_t i_stride1,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2,
|
||||||
|
int32_t *p_ssd );
|
||||||
|
#define x264_pixel_var2_8x8_msa x264_template(pixel_var2_8x8_msa)
|
||||||
|
int32_t x264_pixel_var2_8x8_msa( uint8_t *p_pix1, intptr_t i_stride1,
|
||||||
|
uint8_t *p_pix2, intptr_t i_stride2,
|
||||||
|
int32_t *p_ssd );
|
||||||
|
|
||||||
|
#endif
|
||||||
608
common/mips/predict-c.c
Normal file
608
common/mips/predict-c.c
Normal file
@@ -0,0 +1,608 @@
|
|||||||
|
/*****************************************************************************
|
||||||
|
* predict-c.c: msa intra prediction
|
||||||
|
*****************************************************************************
|
||||||
|
* Copyright (C) 2015-2025 x264 project
|
||||||
|
*
|
||||||
|
* Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||||
|
*
|
||||||
|
* This program is also available under a commercial proprietary license.
|
||||||
|
* For more information, contact us at licensing@x264.com.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common/common.h"
|
||||||
|
#include "macros.h"
|
||||||
|
#include "predict.h"
|
||||||
|
|
||||||
|
#if !HIGH_BIT_DEPTH
|
||||||
|
/* Vertical 4x4 prediction: the value loaded from p_src is written to four
 * consecutive rows of p_dst, i_dst_stride bytes apart.
 * LW and SW4 are project macros from macros.h (not visible here; presumably a
 * 32-bit load and four strided 32-bit stores). */
static void intra_predict_vert_4x4_msa( uint8_t *p_src, uint8_t *p_dst,
                                        int32_t i_dst_stride )
{
    const uint32_t u_top_row = LW( p_src );

    SW4( u_top_row, u_top_row, u_top_row, u_top_row, p_dst, i_dst_stride );
}
|
||||||
|
|
||||||
|
/* Vertical 8x8 prediction: the value loaded from p_src is written to eight
 * consecutive rows of p_dst, as two groups of four rows.
 * LD and SD4 are project macros from macros.h (not visible here; presumably a
 * 64-bit load and four strided 64-bit stores). */
static void intra_predict_vert_8x8_msa( uint8_t *p_src, uint8_t *p_dst,
                                        int32_t i_dst_stride )
{
    const uint64_t u_top_row = LD( p_src );
    uint8_t *p_lower_half = p_dst + ( 4 * i_dst_stride );

    /* Rows 0-3, then rows 4-7. */
    SD4( u_top_row, u_top_row, u_top_row, u_top_row, p_dst, i_dst_stride );
    SD4( u_top_row, u_top_row, u_top_row, u_top_row, p_lower_half,
         i_dst_stride );
}
|
||||||
|
|
||||||
|
static void intra_predict_vert_16x16_msa( uint8_t *p_src, uint8_t *p_dst,
|
||||||
|
int32_t i_dst_stride )
|
||||||
|
{
|
||||||
|
v16u8 src0 = LD_UB( p_src );
|
||||||
|
|
||||||
|
ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst,
|
||||||
|
i_dst_stride );
|
||||||
|
p_dst += ( 8 * i_dst_stride );
|
||||||
|
ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst,
|
||||||
|
i_dst_stride );
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Horizontal 4x4 prediction: for each of the four rows, the byte at
 * p_src[row * i_src_stride] is replicated into all four bytes of a 32-bit
 * word, and the four words are handed to SW4 together with p_dst and
 * i_dst_stride.
 * SW4 is a project macro from macros.h (not visible here; presumably four
 * strided 32-bit stores). */
static void intra_predict_horiz_4x4_msa( uint8_t *p_src, int32_t i_src_stride,
                                         uint8_t *p_dst, int32_t i_dst_stride )
{
    uint32_t u_out0, u_out1, u_out2, u_out3;

    /* The splat constant must be unsigned: the uint8_t operand is promoted to
     * int, and with a plain int constant any pixel value >= 128 would
     * overflow a signed 32-bit multiplication (undefined behaviour). */
    u_out0 = p_src[0 * i_src_stride] * 0x01010101u;
    u_out1 = p_src[1 * i_src_stride] * 0x01010101u;
    u_out2 = p_src[2 * i_src_stride] * 0x01010101u;
    u_out3 = p_src[3 * i_src_stride] * 0x01010101u;

    SW4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
}
|
||||||
|
|
||||||
|
/* Horizontal 8x8 prediction: for each of the eight rows, the byte at
 * p_src[row * i_src_stride] is replicated into all eight bytes of a 64-bit
 * word; the words are then stored as two groups of four rows.
 * SD4 is a project macro from macros.h (not visible here). */
static void intra_predict_horiz_8x8_msa( uint8_t *p_src, int32_t i_src_stride,
                                         uint8_t *p_dst, int32_t i_dst_stride )
{
    uint64_t u_rows[8];
    int32_t i_row;

    /* Build every row word before anything is written. */
    for( i_row = 0; i_row < 8; i_row++ )
    {
        u_rows[i_row] = p_src[i_row * i_src_stride] * 0x0101010101010101ull;
    }

    SD4( u_rows[0], u_rows[1], u_rows[2], u_rows[3], p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    SD4( u_rows[4], u_rows[5], u_rows[6], u_rows[7], p_dst, i_dst_stride );
}
|
||||||
|
|
||||||
|
static void intra_predict_horiz_16x16_msa( uint8_t *p_src, int32_t i_src_stride,
|
||||||
|
uint8_t *p_dst,
|
||||||
|
int32_t i_dst_stride )
|
||||||
|
{
|
||||||
|
uint32_t u_row;
|
||||||
|
uint8_t u_inp0, u_inp1, u_inp2, u_inp3;
|
||||||
|
v16u8 src0, src1, src2, src3;
|
||||||
|
|
||||||
|
for( u_row = 4; u_row--; )
|
||||||
|
{
|
||||||
|
u_inp0 = p_src[0];
|
||||||
|
p_src += i_src_stride;
|
||||||
|
u_inp1 = p_src[0];
|
||||||
|
p_src += i_src_stride;
|
||||||
|
u_inp2 = p_src[0];
|
||||||
|
p_src += i_src_stride;
|
||||||
|
u_inp3 = p_src[0];
|
||||||
|
p_src += i_src_stride;
|
||||||
|
|
||||||
|
src0 = ( v16u8 ) __msa_fill_b( u_inp0 );
|
||||||
|
src1 = ( v16u8 ) __msa_fill_b( u_inp1 );
|
||||||
|
src2 = ( v16u8 ) __msa_fill_b( u_inp2 );
|
||||||
|
src3 = ( v16u8 ) __msa_fill_b( u_inp3 );
|
||||||
|
|
||||||
|
ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
|
||||||
|
p_dst += ( 4 * i_dst_stride );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void intra_predict_dc_4x4_msa( uint8_t *p_src_top, uint8_t *p_src_left,
|
||||||
|
int32_t i_src_stride_left,
|
||||||
|
uint8_t *p_dst, int32_t i_dst_stride,
|
||||||
|
uint8_t is_above, uint8_t is_left )
|
||||||
|
{
|
||||||
|
uint32_t u_row;
|
||||||
|
uint32_t u_out, u_addition = 0;
|
||||||
|
v16u8 src_above, store;
|
||||||
|
v8u16 sum_above;
|
||||||
|
v4u32 sum;
|
||||||
|
|
||||||
|
if( is_left && is_above )
|
||||||
|
{
|
||||||
|
src_above = LD_UB( p_src_top );
|
||||||
|
|
||||||
|
sum_above = __msa_hadd_u_h( src_above, src_above );
|
||||||
|
sum = __msa_hadd_u_w( sum_above, sum_above );
|
||||||
|
u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 );
|
||||||
|
|
||||||
|
for( u_row = 0; u_row < 4; u_row++ )
|
||||||
|
{
|
||||||
|
u_addition += p_src_left[u_row * i_src_stride_left];
|
||||||
|
}
|
||||||
|
|
||||||
|
u_addition = ( u_addition + 4 ) >> 3;
|
||||||
|
store = ( v16u8 ) __msa_fill_b( u_addition );
|
||||||
|
}
|
||||||
|
else if( is_left )
|
||||||
|
{
|
||||||
|
for( u_row = 0; u_row < 4; u_row++ )
|
||||||
|
{
|
||||||
|
u_addition += p_src_left[u_row * i_src_stride_left];
|
||||||
|
}
|
||||||
|
|
||||||
|
u_addition = ( u_addition + 2 ) >> 2;
|
||||||
|
store = ( v16u8 ) __msa_fill_b( u_addition );
|
||||||
|
}
|
||||||
|
else if( is_above )
|
||||||
|
{
|
||||||
|
src_above = LD_UB( p_src_top );
|
||||||
|
|
||||||
|
sum_above = __msa_hadd_u_h( src_above, src_above );
|
||||||
|
sum = __msa_hadd_u_w( sum_above, sum_above );
|
||||||
|
sum = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum, 2 );
|
||||||
|
store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
store = ( v16u8 ) __msa_ldi_b( 128 );
|
||||||
|
}
|
||||||
|
|
||||||
|
u_out = __msa_copy_u_w( ( v4i32 ) store, 0 );
|
||||||
|
|
||||||
|
SW4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride );
|
||||||
|
}
|
||||||
|
|
||||||
|
static void intra_predict_dc_8x8_msa( uint8_t *p_src_top, uint8_t *p_src_left,
|
||||||
|
uint8_t *p_dst, int32_t i_dst_stride )
|
||||||
|
{
|
||||||
|
uint64_t u_val0, u_val1;
|
||||||
|
v16i8 store;
|
||||||
|
v16u8 src = { 0 };
|
||||||
|
v8u16 sum_h;
|
||||||
|
v4u32 sum_w;
|
||||||
|
v2u64 sum_d;
|
||||||
|
|
||||||
|
u_val0 = LD( p_src_top );
|
||||||
|
u_val1 = LD( p_src_left );
|
||||||
|
INSERT_D2_UB( u_val0, u_val1, src );
|
||||||
|
sum_h = __msa_hadd_u_h( src, src );
|
||||||
|
sum_w = __msa_hadd_u_w( sum_h, sum_h );
|
||||||
|
sum_d = __msa_hadd_u_d( sum_w, sum_w );
|
||||||
|
sum_w = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum_d, ( v4i32 ) sum_d );
|
||||||
|
sum_d = __msa_hadd_u_d( sum_w, sum_w );
|
||||||
|
sum_w = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum_d, 4 );
|
||||||
|
store = __msa_splati_b( ( v16i8 ) sum_w, 0 );
|
||||||
|
u_val0 = __msa_copy_u_d( ( v2i64 ) store, 0 );
|
||||||
|
|
||||||
|
SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride );
|
||||||
|
p_dst += ( 4 * i_dst_stride );
|
||||||
|
SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride );
|
||||||
|
}
|
||||||
|
|
||||||
|
static void intra_predict_dc_16x16_msa( uint8_t *p_src_top, uint8_t *p_src_left,
|
||||||
|
int32_t i_src_stride_left,
|
||||||
|
uint8_t *p_dst, int32_t i_dst_stride,
|
||||||
|
uint8_t is_above, uint8_t is_left )
|
||||||
|
{
|
||||||
|
uint32_t u_row;
|
||||||
|
uint32_t u_addition = 0;
|
||||||
|
v16u8 src_above, store;
|
||||||
|
v8u16 sum_above;
|
||||||
|
v4u32 sum_top;
|
||||||
|
v2u64 sum;
|
||||||
|
|
||||||
|
if( is_left && is_above )
|
||||||
|
{
|
||||||
|
src_above = LD_UB( p_src_top );
|
||||||
|
|
||||||
|
sum_above = __msa_hadd_u_h( src_above, src_above );
|
||||||
|
sum_top = __msa_hadd_u_w( sum_above, sum_above );
|
||||||
|
sum = __msa_hadd_u_d( sum_top, sum_top );
|
||||||
|
sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum );
|
||||||
|
sum = __msa_hadd_u_d( sum_top, sum_top );
|
||||||
|
u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 );
|
||||||
|
|
||||||
|
for( u_row = 0; u_row < 16; u_row++ )
|
||||||
|
{
|
||||||
|
u_addition += p_src_left[u_row * i_src_stride_left];
|
||||||
|
}
|
||||||
|
|
||||||
|
u_addition = ( u_addition + 16 ) >> 5;
|
||||||
|
store = ( v16u8 ) __msa_fill_b( u_addition );
|
||||||
|
}
|
||||||
|
else if( is_left )
|
||||||
|
{
|
||||||
|
for( u_row = 0; u_row < 16; u_row++ )
|
||||||
|
{
|
||||||
|
u_addition += p_src_left[u_row * i_src_stride_left];
|
||||||
|
}
|
||||||
|
|
||||||
|
u_addition = ( u_addition + 8 ) >> 4;
|
||||||
|
store = ( v16u8 ) __msa_fill_b( u_addition );
|
||||||
|
}
|
||||||
|
else if( is_above )
|
||||||
|
{
|
||||||
|
src_above = LD_UB( p_src_top );
|
||||||
|
|
||||||
|
sum_above = __msa_hadd_u_h( src_above, src_above );
|
||||||
|
sum_top = __msa_hadd_u_w( sum_above, sum_above );
|
||||||
|
sum = __msa_hadd_u_d( sum_top, sum_top );
|
||||||
|
sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum );
|
||||||
|
sum = __msa_hadd_u_d( sum_top, sum_top );
|
||||||
|
sum = ( v2u64 ) __msa_srari_d( ( v2i64 ) sum, 4 );
|
||||||
|
store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
store = ( v16u8 ) __msa_ldi_b( 128 );
|
||||||
|
}
|
||||||
|
|
||||||
|
ST_UB8( store, store, store, store, store, store, store, store, p_dst,
|
||||||
|
i_dst_stride );
|
||||||
|
p_dst += ( 8 * i_dst_stride );
|
||||||
|
ST_UB8( store, store, store, store, store, store, store, store, p_dst,
|
||||||
|
i_dst_stride );
|
||||||
|
}
|
||||||
|
|
||||||
|
static void intra_predict_plane_8x8_msa( uint8_t *p_src, int32_t i_stride )
|
||||||
|
{
|
||||||
|
uint8_t u_lpcnt;
|
||||||
|
int32_t i_res, i_res0, i_res1, i_res2, i_res3;
|
||||||
|
uint64_t u_out0, u_out1;
|
||||||
|
v16i8 shf_mask = { 3, 5, 2, 6, 1, 7, 0, 8, 3, 5, 2, 6, 1, 7, 0, 8 };
|
||||||
|
v8i16 short_multiplier = { 1, 2, 3, 4, 1, 2, 3, 4 };
|
||||||
|
v4i32 int_multiplier = { 0, 1, 2, 3 };
|
||||||
|
v16u8 p_src_top;
|
||||||
|
v8i16 vec9, vec10, vec11;
|
||||||
|
v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8;
|
||||||
|
v2i64 sum;
|
||||||
|
|
||||||
|
p_src_top = LD_UB( p_src - ( i_stride + 1 ) );
|
||||||
|
p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top,
|
||||||
|
( v16i8 ) p_src_top );
|
||||||
|
|
||||||
|
vec9 = __msa_hsub_u_h( p_src_top, p_src_top );
|
||||||
|
vec9 *= short_multiplier;
|
||||||
|
vec8 = __msa_hadd_s_w( vec9, vec9 );
|
||||||
|
sum = __msa_hadd_s_d( vec8, vec8 );
|
||||||
|
|
||||||
|
i_res0 = __msa_copy_s_w( ( v4i32 ) sum, 0 );
|
||||||
|
|
||||||
|
i_res1 = ( p_src[4 * i_stride - 1] - p_src[2 * i_stride - 1] ) +
|
||||||
|
2 * ( p_src[5 * i_stride - 1] - p_src[i_stride - 1] ) +
|
||||||
|
3 * ( p_src[6 * i_stride - 1] - p_src[-1] ) +
|
||||||
|
4 * ( p_src[7 * i_stride - 1] - p_src[-i_stride - 1] );
|
||||||
|
|
||||||
|
i_res0 *= 17;
|
||||||
|
i_res1 *= 17;
|
||||||
|
i_res0 = ( i_res0 + 16 ) >> 5;
|
||||||
|
i_res1 = ( i_res1 + 16 ) >> 5;
|
||||||
|
|
||||||
|
i_res3 = 3 * ( i_res0 + i_res1 );
|
||||||
|
i_res2 = 16 * ( p_src[7 * i_stride - 1] + p_src[-i_stride + 7] + 1 );
|
||||||
|
i_res = i_res2 - i_res3;
|
||||||
|
|
||||||
|
vec8 = __msa_fill_w( i_res0 );
|
||||||
|
vec4 = __msa_fill_w( i_res );
|
||||||
|
vec2 = __msa_fill_w( i_res1 );
|
||||||
|
vec5 = vec8 * int_multiplier;
|
||||||
|
vec3 = vec8 * 4;
|
||||||
|
|
||||||
|
for( u_lpcnt = 4; u_lpcnt--; )
|
||||||
|
{
|
||||||
|
vec0 = vec5;
|
||||||
|
vec0 += vec4;
|
||||||
|
vec1 = vec0 + vec3;
|
||||||
|
vec6 = vec5;
|
||||||
|
vec4 += vec2;
|
||||||
|
vec6 += vec4;
|
||||||
|
vec7 = vec6 + vec3;
|
||||||
|
|
||||||
|
SRA_4V( vec0, vec1, vec6, vec7, 5 );
|
||||||
|
PCKEV_H2_SH( vec1, vec0, vec7, vec6, vec10, vec11 );
|
||||||
|
CLIP_SH2_0_255( vec10, vec11 );
|
||||||
|
PCKEV_B2_SH( vec10, vec10, vec11, vec11, vec10, vec11 );
|
||||||
|
|
||||||
|
u_out0 = __msa_copy_s_d( ( v2i64 ) vec10, 0 );
|
||||||
|
u_out1 = __msa_copy_s_d( ( v2i64 ) vec11, 0 );
|
||||||
|
SD( u_out0, p_src );
|
||||||
|
p_src += i_stride;
|
||||||
|
SD( u_out1, p_src );
|
||||||
|
p_src += i_stride;
|
||||||
|
|
||||||
|
vec4 += vec2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void intra_predict_plane_16x16_msa( uint8_t *p_src, int32_t i_stride )
|
||||||
|
{
|
||||||
|
uint8_t u_lpcnt;
|
||||||
|
int32_t i_res0, i_res1, i_res2, i_res3;
|
||||||
|
uint64_t u_load0, u_load1;
|
||||||
|
v16i8 shf_mask = { 7, 8, 6, 9, 5, 10, 4, 11, 3, 12, 2, 13, 1, 14, 0, 15 };
|
||||||
|
v8i16 short_multiplier = { 1, 2, 3, 4, 5, 6, 7, 8 };
|
||||||
|
v4i32 int_multiplier = { 0, 1, 2, 3 };
|
||||||
|
v16u8 p_src_top = { 0 };
|
||||||
|
v8i16 vec9, vec10;
|
||||||
|
v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, res_add;
|
||||||
|
|
||||||
|
u_load0 = LD( p_src - ( i_stride + 1 ) );
|
||||||
|
u_load1 = LD( p_src - ( i_stride + 1 ) + 9 );
|
||||||
|
|
||||||
|
INSERT_D2_UB( u_load0, u_load1, p_src_top );
|
||||||
|
|
||||||
|
p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top,
|
||||||
|
( v16i8 ) p_src_top );
|
||||||
|
|
||||||
|
vec9 = __msa_hsub_u_h( p_src_top, p_src_top );
|
||||||
|
vec9 *= short_multiplier;
|
||||||
|
vec8 = __msa_hadd_s_w( vec9, vec9 );
|
||||||
|
res_add = ( v4i32 ) __msa_hadd_s_d( vec8, vec8 );
|
||||||
|
|
||||||
|
i_res0 = __msa_copy_s_w( res_add, 0 ) + __msa_copy_s_w( res_add, 2 );
|
||||||
|
|
||||||
|
i_res1 = ( p_src[8 * i_stride - 1] - p_src[6 * i_stride - 1] ) +
|
||||||
|
2 * ( p_src[9 * i_stride - 1] - p_src[5 * i_stride - 1] ) +
|
||||||
|
3 * ( p_src[10 * i_stride - 1] - p_src[4 * i_stride - 1] ) +
|
||||||
|
4 * ( p_src[11 * i_stride - 1] - p_src[3 * i_stride - 1] ) +
|
||||||
|
5 * ( p_src[12 * i_stride - 1] - p_src[2 * i_stride - 1] ) +
|
||||||
|
6 * ( p_src[13 * i_stride - 1] - p_src[i_stride - 1] ) +
|
||||||
|
7 * ( p_src[14 * i_stride - 1] - p_src[-1] ) +
|
||||||
|
8 * ( p_src[15 * i_stride - 1] - p_src[-1 * i_stride - 1] );
|
||||||
|
|
||||||
|
i_res0 *= 5;
|
||||||
|
i_res1 *= 5;
|
||||||
|
i_res0 = ( i_res0 + 32 ) >> 6;
|
||||||
|
i_res1 = ( i_res1 + 32 ) >> 6;
|
||||||
|
|
||||||
|
i_res3 = 7 * ( i_res0 + i_res1 );
|
||||||
|
i_res2 = 16 * ( p_src[15 * i_stride - 1] + p_src[-i_stride + 15] + 1 );
|
||||||
|
i_res2 -= i_res3;
|
||||||
|
|
||||||
|
vec8 = __msa_fill_w( i_res0 );
|
||||||
|
vec4 = __msa_fill_w( i_res2 );
|
||||||
|
vec5 = __msa_fill_w( i_res1 );
|
||||||
|
vec6 = vec8 * 4;
|
||||||
|
vec7 = vec8 * int_multiplier;
|
||||||
|
|
||||||
|
for( u_lpcnt = 16; u_lpcnt--; )
|
||||||
|
{
|
||||||
|
vec0 = vec7;
|
||||||
|
vec0 += vec4;
|
||||||
|
vec1 = vec0 + vec6;
|
||||||
|
vec2 = vec1 + vec6;
|
||||||
|
vec3 = vec2 + vec6;
|
||||||
|
|
||||||
|
SRA_4V( vec0, vec1, vec2, vec3, 5 );
|
||||||
|
PCKEV_H2_SH( vec1, vec0, vec3, vec2, vec9, vec10 );
|
||||||
|
CLIP_SH2_0_255( vec9, vec10 );
|
||||||
|
PCKEV_ST_SB( vec9, vec10, p_src );
|
||||||
|
p_src += i_stride;
|
||||||
|
|
||||||
|
vec4 += vec5;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void intra_predict_dc_4blk_8x8_msa( uint8_t *p_src, int32_t i_stride )
|
||||||
|
{
|
||||||
|
uint8_t u_lp_cnt;
|
||||||
|
uint32_t u_src0, u_src1, u_src3, u_src2 = 0;
|
||||||
|
uint32_t u_out0, u_out1, u_out2, u_out3;
|
||||||
|
v16u8 p_src_top;
|
||||||
|
v8u16 add;
|
||||||
|
v4u32 sum;
|
||||||
|
|
||||||
|
p_src_top = LD_UB( p_src - i_stride );
|
||||||
|
add = __msa_hadd_u_h( ( v16u8 ) p_src_top, ( v16u8 ) p_src_top );
|
||||||
|
sum = __msa_hadd_u_w( add, add );
|
||||||
|
u_src0 = __msa_copy_u_w( ( v4i32 ) sum, 0 );
|
||||||
|
u_src1 = __msa_copy_u_w( ( v4i32 ) sum, 1 );
|
||||||
|
|
||||||
|
for( u_lp_cnt = 0; u_lp_cnt < 4; u_lp_cnt++ )
|
||||||
|
{
|
||||||
|
u_src0 += p_src[u_lp_cnt * i_stride - 1];
|
||||||
|
u_src2 += p_src[( 4 + u_lp_cnt ) * i_stride - 1];
|
||||||
|
}
|
||||||
|
|
||||||
|
u_src0 = ( u_src0 + 4 ) >> 3;
|
||||||
|
u_src3 = ( u_src1 + u_src2 + 4 ) >> 3;
|
||||||
|
u_src1 = ( u_src1 + 2 ) >> 2;
|
||||||
|
u_src2 = ( u_src2 + 2 ) >> 2;
|
||||||
|
|
||||||
|
u_out0 = u_src0 * 0x01010101;
|
||||||
|
u_out1 = u_src1 * 0x01010101;
|
||||||
|
u_out2 = u_src2 * 0x01010101;
|
||||||
|
u_out3 = u_src3 * 0x01010101;
|
||||||
|
|
||||||
|
for( u_lp_cnt = 4; u_lp_cnt--; )
|
||||||
|
{
|
||||||
|
SW( u_out0, p_src );
|
||||||
|
SW( u_out1, ( p_src + 4 ) );
|
||||||
|
SW( u_out2, ( p_src + 4 * i_stride ) );
|
||||||
|
SW( u_out3, ( p_src + 4 * i_stride + 4 ) );
|
||||||
|
p_src += i_stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t *p_dst,
|
||||||
|
int32_t i_dst_stride )
|
||||||
|
{
|
||||||
|
uint8_t u_src_val = p_src[15];
|
||||||
|
uint64_t u_out0, u_out1, u_out2, u_out3;
|
||||||
|
v16u8 src, vec4, vec5, res0;
|
||||||
|
v8u16 vec0, vec1, vec2, vec3;
|
||||||
|
v2i64 res1, res2, res3;
|
||||||
|
|
||||||
|
src = LD_UB( p_src );
|
||||||
|
|
||||||
|
vec4 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 1 );
|
||||||
|
vec5 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 2 );
|
||||||
|
vec5 = ( v16u8 ) __msa_insert_b( ( v16i8 ) vec5, 14, u_src_val );
|
||||||
|
ILVR_B2_UH( vec5, src, vec4, vec4, vec0, vec1 );
|
||||||
|
ILVL_B2_UH( vec5, src, vec4, vec4, vec2, vec3 );
|
||||||
|
HADD_UB4_UH( vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3 );
|
||||||
|
|
||||||
|
vec0 += vec1;
|
||||||
|
vec2 += vec3;
|
||||||
|
vec0 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec0, 2 );
|
||||||
|
vec2 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec2, 2 );
|
||||||
|
|
||||||
|
res0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec2, ( v16i8 ) vec0 );
|
||||||
|
res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 );
|
||||||
|
res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 );
|
||||||
|
res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 );
|
||||||
|
|
||||||
|
u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 );
|
||||||
|
u_out1 = __msa_copy_u_d( res1, 0 );
|
||||||
|
u_out2 = __msa_copy_u_d( res2, 0 );
|
||||||
|
u_out3 = __msa_copy_u_d( res3, 0 );
|
||||||
|
SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
|
||||||
|
p_dst += ( 4 * i_dst_stride );
|
||||||
|
|
||||||
|
res0 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 4 );
|
||||||
|
res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 );
|
||||||
|
res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 );
|
||||||
|
res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 );
|
||||||
|
|
||||||
|
u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 );
|
||||||
|
u_out1 = __msa_copy_u_d( res1, 0 );
|
||||||
|
u_out2 = __msa_copy_u_d( res2, 0 );
|
||||||
|
u_out3 = __msa_copy_u_d( res3, 0 );
|
||||||
|
SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
|
||||||
|
}
|
||||||
|
|
||||||
|
static void intra_predict_128dc_16x16_msa( uint8_t *p_dst,
|
||||||
|
int32_t i_dst_stride )
|
||||||
|
{
|
||||||
|
v16u8 out = ( v16u8 ) __msa_ldi_b( 128 );
|
||||||
|
|
||||||
|
ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride );
|
||||||
|
p_dst += ( 8 * i_dst_stride );
|
||||||
|
ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_predict_dc_16x16_msa( uint8_t *p_src )
|
||||||
|
{
|
||||||
|
intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
|
||||||
|
FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_predict_dc_left_16x16_msa( uint8_t *p_src )
|
||||||
|
{
|
||||||
|
intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
|
||||||
|
FDEC_STRIDE, p_src, FDEC_STRIDE, 0, 1 );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_predict_dc_top_16x16_msa( uint8_t *p_src )
|
||||||
|
{
|
||||||
|
intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
|
||||||
|
FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 0 );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_predict_dc_128_16x16_msa( uint8_t *p_src )
|
||||||
|
{
|
||||||
|
intra_predict_128dc_16x16_msa( p_src, FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_predict_hor_16x16_msa( uint8_t *p_src )
|
||||||
|
{
|
||||||
|
intra_predict_horiz_16x16_msa( ( p_src - 1 ), FDEC_STRIDE,
|
||||||
|
p_src, FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_predict_vert_16x16_msa( uint8_t *p_src )
|
||||||
|
{
|
||||||
|
intra_predict_vert_16x16_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_predict_plane_16x16_msa( uint8_t *p_src )
|
||||||
|
{
|
||||||
|
intra_predict_plane_16x16_msa( p_src, FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_predict_dc_4blk_8x8_msa( uint8_t *p_src )
|
||||||
|
{
|
||||||
|
intra_predict_dc_4blk_8x8_msa( p_src, FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_predict_hor_8x8_msa( uint8_t *p_src )
|
||||||
|
{
|
||||||
|
intra_predict_horiz_8x8_msa( ( p_src - 1 ), FDEC_STRIDE,
|
||||||
|
p_src, FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_predict_vert_8x8_msa( uint8_t *p_src )
|
||||||
|
{
|
||||||
|
intra_predict_vert_8x8_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_predict_plane_8x8_msa( uint8_t *p_src )
|
||||||
|
{
|
||||||
|
intra_predict_plane_8x8_msa( p_src, FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
|
||||||
|
{
|
||||||
|
intra_predict_ddl_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_predict_dc_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
|
||||||
|
{
|
||||||
|
intra_predict_dc_8x8_msa( ( pu_xyz + 16 ), ( pu_xyz + 7 ),
|
||||||
|
p_src, FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_predict_h_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
|
||||||
|
{
|
||||||
|
intra_predict_horiz_8x8_msa( ( pu_xyz + 14 ), -1, p_src, FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_predict_v_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
|
||||||
|
{
|
||||||
|
intra_predict_vert_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_predict_dc_4x4_msa( uint8_t *p_src )
|
||||||
|
{
|
||||||
|
intra_predict_dc_4x4_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
|
||||||
|
FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_predict_hor_4x4_msa( uint8_t *p_src )
|
||||||
|
{
|
||||||
|
intra_predict_horiz_4x4_msa( ( p_src - 1 ), FDEC_STRIDE,
|
||||||
|
p_src, FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x264_intra_predict_vert_4x4_msa( uint8_t *p_src )
|
||||||
|
{
|
||||||
|
intra_predict_vert_4x4_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
|
||||||
|
}
|
||||||
|
#endif
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user