x264 source for verification 2026-05-22

This commit is contained in:
2026-05-22 16:45:04 +08:00
commit 4647f166e5
270 changed files with 166522 additions and 0 deletions

51
.gitignore vendored Normal file
View File

@@ -0,0 +1,51 @@
# Files kept out of version control.
# NOTE(review): the group headings below are inferred from the patterns
# themselves; adjust them if a grouping turns out to be misleading.

# Editor leftovers, patch/merge debris, objects, libraries and similar by-products
*~
*.a
*.d
*.diff
*.orig
*.rej
*.dll*
*.exe
*.def
*.lib
*.pdb
*.mo
*.o
*.patch
*.pc
*.pot
*.so*
*.dylib
.*.swp
.depend
.DS_Store
TAGS
# Configuration outputs (the Makefile includes config.mak and reads config.h)
config.h
config.mak
config.log
x264_config.h
# The x264 binary and the checkasm name (both are make targets in the CI script)
x264
checkasm
# Streams, video files, logs and other per-run files
*.264
*.h264
*.2pass
*.ffindex
*.avs
*.mkv
*.flv
*.mp4
*.y4m
*.yuv
*.log
*.mbtree
*.temp
# Miscellaneous tool output
*.pyc
*.pgd
*.pgc
.digress_x264
dataDec.txt
log.dec
common/oclobj.h
x264_lookahead.clbin

339
.gitlab-ci.yml Normal file
View File

@@ -0,0 +1,339 @@
# Pipeline stages. The job templates below (.build, .test, .release) each
# select one of these through their `stage:` key.
stages:
- build
- test
- release
# Per-platform variable sets. Each hidden key carries a YAML anchor that the
# jobs below pull in with `variables: *<anchor>`.
#   _TRIPLET          target triplet, passed to configure as --host and used
#                     to form --cross-prefix
#   _ARCH, _OS        passed to the FFmpeg configure as --arch / --target-os
#   _PLATFORMSUFFIX   appended to the x264/checkasm binary names
#                     (".exe" in the win* sets, empty elsewhere)
#   _WRAPPER          command placed in front of the checkasm binaries by the
#                     test jobs ("wine" for win32/win64, empty elsewhere)
#   _XCFLAGS, _XLDFLAGS  extra compiler/linker flags used by the macOS builds
#   _BIN_PATH         directory the macOS builds prepend to PATH
#   _CLANG_TRIPLET, _ANDROID_VERSION  concatenated to form the Android clang
#                     compiler name
.variables-debian-amd64: &variables-debian-amd64
_TRIPLET: ""
_PLATFORMSUFFIX: ""
_WRAPPER: ""
.variables-debian-aarch64: &variables-debian-aarch64
_TRIPLET: ""
_PLATFORMSUFFIX: ""
_WRAPPER: ""
.variables-win32: &variables-win32
_TRIPLET: "i686-w64-mingw32"
_ARCH: "i686"
_OS: "mingw32"
_PLATFORMSUFFIX: ".exe"
_WRAPPER: "wine"
.variables-win64: &variables-win64
_TRIPLET: "x86_64-w64-mingw32"
_ARCH: "x86_64"
_OS: "mingw32"
_PLATFORMSUFFIX: ".exe"
_WRAPPER: "wine"
.variables-win-armv7: &variables-win-armv7
_TRIPLET: "armv7-w64-mingw32"
_PLATFORMSUFFIX: ".exe"
_WRAPPER: ""
.variables-win-aarch64: &variables-win-aarch64
_TRIPLET: "aarch64-w64-mingw32"
_PLATFORMSUFFIX: ".exe"
_WRAPPER: ""
.variables-macos-x86_64: &variables-macos-x86_64
_TRIPLET: "x86_64-apple-darwin19"
_ARCH: "x86_64"
_OS: "darwin"
_PLATFORMSUFFIX: ""
_WRAPPER: ""
_XCFLAGS: "-arch x86_64"
_XLDFLAGS: "-arch x86_64"
_BIN_PATH: /Users/videolanci/sandbox/bin
.variables-macos-arm64: &variables-macos-arm64
_TRIPLET: "aarch64-apple-darwin19"
_ARCH: "aarch64"
_OS: "darwin"
_PLATFORMSUFFIX: ""
_WRAPPER: ""
_XCFLAGS: "-arch arm64"
_XLDFLAGS: "-arch arm64"
_BIN_PATH: /Users/videolanci/sandbox/bin
.variables-android-arm: &variables-android-arm
_TRIPLET: "arm-linux-androideabi"
_CLANG_TRIPLET: "armv7a-linux-androideabi"
_ANDROID_VERSION: "21"
_PLATFORMSUFFIX: ""
_WRAPPER: ""
.variables-android-aarch64: &variables-android-aarch64
_TRIPLET: "aarch64-linux-android"
_CLANG_TRIPLET: "aarch64-linux-android"
_ANDROID_VERSION: "21"
_PLATFORMSUFFIX: ""
_WRAPPER: ""
# Base build template; the platform jobs extend it and may override `script`.
# The script clones FFmpeg and L-SMASH (depth 1, master branch), installs both
# into ./local_install, exports PKG_CONFIG_LIBDIR to that prefix's
# lib/pkgconfig, then configures x264 and builds the `x264` and `checkasm`
# make targets. All three configure calls pass -static as extra linker flags.
# Artifacts: the x264 binary, checkasm8, checkasm10 and config.log, kept for
# one week.
.build:
stage: build
script: |
set -x
LOCAL_INSTALL_DIR=`pwd`/local_install
export PKG_CONFIG_LIBDIR=${LOCAL_INSTALL_DIR}/lib/pkgconfig
git clone --depth 1 --branch master https://git.ffmpeg.org/ffmpeg.git ffmpeg
cd ffmpeg
./configure --prefix="${LOCAL_INSTALL_DIR}" --enable-pic --disable-debug --disable-programs --disable-doc --disable-avdevice --disable-avfilter --disable-network --disable-encoders --disable-muxers --extra-ldflags="-static"
make -j$(getconf _NPROCESSORS_ONLN)
make -j$(getconf _NPROCESSORS_ONLN) install
cd ..
git clone --depth 1 --branch master https://github.com/l-smash/l-smash.git lsmash
cd lsmash
./configure --prefix="${LOCAL_INSTALL_DIR}" --extra-ldflags="-static"
make -j$(getconf _NPROCESSORS_ONLN)
make -j$(getconf _NPROCESSORS_ONLN) install
cd ..
./configure --enable-pic --enable-strip --extra-ldflags="-static"
make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm
artifacts:
name: "$CI_PROJECT_PATH_SLUG-$CI_JOB_NAME-$CI_COMMIT_SHORT_SHA"
paths:
- x264${_PLATFORMSUFFIX}
- checkasm8${_PLATFORMSUFFIX}
- checkasm10${_PLATFORMSUFFIX}
- config.log
expire_in: 1 week
# Debian build jobs. Both run the `.build` script unchanged; they differ only
# in container image, runner tags and variable set.
build-debian-amd64:
extends: .build
image: registry.videolan.org/vlc-debian-unstable:20240212151604
tags:
- docker
- amd64
variables: *variables-debian-amd64
build-debian-aarch64:
extends: .build
image: registry.videolan.org/x264-debian-unstable-aarch64:20211206141032
tags:
- docker
- aarch64
variables: *variables-debian-aarch64
# Windows cross-build template. It extends build-debian-amd64 and replaces the
# image and the script: FFmpeg and L-SMASH are installed under ./${_TRIPLET},
# and every configure call is given the "${_TRIPLET}-" cross prefix. Unlike
# `.build`, no -static linker flag is passed.
# build-win32 / build-win64 only select the variable set.
.build-win:
extends: build-debian-amd64
image: registry.videolan.org/vlc-debian-llvm-msvcrt:20240212151604
script: |
set -x
LOCAL_INSTALL_DIR=`pwd`/${_TRIPLET}
export PKGCONFIG=pkg-config
export PKG_CONFIG_LIBDIR=${LOCAL_INSTALL_DIR}/lib/pkgconfig
git clone --depth 1 --branch master https://git.ffmpeg.org/ffmpeg.git ffmpeg
cd ffmpeg
./configure --prefix="${LOCAL_INSTALL_DIR}" --enable-cross-compile --arch="${_ARCH}" --target-os="${_OS}" --cross-prefix="${_TRIPLET}-" --enable-pic --disable-debug --disable-programs --disable-doc --disable-avdevice --disable-avfilter --disable-network --disable-encoders --disable-muxers
make -j$(getconf _NPROCESSORS_ONLN)
make -j$(getconf _NPROCESSORS_ONLN) install
cd ..
git clone --depth 1 --branch master https://github.com/l-smash/l-smash.git lsmash
cd lsmash
./configure --prefix="${LOCAL_INSTALL_DIR}" --target-os="${_TRIPLET}" --cross-prefix="${_TRIPLET}-"
make -j$(getconf _NPROCESSORS_ONLN)
make -j$(getconf _NPROCESSORS_ONLN) install
cd ..
./configure --host="${_TRIPLET}" --cross-prefix="${_TRIPLET}-" --enable-pic --enable-strip
make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm
build-win32:
extends: .build-win
variables: *variables-win32
build-win64:
extends: .build-win
variables: *variables-win64
# Cross-build template for the armv7/aarch64 mingw triplets. The script only
# configures and builds x264 itself (no FFmpeg or L-SMASH step), passing
# PKGCONFIG=pkg-config to configure.
# No test or release jobs in this file reference these two builds.
.build-llvm-mingw:
extends: .build
image: registry.videolan.org/vlc-debian-llvm-ucrt:20240212151604
tags:
- docker
- amd64
script: |
set -x
PKGCONFIG=pkg-config ./configure --host="${_TRIPLET}" --cross-prefix="${_TRIPLET}-" --enable-pic --enable-strip
make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm
build-llvm-mingw-armv7:
extends: .build-llvm-mingw
variables: *variables-win-armv7
build-llvm-mingw-aarch64:
extends: .build-llvm-mingw
variables: *variables-win-aarch64
# macOS build template. The script prepends ${_BIN_PATH} to PATH, installs
# FFmpeg and L-SMASH under ./${_TRIPLET} using ${_XCFLAGS}/${_XLDFLAGS} as
# extra compiler/linker flags, then configures x264 with --host=${_TRIPLET}.
# No `image:` is set for these jobs; runners are selected by tag only.
.build-macos:
extends: .build
script: |
set -x
export PATH="${_BIN_PATH}:$PATH"
LOCAL_INSTALL_DIR=`pwd`/${_TRIPLET}
export PKG_CONFIG_LIBDIR=${LOCAL_INSTALL_DIR}/lib/pkgconfig
git clone --depth 1 --branch master https://git.ffmpeg.org/ffmpeg.git ffmpeg
cd ffmpeg
./configure --prefix="${LOCAL_INSTALL_DIR}" --enable-cross-compile --arch="${_ARCH}" --target-os="${_OS}" --extra-cflags="${_XCFLAGS}" --extra-ldflags="${_XLDFLAGS}" --enable-pic --disable-debug --disable-programs --disable-doc --disable-avdevice --disable-avfilter --disable-network --disable-encoders --disable-muxers
make -j$(getconf _NPROCESSORS_ONLN)
make -j$(getconf _NPROCESSORS_ONLN) install
cd ..
git clone --depth 1 --branch master https://github.com/l-smash/l-smash.git lsmash
cd lsmash
./configure --prefix="${LOCAL_INSTALL_DIR}" --target-os="${_TRIPLET}" --extra-cflags="${_XCFLAGS}" --extra-ldflags="${_XLDFLAGS}"
make -j$(getconf _NPROCESSORS_ONLN)
make -j$(getconf _NPROCESSORS_ONLN) install
cd ..
./configure --host="${_TRIPLET}" --enable-pic --enable-strip
make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm
build-macos-x86_64:
extends: .build-macos
tags:
- amd64
- monterey
variables: *variables-macos-x86_64
# NOTE(review): the arm64 job requests the same amd64/monterey runner tags as
# the x86_64 job - presumably a cross-build via "-arch arm64"; confirm this is
# intentional. This file has a release job for it but no test job.
build-macos-arm64:
extends: .build-macos
tags:
- amd64
- monterey
variables: *variables-macos-arm64
# Android build template. The script configures and builds x264 only, with CC
# set to "${_CLANG_TRIPLET}${_ANDROID_VERSION}-clang" and AR/RANLIB/STRIP set
# to the llvm-ar, llvm-ranlib and llvm-strip tools.
# No test or release jobs in this file reference these two builds.
.build-android:
extends: .build
image: registry.videolan.org/vlc-debian-android:20241118101328
tags:
- docker
- amd64
script: |
set -x
CC=${_CLANG_TRIPLET}${_ANDROID_VERSION}-clang AR=llvm-ar RANLIB=llvm-ranlib STRIP=llvm-strip PKGCONFIG=pkg-config ./configure --host="${_TRIPLET}" --enable-pic --enable-strip
make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm
build-android-arm:
extends: .build-android
variables: *variables-android-arm
build-android-aarch64:
extends: .build-android
variables: *variables-android-aarch64
# Test template: runs checkasm8 and checkasm10 (with ${_PLATFORMSUFFIX}
# appended) through ${_WRAPPER}; its artifacts expire after 10 minutes.
# Each test job merges this template with `<<: *test`, extends the matching
# build job, and names that build job under `dependencies`.
.test: &test
stage: test
script: |
set -x
${_WRAPPER} ./checkasm8${_PLATFORMSUFFIX}
${_WRAPPER} ./checkasm10${_PLATFORMSUFFIX}
artifacts:
expire_in: 10 minutes
test-debian-amd64:
<<: *test
extends: build-debian-amd64
dependencies:
- build-debian-amd64
variables: *variables-debian-amd64
test-debian-aarch64:
<<: *test
extends: build-debian-aarch64
dependencies:
- build-debian-aarch64
variables: *variables-debian-aarch64
test-win32:
<<: *test
extends: build-win32
dependencies:
- build-win32
variables: *variables-win32
test-win64:
<<: *test
extends: build-win64
dependencies:
- build-win64
variables: *variables-win64
test-macos-x86_64:
<<: *test
extends: build-macos-x86_64
dependencies:
- build-macos-x86_64
variables: *variables-macos-x86_64
# Emulated aarch64 test: extends the amd64 job but depends on the aarch64
# build, and replaces the template script. Both checkasm binaries are run
# under qemu-aarch64 once for each size in 128..2048, passing "sve<size>=on"
# in the -cpu option.
test-aarch64-qemu:
<<: *test
extends: build-debian-amd64
image: registry.videolan.org/x264-debian-unstable:20231113190916
dependencies:
- build-debian-aarch64
variables: *variables-debian-amd64
script: |
set -x
for size in 128 256 512 1024 2048; do
for tool in checkasm8 checkasm10; do
qemu-aarch64 -cpu max,sve-default-vector-length=256,sve$size=on -L /usr/aarch64-linux-gnu ./$tool
done
done
# Release template: a manual job, limited by `only:` to the master and stable
# refs of videolan/x264. The script derives _VERSION from the output of
# ./version.sh and renames the x264 binary to
# x264-<version>${_PLATFORMSUFFIX}; that renamed file is the only artifact and
# expires after 10 minutes.
# Each release job merges this template with `<<: *release`, extends the
# matching build job, and names that build job under `dependencies`.
.release: &release
stage: release
script: |
set -x
_VERSION=$(./version.sh | grep _VERSION -| cut -d\ -f4-| sed 's, ,-,g' | sed 's,",,')
mv x264${_PLATFORMSUFFIX} x264-${_VERSION}${_PLATFORMSUFFIX}
when: manual
only:
- master@videolan/x264
- stable@videolan/x264
artifacts:
name: "$CI_PROJECT_PATH_SLUG-$CI_JOB_NAME-$CI_COMMIT_SHORT_SHA"
paths:
- x264-*${_PLATFORMSUFFIX}
expire_in: '10 minutes'
release-debian-amd64:
<<: *release
extends: build-debian-amd64
dependencies:
- build-debian-amd64
variables: *variables-debian-amd64
release-debian-aarch64:
<<: *release
extends: build-debian-aarch64
dependencies:
- build-debian-aarch64
variables: *variables-debian-aarch64
release-win32:
<<: *release
extends: build-win32
dependencies:
- build-win32
variables: *variables-win32
release-win64:
<<: *release
extends: build-win64
dependencies:
- build-win64
variables: *variables-win64
release-macos-x86_64:
<<: *release
extends: build-macos-x86_64
dependencies:
- build-macos-x86_64
variables: *variables-macos-x86_64
release-macos-arm64:
<<: *release
extends: build-macos-arm64
dependencies:
- build-macos-arm64
variables: *variables-macos-arm64

99
AUTHORS Normal file
View File

@@ -0,0 +1,99 @@
# Contributors to x264
#
# The format of this file was inspired by the Linux kernel CREDITS file.
# Authors are listed alphabetically.
#
# The fields are: name (N), email (E), web-address (W), CVS account login (C),
# PGP key ID and fingerprint (P), description (D), and snail-mail address (S).
N: Alex Izvorski
E: aizvorski AT gmail DOT com
D: x86 asm (sse2)
N: Alex Wright
E: alexw0885 AT gmail DOT com
D: Motion estimation (subpel and mixed refs)
D: B-RDO
N: bobololo
D: Avisynth input
D: MP4 muxing
N: Christian Heine
E: sennindemokrit AT gmx DOT net
D: x86 asm
N: David Wolstencroft
D: Altivec optimizations
N: Eric Petit
E: eric.petit AT lapsus DOT org
C: titer
D: Altivec asm
D: BeOS and MacOS X ports.
S: France
N: Fiona Glaser
E: fiona AT x264 DOT com
D: Maintainer
D: All areas of encoder analysis and algorithms
D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc
D: x86 asm
S: USA
N: Gabriel Bouvigne
E: bouvigne AT mp3-tech DOT org
D: 2pass VBV
N: Guillaume Poirier
E: gpoirier CHEZ mplayerhq POINT hu
D: Altivec optimizations
S: Brittany, France
N: Henrik Gramner
E: henrik AT gramner DOT com
D: 4:2:2 chroma subsampling, x86 asm, Windows improvements, bugfixes
S: Sweden
N: Laurent Aimar
E: fenrir AT videolan DOT org
C: fenrir
D: Initial import, former maintainer
D: x86 asm (mmx/mmx2)
S: France
N: Loren Merritt
E: pengvado AT akuvian DOT org
C: pengvado
D: Maintainer
D: All areas of encoder analysis and algorithms
D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc
D: Multithreading
D: x86 asm
S: USA
N: Mans Rullgard
E: mru AT mansr DOT com
C: mru
D: Rate control
S: Southampton, UK
N: Michael Niedermayer
E: michaelni AT gmx DOT at
D: Rate control
N: Mike Matsnev
E: mike AT po DOT cs DOT msu DOT su
D: Matroska muxing
N: Min Chen
E: chenm001 AT 163 DOT com
C: chenm001
D: Win32/VC 6.0 port
D: gcc asm to nasm conversion
S: China
N: Radek Czyz
E: radoslaw AT syskin DOT cjb DOT net
D: Cached motion compensation

340
COPYING Normal file
View File

@@ -0,0 +1,340 @@
GNU GENERAL PUBLIC LICENSE
Version 2, June 1991
Copyright (C) 1989, 1991 Free Software Foundation, Inc.
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
License is intended to guarantee your freedom to share and change free
software--to make sure the software is free for all its users. This
General Public License applies to most of the Free Software
Foundation's software and to any other program whose authors commit to
using it. (Some other Free Software Foundation software is covered by
the GNU Library General Public License instead.) You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
this service if you wish), that you receive source code or can get it
if you want it, that you can change the software or use pieces of it
in new free programs; and that you know you can do these things.
To protect your rights, we need to make restrictions that forbid
anyone to deny you these rights or to ask you to surrender the rights.
These restrictions translate to certain responsibilities for you if you
distribute copies of the software, or if you modify it.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must give the recipients all the rights that
you have. You must make sure that they, too, receive or can get the
source code. And you must show them these terms so they know their
rights.
We protect your rights with two steps: (1) copyright the software, and
(2) offer you this license which gives you legal permission to copy,
distribute and/or modify the software.
Also, for each author's protection and ours, we want to make certain
that everyone understands that there is no warranty for this free
software. If the software is modified by someone else and passed on, we
want its recipients to know that what they have is not the original, so
that any problems introduced by others will not reflect on the original
authors' reputations.
Finally, any free program is threatened constantly by software
patents. We wish to avoid the danger that redistributors of a free
program will individually obtain patent licenses, in effect making the
program proprietary. To prevent this, we have made it clear that any
patent must be licensed for everyone's free use or not licensed at all.
The precise terms and conditions for copying, distribution and
modification follow.
GNU GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License applies to any program or other work which contains
a notice placed by the copyright holder saying it may be distributed
under the terms of this General Public License. The "Program", below,
refers to any such program or work, and a "work based on the Program"
means either the Program or any derivative work under copyright law:
that is to say, a work containing the Program or a portion of it,
either verbatim or with modifications and/or translated into another
language. (Hereinafter, translation is included without limitation in
the term "modification".) Each licensee is addressed as "you".
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running the Program is not restricted, and the output from the Program
is covered only if its contents constitute a work based on the
Program (independent of having been made by running the Program).
Whether that is true depends on what the Program does.
1. You may copy and distribute verbatim copies of the Program's
source code as you receive it, in any medium, provided that you
conspicuously and appropriately publish on each copy an appropriate
copyright notice and disclaimer of warranty; keep intact all the
notices that refer to this License and to the absence of any warranty;
and give any other recipients of the Program a copy of this License
along with the Program.
You may charge a fee for the physical act of transferring a copy, and
you may at your option offer warranty protection in exchange for a fee.
2. You may modify your copy or copies of the Program or any portion
of it, thus forming a work based on the Program, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) You must cause the modified files to carry prominent notices
stating that you changed the files and the date of any change.
b) You must cause any work that you distribute or publish, that in
whole or in part contains or is derived from the Program or any
part thereof, to be licensed as a whole at no charge to all third
parties under the terms of this License.
c) If the modified program normally reads commands interactively
when run, you must cause it, when started running for such
interactive use in the most ordinary way, to print or display an
announcement including an appropriate copyright notice and a
notice that there is no warranty (or else, saying that you provide
a warranty) and that users may redistribute the program under
these conditions, and telling the user how to view a copy of this
License. (Exception: if the Program itself is interactive but
does not normally print such an announcement, your work based on
the Program is not required to print an announcement.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Program,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Program, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Program.
In addition, mere aggregation of another work not based on the Program
with the Program (or with a work based on the Program) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may copy and distribute the Program (or a work based on it,
under Section 2) in object code or executable form under the terms of
Sections 1 and 2 above provided that you also do one of the following:
a) Accompany it with the complete corresponding machine-readable
source code, which must be distributed under the terms of Sections
1 and 2 above on a medium customarily used for software interchange; or,
b) Accompany it with a written offer, valid for at least three
years, to give any third party, for a charge no more than your
cost of physically performing source distribution, a complete
machine-readable copy of the corresponding source code, to be
distributed under the terms of Sections 1 and 2 above on a medium
customarily used for software interchange; or,
c) Accompany it with the information you received as to the offer
to distribute corresponding source code. (This alternative is
allowed only for noncommercial distribution and only if you
received the program in object code or executable form with such
an offer, in accord with Subsection b above.)
The source code for a work means the preferred form of the work for
making modifications to it. For an executable work, complete source
code means all the source code for all modules it contains, plus any
associated interface definition files, plus the scripts used to
control compilation and installation of the executable. However, as a
special exception, the source code distributed need not include
anything that is normally distributed (in either source or binary
form) with the major components (compiler, kernel, and so on) of the
operating system on which the executable runs, unless that component
itself accompanies the executable.
If distribution of executable or object code is made by offering
access to copy from a designated place, then offering equivalent
access to copy the source code from the same place counts as
distribution of the source code, even though third parties are not
compelled to copy the source along with the object code.
4. You may not copy, modify, sublicense, or distribute the Program
except as expressly provided under this License. Any attempt
otherwise to copy, modify, sublicense or distribute the Program is
void, and will automatically terminate your rights under this License.
However, parties who have received copies, or rights, from you under
this License will not have their licenses terminated so long as such
parties remain in full compliance.
5. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Program or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Program (or any work based on the
Program), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Program or works based on it.
6. Each time you redistribute the Program (or any work based on the
Program), the recipient automatically receives a license from the
original licensor to copy, distribute or modify the Program subject to
these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties to
this License.
7. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Program at all. For example, if a patent
license would not permit royalty-free redistribution of the Program by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Program.
If any portion of this section is held invalid or unenforceable under
any particular circumstance, the balance of the section is intended to
apply and the section as a whole is intended to apply in other
circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system, which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
8. If the distribution and/or use of the Program is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Program under this License
may add an explicit geographical distribution limitation excluding
those countries, so that distribution is permitted only in or among
countries not thus excluded. In such case, this License incorporates
the limitation as if written in the body of this License.
9. The Free Software Foundation may publish revised and/or new versions
of the General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the Program
specifies a version number of this License which applies to it and "any
later version", you have the option of following the terms and conditions
either of that version or of any later version published by the Free
Software Foundation. If the Program does not specify a version number of
this License, you may choose any version ever published by the Free Software
Foundation.
10. If you wish to incorporate parts of the Program into other free
programs whose distribution conditions are different, write to the author
to ask for permission. For software which is copyrighted by the Free
Software Foundation, write to the Free Software Foundation; we sometimes
make exceptions for this. Our decision will be guided by the two goals
of preserving the free status of all derivatives of our free software and
of promoting the sharing and reuse of software generally.
NO WARRANTY
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
REPAIR OR CORRECTION.
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGES.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Also add information on how to contact you by electronic and paper mail.
If the program is interactive, make it output a short notice like this
when it starts in an interactive mode:
Gnomovision version 69, Copyright (C) year name of author
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, the commands you use may
be called something other than `show w' and `show c'; they could even be
mouse-clicks or menu items--whatever suits your program.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the program, if
necessary. Here is a sample; alter the names:
Yoyodyne, Inc., hereby disclaims all copyright interest in the program
`Gnomovision' (which makes passes at compilers) written by James Hacker.
<signature of Ty Coon>, 1 April 1989
Ty Coon, President of Vice
This General Public License does not permit incorporating your program into
proprietary programs. If your program is a subroutine library, you may
consider it more useful to permit linking proprietary applications with the
library. If this is what you want to do, use the GNU Library General
Public License instead of this License.

482
Makefile Normal file
View File

@@ -0,0 +1,482 @@
# Makefile
# Pull in the build configuration. The config.mak rule further down runs
# ./configure when this file does not exist yet.
include config.mak
# Look under $(SRCPATH) for sources of these types.
vpath %.c $(SRCPATH)
vpath %.h $(SRCPATH)
vpath %.S $(SRCPATH)
vpath %.asm $(SRCPATH)
vpath %.rc $(SRCPATH)
vpath %.manifest $(SRCPATH)
# CFLAGSPROF/LDFLAGSPROF are passed on the command line by the fprofiled target.
CFLAGS += $(CFLAGSPROF)
LDFLAGS += $(LDFLAGSPROF)
# Generated files; every object file is made to depend on them (see ALLOBJS).
GENERATED =
all: default
# No prerequisites or recipe are attached to "default" in this file.
# NOTE(review): presumably config.mak supplies them -- confirm.
default:
# Library sources compiled once into plain %.o objects.
SRCS = common/osdep.c common/base.c common/cpu.c common/tables.c \
encoder/api.c
# Library sources compiled once per enabled bit depth (%-8.o and/or %-10.o).
SRCS_X = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
common/frame.c common/dct.c common/cabac.c \
common/common.c common/rectangle.c \
common/set.c common/quant.c common/deblock.c common/vlc.c \
common/mvpred.c common/bitstream.c \
encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
encoder/set.c encoder/macroblock.c encoder/cabac.c \
encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
# Library sources that are only built as 8-bit objects (%-8.o).
SRCS_8 =
# Command-line tool sources compiled once.
SRCCLI = x264.c autocomplete.c input/input.c input/timecode.c input/raw.c \
input/y4m.c output/raw.c output/matroska.c output/matroska_ebml.c \
output/flv.c output/flv_bytestream.c filters/filters.c \
filters/video/video.c filters/video/source.c filters/video/internal.c \
filters/video/resize.c filters/video/fix_vfr_pts.c \
filters/video/select_every.c filters/video/crop.c
# Command-line tool sources compiled once per enabled bit depth.
SRCCLI_X = filters/video/cache.c filters/video/depth.c
# Extra sources linked into the shared library only.
SRCSO =
# checkasm sources, compiled once per enabled bit depth.
SRCCHK_X = tools/checkasm.c
SRCEXAMPLE = example.c
# Object lists; they are filled in further down from the source lists.
OBJS =
OBJASM =
OBJSO =
OBJCLI =
OBJCHK =
OBJCHK_8 =
OBJCHK_10 =
OBJEXAMPLE =
# Whole text of config.h, read once; the conditionals below search it for
# "HAVE_xxx 1" to decide which optional parts to build.
CONFIG := $(shell cat config.h)
# Optional module sources
# Each block adds sources only when config.h contains the matching
# "HAVE_xxx 1" text.
ifneq ($(findstring HAVE_AVS 1, $(CONFIG)),)
SRCCLI += input/avs.c
endif
# Threading adds a per-bit-depth library source and a per-bit-depth CLI input.
ifneq ($(findstring HAVE_THREAD 1, $(CONFIG)),)
SRCS_X += common/threadpool.c
SRCCLI_X += input/thread.c
endif
ifneq ($(findstring HAVE_WIN32THREAD 1, $(CONFIG)),)
SRCS += common/win32thread.c
endif
ifneq ($(findstring HAVE_LAVF 1, $(CONFIG)),)
SRCCLI += input/lavf.c
endif
ifneq ($(findstring HAVE_FFMS 1, $(CONFIG)),)
SRCCLI += input/ffms.c
endif
# Two alternative mp4 output sources, selected by HAVE_GPAC / HAVE_LSMASH.
ifneq ($(findstring HAVE_GPAC 1, $(CONFIG)),)
SRCCLI += output/mp4.c
endif
ifneq ($(findstring HAVE_LSMASH 1, $(CONFIG)),)
SRCCLI += output/mp4_lsmash.c
endif
# Architecture-specific sources. The whole section, up to the matching endif,
# is skipped when $(AS) is empty.
# SRCASM_X lists assembly sources that are assembled once per enabled bit
# depth; OBJASM collects the resulting objects plus a few listed directly.
ifneq ($(AS),)
# MMX/SSE optims
SRCASM_X =
ifeq ($(SYS_ARCH),X86)
ARCH_X86 = yes
SRCASM_X += common/x86/dct-32.asm \
common/x86/pixel-32.asm
endif
ifeq ($(SYS_ARCH),X86_64)
ARCH_X86 = yes
SRCASM_X += common/x86/dct-64.asm \
common/x86/trellis-64.asm
endif
# Sources shared by both x86 variants.
ifdef ARCH_X86
SRCASM_X += common/x86/bitstream-a.asm \
common/x86/const-a.asm \
common/x86/cabac-a.asm \
common/x86/dct-a.asm \
common/x86/deblock-a.asm \
common/x86/mc-a.asm \
common/x86/mc-a2.asm \
common/x86/pixel-a.asm \
common/x86/predict-a.asm \
common/x86/quant-a.asm
SRCS_X += common/x86/mc-c.c \
common/x86/predict-c.c
# cpu-a is assembled once, without a bit-depth suffix.
OBJASM += common/x86/cpu-a.o
# sad-a is only added for 8-bit and sad16-a only for 10-bit.
ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
OBJASM += $(SRCASM_X:%.asm=%-8.o) common/x86/sad-a-8.o
endif
ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
OBJASM += $(SRCASM_X:%.asm=%-10.o) common/x86/sad16-a-10.o
endif
OBJCHK += tools/checkasm-a.o
endif
# AltiVec optims
# C sources only; nothing is added to SRCASM_X or OBJASM for PPC.
ifeq ($(SYS_ARCH),PPC)
SRCS_X += common/ppc/dct.c \
common/ppc/deblock.c \
common/ppc/mc.c \
common/ppc/pixel.c \
common/ppc/predict.c \
common/ppc/quant.c
endif
# NEON optims
ifeq ($(SYS_ARCH),ARM)
SRCASM_X = common/arm/bitstream-a.S \
common/arm/dct-a.S \
common/arm/deblock-a.S \
common/arm/mc-a.S \
common/arm/pixel-a.S \
common/arm/predict-a.S \
common/arm/quant-a.S
SRCS_X += common/arm/mc-c.c \
common/arm/predict-c.c
OBJASM += common/arm/cpu-a.o
ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
OBJASM += $(SRCASM_X:%.S=%-8.o)
endif
ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
OBJASM += $(SRCASM_X:%.S=%-10.o)
endif
OBJCHK += tools/checkasm-arm.o
endif
# AArch64 NEON and SVE/SVE2 optims
ifeq ($(SYS_ARCH),AARCH64)
SRCASM_X = common/aarch64/bitstream-a.S \
common/aarch64/cabac-a.S \
common/aarch64/dct-a.S \
common/aarch64/deblock-a.S \
common/aarch64/mc-a.S \
common/aarch64/pixel-a.S \
common/aarch64/predict-a.S \
common/aarch64/quant-a.S
# SVE and SVE2 sources are added only when config.h enables them.
ifneq ($(findstring HAVE_SVE 1, $(CONFIG)),)
SRCASM_X += common/aarch64/dct-a-sve.S \
common/aarch64/deblock-a-sve.S \
common/aarch64/mc-a-sve.S \
common/aarch64/pixel-a-sve.S
endif
ifneq ($(findstring HAVE_SVE2 1, $(CONFIG)),)
SRCASM_X += common/aarch64/dct-a-sve2.S
endif
SRCS_X += common/aarch64/asm-offsets.c \
common/aarch64/mc-c.c \
common/aarch64/predict-c.c
OBJASM +=
ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
OBJASM += $(SRCASM_X:%.S=%-8.o)
endif
ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
OBJASM += $(SRCASM_X:%.S=%-10.o)
endif
OBJCHK += tools/checkasm-aarch64.o
endif
# RISCV64 RVV optims
# The source lists are currently empty, so this block adds no objects.
ifeq ($(SYS_ARCH),RISCV64)
ifneq ($(findstring HAVE_RVV 1, $(CONFIG)),)
SRCASM_X =
SRCS_X +=
OBJASM +=
ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
OBJASM += $(SRCASM_X:%.S=%-8.o)
endif
ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
OBJASM += $(SRCASM_X:%.S=%-10.o)
endif
OBJCHK +=
endif
endif
# MSA optims
# C sources only; nothing is added to SRCASM_X or OBJASM for MIPS.
ifeq ($(SYS_ARCH),MIPS)
ifneq ($(findstring HAVE_MSA 1, $(CONFIG)),)
SRCS_X += common/mips/dct-c.c \
common/mips/deblock-c.c \
common/mips/mc-c.c \
common/mips/pixel-c.c \
common/mips/predict-c.c \
common/mips/quant-c.c
endif
endif
# LOONGARCH optimization
ifeq ($(SYS_ARCH),LOONGARCH)
ifneq ($(findstring HAVE_LSX 1, $(CONFIG)),)
SRCASM_X += common/loongarch/deblock-a.S \
common/loongarch/sad-a.S \
common/loongarch/predict-a.S \
common/loongarch/quant-a.S \
common/loongarch/mc-a.S \
common/loongarch/dct-a.S \
common/loongarch/pixel-a.S
SRCS_X += common/loongarch/predict-c.c \
common/loongarch/mc-c.c \
common/loongarch/pixel-c.c
OBJASM +=
ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
OBJASM += $(SRCASM_X:%.S=%-8.o)
endif
ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
OBJASM += $(SRCASM_X:%.S=%-10.o)
endif
OBJCHK += tools/checkasm-loongarch.o
endif
endif
endif
# Use the bundled getopt implementation unless HAVE_GETOPT_LONG is 1.
ifneq ($(HAVE_GETOPT_LONG),1)
SRCCLI += extras/getopt.c
endif
# Windows: add resource objects when a resource compiler ($(RC)) is set, and
# the DLL entry source when a shared library ($(SONAME)) is being built.
ifeq ($(SYS),WINDOWS)
OBJCLI += $(if $(RC), x264res.o)
ifneq ($(SONAME),)
SRCSO += x264dll.c
OBJSO += $(if $(RC), x264res.dll.o)
endif
endif
# OpenCL: common/oclobj.h is produced by piping x264-cl.h and the .cl files
# through tools/cltostr.sh. The OpenCL sources are only built as 8-bit objects.
ifeq ($(HAVE_OPENCL),yes)
common/oclobj.h: common/opencl/x264-cl.h $(wildcard $(SRCPATH)/common/opencl/*.cl)
cat $^ | $(SRCPATH)/tools/cltostr.sh $@
GENERATED += common/oclobj.h
SRCS_8 += common/opencl.c encoder/slicetype-cl.c
endif
# Objects that are built once regardless of bit depth.
OBJS += $(SRCS:%.c=%.o)
OBJCLI += $(SRCCLI:%.c=%.o)
OBJSO += $(SRCSO:%.c=%.o)
OBJEXAMPLE += $(SRCEXAMPLE:%.c=%.o)
# Per-bit-depth objects: foo.c becomes foo-8.o and/or foo-10.o. The checkasm
# target gains one prerequisite per enabled bit depth.
ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
OBJS += $(SRCS_X:%.c=%-8.o) $(SRCS_8:%.c=%-8.o)
OBJCLI += $(SRCCLI_X:%.c=%-8.o)
OBJCHK_8 += $(SRCCHK_X:%.c=%-8.o)
checkasm: checkasm8$(EXE)
endif
ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
OBJS += $(SRCS_X:%.c=%-10.o)
OBJCLI += $(SRCCLI_X:%.c=%-10.o)
OBJCHK_10 += $(SRCCHK_X:%.c=%-10.o)
checkasm: checkasm10$(EXE)
endif
# install-* and lib-* are glob patterns here: make expands them against files
# in the current directory, so they only name targets whose file exists.
.PHONY: all default fprofiled clean distclean install install-* uninstall cli lib-* checkasm etags
cli: x264$(EXE)
lib-static: $(LIBX264)
lib-shared: $(SONAME)
# Static library: the old archive is removed first, then rebuilt.
# NOTE(review): no space is written between $(AR)/$(LD) and $@; presumably the
# values from config.mak end with the output flag or a space -- confirm.
$(LIBX264): $(OBJS) $(OBJASM)
rm -f $(LIBX264)
$(AR)$@ $(OBJS) $(OBJASM)
$(if $(RANLIB), $(RANLIB) $@)
# Shared library: same objects plus the shared-library-only ones.
$(SONAME): $(OBJS) $(OBJASM) $(OBJSO)
$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
# The import library has no recipe of its own; it only depends on $(SONAME).
$(IMPLIBNAME): $(SONAME)
# When executables carry a suffix, provide suffix-less phony aliases.
ifneq ($(EXE),)
.PHONY: x264 checkasm8 checkasm10 example
x264: x264$(EXE)
checkasm8: checkasm8$(EXE)
checkasm10: checkasm10$(EXE)
example: example$(EXE)
endif
x264$(EXE): $(OBJCLI) $(CLI_LIBX264)
$(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
checkasm8$(EXE): $(OBJCHK) $(OBJCHK_8) $(LIBX264)
$(LD)$@ $(OBJCHK) $(OBJCHK_8) $(LIBX264) $(LDFLAGS)
checkasm10$(EXE): $(OBJCHK) $(OBJCHK_10) $(LIBX264)
$(LD)$@ $(OBJCHK) $(OBJCHK_10) $(LIBX264) $(LDFLAGS)
example$(EXE): $(OBJEXAMPLE) $(LIBX264)
$(LD)$@ $(OBJEXAMPLE) $(LIBX264) $(LDFLAGS)
# Target-specific flags: library objects get CFLAGSSO, CLI objects CFLAGSCLI.
$(OBJS) $(OBJSO): CFLAGS += $(CFLAGSSO)
$(OBJCLI): CFLAGS += $(CFLAGSCLI)
ALLOBJS = $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK) $(OBJCHK_8) $(OBJCHK_10) $(OBJEXAMPLE)
# Generated files must exist before any object is built.
$(ALLOBJS): $(GENERATED)
# C sources. Each source can be built three ways: plain, 8-bit (-8.o) and
# 10-bit (-10.o); the suffixed variants define HIGH_BIT_DEPTH and BIT_DEPTH.
# DEPCMD, DEPFLAGS and CC_O are not defined in this file.
# NOTE(review): presumably they come from config.mak -- confirm.
%.o: %.c
$(DEPCMD)
$(CC) $(CFLAGS) -c $< $(CC_O) $(DEPFLAGS)
%-8.o: %.c
$(DEPCMD)
$(CC) $(CFLAGS) -c $< $(CC_O) $(DEPFLAGS) -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8
%-10.o: %.c
$(DEPCMD)
$(CC) $(CFLAGS) -c $< $(CC_O) $(DEPFLAGS) -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10
# .asm sources. Every object also depends on the shared x86inc/x86util
# includes; the assembler writes the .d dependency file itself via -MD.
# The bit-depth variants additionally set private_prefix.
%.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
$(AS) $(ASFLAGS) -o $@ $< -MD $(@:.o=.d)
-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile
%-8.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
$(AS) $(ASFLAGS) -o $@ $< -MD $(@:.o=.d) -DBIT_DEPTH=8 -Dprivate_prefix=x264_8
-@ $(if $(STRIP), $(STRIP) -x $@)
%-10.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
$(AS) $(ASFLAGS) -o $@ $< -MD $(@:.o=.d) -DBIT_DEPTH=10 -Dprivate_prefix=x264_10
-@ $(if $(STRIP), $(STRIP) -x $@)
# .S sources, built the same three ways as C sources.
# The strip step is prefixed with "-" so a failing strip does not fail the build.
%.o: %.S
$(DEPCMD)
$(AS) $(ASFLAGS) -o $@ $< $(DEPFLAGS)
-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile
%-8.o: %.S
$(DEPCMD)
$(AS) $(ASFLAGS) -o $@ $< $(DEPFLAGS) -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8
-@ $(if $(STRIP), $(STRIP) -x $@)
%-10.o: %.S
$(DEPCMD)
$(AS) $(ASFLAGS) -o $@ $< $(DEPFLAGS) -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10
-@ $(if $(STRIP), $(STRIP) -x $@)
# Windows resources; the .dll.o variant is compiled with -DDLL.
%.dll.o: %.rc x264.h
$(RC) $(RCFLAGS)$@ -DDLL $<
%.o: %.rc x264.h x264res.manifest
$(RC) $(RCFLAGS)$@ $<
# Run ./configure when config.mak, which is included at the top, is missing.
config.mak:
./configure
# This is kept as a no-op
depend:
@echo "make depend" is handled implicitly now
# Include only the dependency files that already exist; the leading "-"
# keeps make quiet about any that cannot be read.
-include $(wildcard $(ALLOBJS:.o=.d))
# Dummy rule to avoid failing, if the dependency files specify dependencies on
# a removed .h file.
%.h:
@:
# Objects for which profiling data files are cleaned up.
OBJPROF = $(OBJS) $(OBJSO) $(OBJCLI)
# These should cover most of the important codepaths
# OPT0..OPT7 are the x264 command lines run on every input during profiling.
OPT0 = --crf 30 -b1 -m1 -r1 --me dia --no-cabac --direct temporal --ssim --no-weightb
OPT1 = --crf 16 -b2 -m3 -r3 --me hex --no-8x8dct --direct spatial --no-dct-decimate -t0 --slice-max-mbs 50
OPT2 = --crf 26 -b4 -m5 -r2 --me hex --cqm jvt --nr 100 --psnr --no-mixed-refs --b-adapt 2 --slice-max-size 1500
OPT3 = --crf 18 -b3 -m9 -r5 --me umh -t1 -A all --b-pyramid normal --direct auto --no-fast-pskip --no-mbtree
OPT4 = --crf 22 -b3 -m7 -r4 --me esa -t2 -A all --psy-rd 1.0:1.0 --slices 4
OPT5 = --frames 50 --crf 24 -b3 -m10 -r3 --me tesa -t2
OPT6 = --frames 50 -q0 -m9 -r2 --me hex -Aall
OPT7 = --frames 50 -q0 -m2 -r1 --me hex --no-cabac
# Profile-guided build. Without VIDS only a usage message is printed.
# Otherwise the steps are:
#   1. clean, then build x264 with the profile-generation flags
#   2. run x264 single-threaded on every input with each of OPT0..OPT7
#   3. remove the instrumented objects (only the executable for CL)
#   4. rebuild with the profile-use flags
#   5. delete the leftover profiling data files
ifeq (,$(VIDS))
fprofiled:
@echo 'usage: make fprofiled VIDS="infile1 infile2 ..."'
@echo 'where infiles are anything that x264 understands,'
@echo 'i.e. YUV with resolution in the filename, y4m, or avisynth.'
else
fprofiled: clean
$(MAKE) x264$(EXE) CFLAGSPROF="$(PROF_GEN_CC)" LDFLAGSPROF="$(PROF_GEN_LD)"
$(foreach V, $(VIDS), $(foreach I, 0 1 2 3 4 5 6 7, ./x264$(EXE) $(OPT$I) --threads 1 $(V) -o $(DEVNULL) ;))
ifeq ($(COMPILER),CL)
# Because Visual Studio timestamps the object files within the PGD, it fails to build if they change - only the executable should be deleted
rm -f x264$(EXE)
else
rm -f $(OBJPROF)
endif
$(MAKE) CFLAGSPROF="$(PROF_USE_CC)" LDFLAGSPROF="$(PROF_USE_LD)"
rm -f $(OBJPROF:%.o=%.gcda) $(OBJPROF:%.o=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc
endif
# Remove build products: objects, generated files, libraries, executables,
# profiling data and dependency files. Configuration output is kept.
clean:
rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(GENERATED) TAGS
rm -f $(SONAME) *.a *.lib *.exp *.pdb x264$(EXE) x264_lookahead.clbin
rm -f checkasm8$(EXE) checkasm10$(EXE) $(OBJCHK) $(OBJCHK_8) $(OBJCHK_10)
rm -f example$(EXE) $(OBJEXAMPLE)
rm -f $(OBJPROF:%.o=%.gcda) $(OBJPROF:%.o=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc
rm -f $(ALLOBJS:%.o=%.d)
# Everything clean removes, plus the configuration files and conftest* leftovers.
distclean: clean
rm -f config.mak x264_config.h config.h config.log x264.pc x264.def
rm -rf conftest*
# All install targets place files under $(DESTDIR).
# install-cli: the x264 executable goes into $(bindir).
install-cli: cli
$(INSTALL) -d $(DESTDIR)$(bindir)
$(INSTALL) x264$(EXE) $(DESTDIR)$(bindir)
# install-lib-dev: public headers and the pkg-config file.
install-lib-dev:
$(INSTALL) -d $(DESTDIR)$(includedir)
$(INSTALL) -d $(DESTDIR)$(libdir)/pkgconfig
$(INSTALL) -m 644 $(SRCPATH)/x264.h x264_config.h $(DESTDIR)$(includedir)
$(INSTALL) -m 644 x264.pc $(DESTDIR)$(libdir)/pkgconfig
# install-lib-static: the static library; ranlib is re-run on the installed copy.
install-lib-static: lib-static install-lib-dev
$(INSTALL) -d $(DESTDIR)$(libdir)
$(INSTALL) -m 644 $(LIBX264) $(DESTDIR)$(libdir)
$(if $(RANLIB), $(RANLIB) $(DESTDIR)$(libdir)/$(LIBX264))
# install-lib-shared: with an import library, the shared library goes into
# $(bindir) and the import library into $(libdir). Otherwise the shared
# library goes into $(libdir) with a libx264.$(SOSUFFIX) symlink to it.
install-lib-shared: lib-shared install-lib-dev
$(INSTALL) -d $(DESTDIR)$(libdir)
ifneq ($(IMPLIBNAME),)
$(INSTALL) -d $(DESTDIR)$(bindir)
$(INSTALL) -m 755 $(SONAME) $(DESTDIR)$(bindir)
$(INSTALL) -m 644 $(IMPLIBNAME) $(DESTDIR)$(libdir)
else ifneq ($(SONAME),)
ln -f -s $(SONAME) $(DESTDIR)$(libdir)/libx264.$(SOSUFFIX)
$(INSTALL) -m 755 $(SONAME) $(DESTDIR)$(libdir)
endif
# install-bashcompletion: does nothing unless BASHCOMPLETIONSDIR is set.
install-bashcompletion:
ifneq ($(BASHCOMPLETIONSDIR),)
$(INSTALL) -d $(DESTDIR)$(BASHCOMPLETIONSDIR)
$(INSTALL) -m 644 $(SRCPATH)/tools/bash-autocomplete.sh $(DESTDIR)$(BASHCOMPLETIONSDIR)/x264
endif
# Remove everything the install-* targets may have placed under $(DESTDIR):
# headers, the static library, the CLI, the pkg-config file, the shared
# library with its import library or symlink, and the bash completion script.
# install-lib-static installs $(LIBX264), so that name is removed here as
# well as the historical libx264.a. The $(if ...) guard keeps rm from being
# handed the bare $(libdir) directory when LIBX264 is empty.
uninstall:
	rm -f $(DESTDIR)$(includedir)/x264.h $(DESTDIR)$(includedir)/x264_config.h $(DESTDIR)$(libdir)/libx264.a
	rm -f $(if $(LIBX264),$(DESTDIR)$(libdir)/$(LIBX264))
	rm -f $(DESTDIR)$(bindir)/x264$(EXE) $(DESTDIR)$(libdir)/pkgconfig/x264.pc
ifneq ($(IMPLIBNAME),)
	rm -f $(DESTDIR)$(bindir)/$(SONAME) $(DESTDIR)$(libdir)/$(IMPLIBNAME)
else ifneq ($(SONAME),)
	rm -f $(DESTDIR)$(libdir)/$(SONAME) $(DESTDIR)$(libdir)/libx264.$(SOSUFFIX)
endif
ifneq ($(BASHCOMPLETIONSDIR),)
	rm -f $(DESTDIR)$(BASHCOMPLETIONSDIR)/x264
endif
# Build an Emacs TAGS index from the library C sources. The CLI, checkasm and
# assembly sources are not passed to etags.
etags TAGS:
etags $(SRCS) $(SRCS_X) $(SRCS_8)

408
autocomplete.c Normal file
View File

@@ -0,0 +1,408 @@
/*****************************************************************************
* autocomplete: x264cli shell autocomplete
*****************************************************************************
* Copyright (C) 2018-2025 x264 project
*
* Authors: Henrik Gramner <henrik@gramner.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "x264cli.h"
#include "input/input.h"
#if HAVE_LAVF
#undef DECLARE_ALIGNED
#include <libavformat/avformat.h>
#include <libavutil/pixdesc.h>
#endif
/* Values suggested for --level. NULL-terminated so it can be walked by
 * suggest_list(). */
static const char * const level_names[] =
{
"1", "1.1", "1.2", "1.3", "1b",
"2", "2.1", "2.2",
"3", "3.1", "3.2",
"4", "4.1", "4.2",
"5", "5.1", "5.2",
"6", "6.1", "6.2",
NULL
};
/* Options requiring a value for which we provide suggestions.
 * This list is only used to suggest the option names themselves; the value
 * suggestions come from the matching OPT()/OPT2() branch in
 * x264_cli_autocomplete(). NULL-terminated. */
static const char * const opts_suggest[] =
{
"--alternative-transfer",
"--aq-mode",
"--asm",
"--avcintra-class",
"--avcintra-flavor",
"--b-adapt",
"--b-pyramid",
"--colormatrix",
"--colorprim",
"--cqm",
"--demuxer",
"--direct",
"--frame-packing",
"--input-csp",
"--input-fmt",
"--input-range",
"--level",
"--log-level",
"--me",
"--muxer",
"--nal-hrd",
"--output-csp",
"--overscan",
"--pass", "-p",
"--preset",
"--profile",
"--pulldown",
"--range",
"--subme", "-m",
"--transfer",
"--trellis", "-t",
"--tune",
"--videoformat",
"--weightp",
NULL
};
/* Options requiring a value for which we don't provide suggestions.
 * When the previous word is one of these, x264_cli_autocomplete() prints no
 * suggestions at all. The exceptions are --output-depth and --partitions/-A,
 * which are listed here but are caught earlier by their own OPT() branch.
 * NULL-terminated. */
static const char * const opts_nosuggest[] =
{
"--b-bias",
"--bframes", "-b",
"--deblock", "-f",
"--bitrate", "-B",
"--chroma-qp-offset",
"--chromaloc",
"--cplxblur",
"--cqm4",
"--cqm4i",
"--cqm4ic",
"--cqm4iy",
"--cqm4p",
"--cqm4pc",
"--cqm4py",
"--cqm8",
"--cqm8i",
"--cqm8p",
"--crf",
"--crf-max",
"--crop-rect",
"--deadzone-inter",
"--deadzone-intra",
"--fps",
"--frames",
"--input-depth",
"--input-res",
"--ipratio",
"--keyint", "-I",
"--lookahead-threads",
"--mastering-display",
"--cll",
"--merange",
"--min-keyint", "-i",
"--mvrange",
"--mvrange-thread",
"--nr",
"--opencl-device",
"--output-depth",
"--partitions", "-A",
"--pbratio",
"--psy-rd",
"--qblur",
"--qcomp",
"--qp", "-q",
"--qpmax",
"--qpmin",
"--qpstep",
"--ratetol",
"--ref", "-r",
"--rc-lookahead",
"--sar",
"--scenecut",
"--seek",
"--slices",
"--slices-max",
"--slice-max-size",
"--slice-max-mbs",
"--slice-min-mbs",
"--sps-id",
"--sync-lookahead",
"--threads",
"--timebase",
"--vbv-bufsize",
"--vbv-init",
"--vbv-maxrate",
"--video-filter", "--vf",
"--zones",
NULL
};
/* Options requiring a filename.
 * When the previous word is one of these, x264_cli_autocomplete() returns 1
 * so that the shell's own filename completion is used. NULL-terminated. */
static const char * const opts_filename[] =
{
"--cqmfile",
"--dump-yuv",
"--index",
"--opencl-clbin",
"--output", "-o",
"--qpfile",
"--stats",
"--tcfile-in",
"--tcfile-out",
NULL
};
/* Options without an associated value.
 * Only used as a source of option-name suggestions. NULL-terminated. */
static const char * const opts_standalone[] =
{
"--8x8dct",
"--aud",
"--bff",
"--bluray-compat",
"--cabac",
"--constrained-intra",
"--cpu-independent",
"--dts-compress",
"--fake-interlaced",
"--fast-pskip",
"--filler",
"--force-cfr",
"--mbtree",
"--mixed-refs",
"--no-8x8dct",
"--no-asm",
"--no-cabac",
"--no-chroma-me",
"--no-dct-decimate",
"--no-deblock",
"--no-fast-pskip",
"--no-mbtree",
"--no-mixed-refs",
"--no-progress",
"--no-psy",
"--no-scenecut",
"--no-weightb",
"--non-deterministic",
"--open-gop",
"--opencl",
"--pic-struct",
"--psnr",
"--quiet",
"--sliced-threads",
"--slow-firstpass",
"--ssim",
"--stitchable",
"--tff",
"--thread-input",
"--verbose", "-v",
"--weightb",
NULL
};
/* Options which shouldn't be suggested in combination with other options.
 * They are only offered when the previous word is empty, and nothing is
 * suggested after one of them. NULL-terminated. */
static const char * const opts_special[] =
{
"--fullhelp",
"--help", "-h",
"--longhelp",
"--version",
NULL
};
/* Return 1 if the non-empty string s equals an entry of the NULL-terminated
 * list, 0 otherwise. An empty s never matches. */
static int list_contains( const char * const *list, const char *s )
{
    if( !*s )
        return 0;

    while( *list )
    {
        if( strcmp( *list, s ) == 0 )
            return 1;
        list++;
    }
    return 0;
}
/* Print s followed by a space when s is non-empty and its first cur_len
 * bytes match those of cur. */
static void suggest( const char *s, const char *cur, int cur_len )
{
    if( !s || !*s )
        return;
    if( strncmp( s, cur, cur_len ) != 0 )
        return;
    printf( "%s ", s );
}
/* Like suggest(), but the prefix comparison ignores case and s is printed
 * with 'A'..'Z' converted to lower case. */
static void suggest_lower( const char *s, const char *cur, int cur_len )
{
    if( !s || !*s || strncasecmp( s, cur, cur_len ) )
        return;

    while( *s )
    {
        char c = *s++;
        if( c >= 'A' && c <= 'Z' )
            c |= 0x20; /* set the ASCII lower-case bit */
        putchar( c );
    }
    putchar( ' ' );
}
/* Offer every integer from start to end (inclusive), formatted in decimal,
 * through suggest(). */
static void suggest_num_range( int start, int end, const char *cur, int cur_len )
{
    char num[16];
    int value = start;

    while( value <= end )
    {
        snprintf( num, sizeof( num ), "%d", value );
        suggest( num, cur, cur_len );
        value++;
    }
}
#if HAVE_LAVF
/* Split s at each delim character and print every non-empty token that is at
 * least cur_len long and starts with cur. The text after the last delimiter
 * is passed to suggest(). */
static void suggest_token( const char *s, int delim, const char *cur, int cur_len )
{
    const char *sep;

    if( !s || !*s )
        return;

    while( (sep = strchr( s, delim )) )
    {
        int len = sep - s;
        if( len && len >= cur_len && !strncmp( s, cur, cur_len ) )
            printf( "%.*s ", len, s );
        s = sep + 1;
    }
    suggest( s, cur, cur_len );
}
#endif
/* Helper macros for x264_cli_autocomplete(). They rely on local variables
 * named prev, cur and cur_len being in scope at the point of use. */
/* OPT/OPT2 open an "else if" branch that is taken when prev equals the given
 * option name (or either of two names), so they must follow an if statement. */
#define OPT( opt ) else if( !strcmp( prev, opt ) )
#define OPT2( opt1, opt2 ) else if( !strcmp( prev, opt1 ) || !strcmp( prev, opt2 ) )
/* True when prev is a member of the opts_<type> table. */
#define OPT_TYPE( type ) list_contains( opts_##type, prev )
/* From here on the suggest*() functions are shadowed by macros of the same
 * name that pass cur and cur_len implicitly. A macro is not expanded again
 * inside its own expansion, so the inner call reaches the real function. */
#define suggest( s ) suggest( s, cur, cur_len )
#define suggest_lower( s ) suggest_lower( s, cur, cur_len )
/* Suggest every entry of a NULL-terminated string list. */
#define suggest_list( list ) for( const char * const *s = list; *s; s++ ) suggest( *s )
#define suggest_num_range( start, end ) suggest_num_range( start, end, cur, cur_len )
#define suggest_token( s, delim ) suggest_token( s, delim, cur, cur_len )
/* Print shell completion candidates for the word being typed.
 *
 * prev: the word preceding the one being completed; an empty string is
 *       treated as "no option given yet".
 * cur:  the partial word to complete.
 *
 * Matching candidates are written to stdout separated by spaces and followed
 * by a newline.
 *
 * Returns 1 without printing anything when the caller should fall back to
 * the shell's filename completion, 0 otherwise. */
int x264_cli_autocomplete( const char *prev, const char *cur )
{
int cur_len = strlen( cur );
/* Dummy head of the chain: every OPT()/OPT2() below expands to "else if". */
if( 0 );
OPT( "--alternative-transfer" )
suggest_list( x264_transfer_names );
OPT( "--aq-mode" )
suggest_num_range( 0, 3 );
OPT( "--asm" )
/* x264_cpu_names is walked until an entry with zero flags is reached. */
for( const x264_cpu_name_t *cpu = x264_cpu_names; cpu->flags; cpu++ )
suggest_lower( cpu->name );
OPT( "--avcintra-class" )
suggest_list( x264_avcintra_class_names );
OPT( "--avcintra-flavor" )
suggest_list( x264_avcintra_flavor_names );
OPT( "--b-adapt" )
suggest_num_range( 0, 2 );
OPT( "--b-pyramid" )
suggest_list( x264_b_pyramid_names );
OPT( "--colormatrix" )
suggest_list( x264_colmatrix_names );
OPT( "--colorprim" )
suggest_list( x264_colorprim_names );
OPT( "--cqm" )
suggest_list( x264_cqm_names );
OPT( "--demuxer" )
suggest_list( x264_demuxer_names );
OPT( "--direct" )
suggest_list( x264_direct_pred_names );
OPT( "--frame-packing" )
suggest_num_range( 0, 7 );
OPT( "--input-csp" )
{
/* x264's own colorspace names first, then libavutil's pixel format names
 * when built with lavf. */
for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ )
suggest( x264_cli_csps[i].name );
#if HAVE_LAVF
for( const AVPixFmtDescriptor *d = NULL; (d = av_pix_fmt_desc_next( d )); )
suggest( d->name );
#endif
}
OPT( "--input-fmt" )
{
/* Without lavf this branch suggests nothing. */
#if HAVE_LAVF
void *i = NULL;
/* A demuxer name is split at ',' and each part is offered separately. */
for( const AVInputFormat *f; (f = av_demuxer_iterate( &i )); )
suggest_token( f->name, ',' );
#endif
}
OPT( "--input-range" )
suggest_list( x264_range_names );
OPT( "--level" )
suggest_list( level_names );
OPT( "--log-level" )
suggest_list( x264_log_level_names );
OPT( "--me" )
suggest_list( x264_motion_est_names );
OPT( "--muxer" )
suggest_list( x264_muxer_names );
OPT( "--nal-hrd" )
suggest_list( x264_nal_hrd_names );
OPT( "--output-csp" )
suggest_list( x264_output_csp_names );
OPT( "--output-depth" )
{
/* Only the bit depths that were compiled in are offered. */
#if HAVE_BITDEPTH8
suggest( "8" );
#endif
#if HAVE_BITDEPTH10
suggest( "10" );
#endif
}
OPT( "--overscan" )
suggest_list( x264_overscan_names );
OPT2( "--partitions", "-A" )
suggest_list( x264_partition_names );
OPT2( "--pass", "-p" )
suggest_num_range( 1, 3 );
OPT( "--preset" )
suggest_list( x264_preset_names );
OPT( "--profile" )
suggest_list( x264_valid_profile_names );
OPT( "--pulldown" )
suggest_list( x264_pulldown_names );
OPT( "--range" )
suggest_list( x264_range_names );
OPT2( "--subme", "-m" )
suggest_num_range( 0, 11 );
OPT( "--transfer" )
suggest_list( x264_transfer_names );
OPT2( "--trellis", "-t" )
suggest_num_range( 0, 2 );
OPT( "--tune" )
suggest_list( x264_tune_names );
OPT( "--videoformat" )
suggest_list( x264_vidformat_names );
OPT( "--weightp" )
suggest_num_range( 0, 2 );
/* prev is not an option with value suggestions. Nothing is suggested after an
 * option that takes a free-form value or after a special option. */
else if( !OPT_TYPE( nosuggest ) && !OPT_TYPE( special ) )
{
/* A filename is expected either because prev takes one or because cur does
 * not start with "--". */
if( OPT_TYPE( filename ) || strncmp( cur, "--", 2 ) )
return 1; /* Fall back to default shell filename autocomplete. */
/* Suggest options. */
suggest_list( opts_suggest );
suggest_list( opts_nosuggest );
suggest_list( opts_filename );
suggest_list( opts_standalone );
/* Only suggest special options if no other options have been specified. */
if( !*prev )
suggest_list( opts_special );
}
/* Terminate the candidate list. */
putchar( '\n' );
return 0;
}

View File

@@ -0,0 +1,56 @@
/*****************************************************************************
* asm-offsets.c: check asm offsets for aarch64
*****************************************************************************
* Copyright (C) 2014-2025 x264 project
*
* Authors: Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "asm-offsets.h"
/* Compile-time assertion: declares an int array of size 1 when x is true and
 * of size -1 when x is false, so a false condition fails to compile. */
#define STATIC_ASSERT(name, x) int assert_##name[2 * !!(x) - 1]
/* Assert that member m of struct type s sits at byte offset o. The assertion
 * is wrapped in a uniquely named struct declaration, so no object is
 * defined. */
#define X264_CHECK_OFFSET(s, m, o) struct check_##s##_##m \
{ \
STATIC_ASSERT(offset_##m, offsetof(s, m) == o); \
}
/* Assert that member b of struct type s starts exactly sizeof(type) bytes
 * after member a, i.e. that b directly follows an a of the given type. */
#define X264_CHECK_REL_OFFSET(s, a, type, b) struct check_##s##_##a##_##b \
{ \
STATIC_ASSERT(rel_offset_##a##_##b, offsetof(s, a) + sizeof(type) == offsetof(s, b)); \
}
/* Check every CABAC_* constant from asm-offsets.h against the real offset of
 * the corresponding x264_cabac_t member. */
X264_CHECK_OFFSET(x264_cabac_t, i_low, CABAC_I_LOW);
X264_CHECK_OFFSET(x264_cabac_t, i_range, CABAC_I_RANGE);
X264_CHECK_OFFSET(x264_cabac_t, i_queue, CABAC_I_QUEUE);
X264_CHECK_OFFSET(x264_cabac_t, i_bytes_outstanding, CABAC_I_BYTES_OUTSTANDING);
X264_CHECK_OFFSET(x264_cabac_t, p_start, CABAC_P_START);
X264_CHECK_OFFSET(x264_cabac_t, p, CABAC_P);
X264_CHECK_OFFSET(x264_cabac_t, p_end, CABAC_P_END);
X264_CHECK_OFFSET(x264_cabac_t, f8_bits_encoded, CABAC_F8_BITS_ENCODED);
X264_CHECK_OFFSET(x264_cabac_t, state, CABAC_STATE);
// the aarch64 asm makes following additional assumptions about the x264_cabac_t
// memory layout
// (i_range directly follows i_low, and i_bytes_outstanding directly follows
// i_queue, each first member being an int)
X264_CHECK_REL_OFFSET(x264_cabac_t, i_low, int, i_range);
X264_CHECK_REL_OFFSET(x264_cabac_t, i_queue, int, i_bytes_outstanding);

View File

@@ -0,0 +1,39 @@
/*****************************************************************************
* asm-offsets.h: asm offsets for aarch64
*****************************************************************************
* Copyright (C) 2014-2025 x264 project
*
* Authors: Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_AARCH64_ASM_OFFSETS_H
#define X264_AARCH64_ASM_OFFSETS_H
/* Byte offsets of x264_cabac_t members. asm-offsets.c checks each value
 * against offsetof() at compile time, so a change to the struct layout has
 * to be mirrored here. */
#define CABAC_I_LOW 0x00
#define CABAC_I_RANGE 0x04
#define CABAC_I_QUEUE 0x08
#define CABAC_I_BYTES_OUTSTANDING 0x0c
#define CABAC_P_START 0x10
#define CABAC_P 0x18
#define CABAC_P_END 0x20
#define CABAC_F8_BITS_ENCODED 0x30
#define CABAC_STATE 0x34
#endif

291
common/aarch64/asm.S Normal file
View File

@@ -0,0 +1,291 @@
/*****************************************************************************
* asm.S: AArch64 utility macros
*****************************************************************************
* Copyright (C) 2008-2025 x264 project
*
* Authors: Mans Rullgard <mans@mansr.com>
* David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "config.h"
/* Token pasting in two steps: JOIN expands its arguments before GLUE pastes
 * them together. */
#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)
/* BASE is the x264 symbol prefix and SYM_PREFIX the prefix for other
 * symbols. Both gain a leading underscore when PREFIX is defined. */
#ifdef PREFIX
# define BASE _x264_
# define SYM_PREFIX _
#else
# define BASE x264_
# define SYM_PREFIX
#endif
/* Prefix for exported asm functions. When BIT_DEPTH is defined it is
 * inserted after BASE and followed by an underscore. */
#ifdef BIT_DEPTH
# define EXTERN_ASM JOIN(JOIN(BASE, BIT_DEPTH), _)
#else
# define EXTERN_ASM BASE
#endif
/* X(s): s with the exported-function prefix (bit depth included if defined).
 * X264(s): s with the plain BASE prefix.
 * EXT(s): s with only SYM_PREFIX. */
#define X(s) JOIN(EXTERN_ASM, s)
#define X264(s) JOIN(BASE, s)
#define EXT(s) JOIN(SYM_PREFIX, s)
/* ELF, MACH and FUNC are line prefixes. Each expands to nothing when its
 * condition holds and to "#" otherwise.
 * NOTE(review): presumably the "#" makes the assembler treat the rest of the
 * line as a comment -- confirm. */
#ifdef __ELF__
# define ELF
#else
# define ELF #
#endif
#ifdef __MACH__
# define MACH
#else
# define MACH #
#endif
#if HAVE_AS_FUNC
# define FUNC
#else
# define FUNC #
#endif
/* Select the baseline architecture level. AS_ARCH_LEVEL is not defined in
 * this file. NOTE(review): presumably it comes from config.h -- confirm. */
.arch AS_ARCH_LEVEL
/* ENABLE_x / DISABLE_x expand to the matching .arch_extension directive when
 * the assembler supports it for that extension, and to nothing otherwise. */
#if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE
#define ENABLE_DOTPROD .arch_extension dotprod
#define DISABLE_DOTPROD .arch_extension nodotprod
#else
#define ENABLE_DOTPROD
#define DISABLE_DOTPROD
#endif
#if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE
#define ENABLE_I8MM .arch_extension i8mm
#define DISABLE_I8MM .arch_extension noi8mm
#else
#define ENABLE_I8MM
#define DISABLE_I8MM
#endif
#if HAVE_AS_ARCHEXT_SVE_DIRECTIVE
#define ENABLE_SVE .arch_extension sve
#define DISABLE_SVE .arch_extension nosve
#else
#define ENABLE_SVE
#define DISABLE_SVE
#endif
#if HAVE_AS_ARCHEXT_SVE2_DIRECTIVE
#define ENABLE_SVE2 .arch_extension sve2
#define DISABLE_SVE2 .arch_extension nosve2
#else
#define ENABLE_SVE2
#define DISABLE_SVE2
#endif
/* If we do support the .arch_extension directives, disable support for all
 * the extensions that we may use, in case they were implicitly enabled by
 * the .arch level. This makes it clear if we try to assemble an instruction
 * from an unintended extension set; we only allow assembling such instructions
 * within regions where we explicitly enable those extensions. */
DISABLE_DOTPROD
DISABLE_I8MM
DISABLE_SVE
DISABLE_SVE2
/* function name, export=0, align=2
 * Start a function in .text with the given alignment.
 * With export=0 the label is \name. With a non-zero export the label is
 * EXTERN_ASM\name and is made .global.
 * The macro also defines a one-shot "endfunc" macro that emits the .size
 * directive for that label and then purges itself, so every "function" must
 * be closed with "endfunc" before the next one starts.
 * Lines prefixed with ELF or FUNC are only active when that prefix expands
 * to nothing. */
.macro function name, export=0, align=2
.macro endfunc
.if \export
ELF .size EXTERN_ASM\name, . - EXTERN_ASM\name
.else
ELF .size \name, . - \name
.endif
FUNC .endfunc
.purgem endfunc
.endm
.text
.align \align
.if \export
.global EXTERN_ASM\name
ELF .type EXTERN_ASM\name, %function
FUNC .func EXTERN_ASM\name
EXTERN_ASM\name:
.else
ELF .type \name, %function
FUNC .func \name
\name:
.endif
.endm
/* const name, align=2
 * Start a read-only data object labelled \name: section .rodata on the ELF
 * path, .const_data on the MACH path, aligned as requested.
 * The macro also defines a one-shot "endconst" macro that emits the .size
 * directive (ELF path only) and then purges itself. */
.macro const name, align=2
.macro endconst
ELF .size \name, . - \name
.purgem endconst
.endm
ELF .section .rodata
MACH .const_data
.align \align
\name:
.endm
/* movrel rd, val, offset=0
 * Load the address of symbol \val plus \offset into register \rd.
 *  - __APPLE__:      adrp/add with @PAGE / @PAGEOFF.
 *  - PIC on _WIN32:  adrp/add with :lo12:.
 *  - other PIC:      adrp/add with :lo12:, offset folded into the expression.
 *  - non-PIC:        "ldr \rd, =expr" form.
 * On the Apple and Windows paths a negative offset is not folded into the
 * symbol expression. The symbol address is formed first and -(\offset), a
 * positive value, is then subtracted from it. */
.macro movrel rd, val, offset=0
#if defined(__APPLE__)
.if \offset < 0
adrp \rd, \val@PAGE
add \rd, \rd, \val@PAGEOFF
sub \rd, \rd, -(\offset)
.else
adrp \rd, \val+(\offset)@PAGE
add \rd, \rd, \val+(\offset)@PAGEOFF
.endif
#elif defined(PIC) && defined(_WIN32)
.if \offset < 0
adrp \rd, \val
add \rd, \rd, :lo12:\val
sub \rd, \rd, -(\offset)
.else
adrp \rd, \val+(\offset)
add \rd, \rd, :lo12:\val+(\offset)
.endif
#elif defined(PIC)
adrp \rd, \val+(\offset)
add \rd, \rd, :lo12:\val+(\offset)
#else
ldr \rd, =\val+\offset
#endif
.endm
/* Stride constants available to the asm sources that include this file.
 * NOTE(review): presumably these must match the C-side definitions of the
 * same names -- confirm. */
#define FDEC_STRIDE 32
#define FENC_STRIDE 16
/* \sum = \a + \b, then \sub = \a - \b. Because \sum is written first, it
 * must not be the same register as \a or \b. */
.macro SUMSUB_AB sum, sub, a, b
add \sum, \a, \b
sub \sub, \a, \b
.endm
/* uzp1 of \s1,\s2 into \t1 and uzp2 of \s1,\s2 into \t2. \t1 is written
 * first, so it must not alias \s1 or \s2. */
.macro unzip t1, t2, s1, s2
uzp1 \t1, \s1, \s2
uzp2 \t2, \s1, \s2
.endm
/* trn1 of \s1,\s2 into \t1 and trn2 of \s1,\s2 into \t2. \t1 is written
 * first, so it must not alias \s1 or \s2. */
.macro transpose t1, t2, s1, s2
trn1 \t1, \s1, \s2
trn2 \t2, \s1, \s2
.endm
/* Transpose on the 64-bit (.4h) halves of \v0..\v3, done in place with
 * \t0..\t3 as scratch. The first two steps trn the 32-bit lanes of rows 0/2
 * and 1/3 into the scratch registers; the last two trn the 16-bit lanes back
 * into \v0..\v3.
 * Arguments are bare register names; the arrangement suffix is appended here. */
.macro transpose4x4.h v0, v1, v2, v3, t0, t1, t2, t3
transpose \t0\().2s, \t2\().2s, \v0\().2s, \v2\().2s
transpose \t1\().2s, \t3\().2s, \v1\().2s, \v3\().2s
transpose \v0\().4h, \v1\().4h, \t0\().4h, \t1\().4h
transpose \v2\().4h, \v3\().4h, \t2\().4h, \t3\().4h
.endm
/* Same sequence as transpose4x4.h, but on full 128-bit registers (.4s then
 * .8h arrangements). Done in place on \v0..\v3 with \t0..\t3 as scratch. */
.macro transpose4x8.h v0, v1, v2, v3, t0, t1, t2, t3
transpose \t0\().4s, \t2\().4s, \v0\().4s, \v2\().4s
transpose \t1\().4s, \t3\().4s, \v1\().4s, \v3\().4s
transpose \v0\().8h, \v1\().8h, \t0\().8h, \t1\().8h
transpose \v2\().8h, \v3\().8h, \t2\().8h, \t3\().8h
.endm
/* Three trn1/trn2 stages at .8h, .4s and .2d granularity over \r0..\r7.
 * \r8 and \r9 are scratch registers and are overwritten; the results end up
 * in \r0..\r7.
 * NOTE(review): going by the name this is an in-place 8x8 transpose of
 * 16-bit elements -- confirm row order against the callers. */
.macro transpose8x8.h r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
trn1 \r8\().8h, \r0\().8h, \r1\().8h
trn2 \r9\().8h, \r0\().8h, \r1\().8h
trn1 \r1\().8h, \r2\().8h, \r3\().8h
trn2 \r3\().8h, \r2\().8h, \r3\().8h
trn1 \r0\().8h, \r4\().8h, \r5\().8h
trn2 \r5\().8h, \r4\().8h, \r5\().8h
trn1 \r2\().8h, \r6\().8h, \r7\().8h
trn2 \r7\().8h, \r6\().8h, \r7\().8h
trn1 \r4\().4s, \r0\().4s, \r2\().4s
trn2 \r2\().4s, \r0\().4s, \r2\().4s
trn1 \r6\().4s, \r5\().4s, \r7\().4s
trn2 \r7\().4s, \r5\().4s, \r7\().4s
trn1 \r5\().4s, \r9\().4s, \r3\().4s
trn2 \r9\().4s, \r9\().4s, \r3\().4s
trn1 \r3\().4s, \r8\().4s, \r1\().4s
trn2 \r8\().4s, \r8\().4s, \r1\().4s
trn1 \r0\().2d, \r3\().2d, \r4\().2d
trn2 \r4\().2d, \r3\().2d, \r4\().2d
trn1 \r1\().2d, \r5\().2d, \r6\().2d
trn2 \r5\().2d, \r5\().2d, \r6\().2d
trn2 \r6\().2d, \r8\().2d, \r2\().2d
trn1 \r2\().2d, \r8\().2d, \r2\().2d
trn1 \r3\().2d, \r9\().2d, \r7\().2d
trn2 \r7\().2d, \r9\().2d, \r7\().2d
.endm
/* Same three-stage trn1/trn2 sequence as transpose8x8.h, one element size
 * smaller: .16b, then .8h, then .4s. \t0 and \t1 are scratch registers and
 * are overwritten; the results end up in \r0..\r7.
 * NOTE(review): the exact element layout of the result is not documented
 * here -- confirm against the callers. */
.macro transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
trn1 \t0\().16b, \r0\().16b, \r1\().16b
trn2 \t1\().16b, \r0\().16b, \r1\().16b
trn1 \r1\().16b, \r2\().16b, \r3\().16b
trn2 \r3\().16b, \r2\().16b, \r3\().16b
trn1 \r0\().16b, \r4\().16b, \r5\().16b
trn2 \r5\().16b, \r4\().16b, \r5\().16b
trn1 \r2\().16b, \r6\().16b, \r7\().16b
trn2 \r7\().16b, \r6\().16b, \r7\().16b
trn1 \r4\().8h, \r0\().8h, \r2\().8h
trn2 \r2\().8h, \r0\().8h, \r2\().8h
trn1 \r6\().8h, \r5\().8h, \r7\().8h
trn2 \r7\().8h, \r5\().8h, \r7\().8h
trn1 \r5\().8h, \t1\().8h, \r3\().8h
trn2 \t1\().8h, \t1\().8h, \r3\().8h
trn1 \r3\().8h, \t0\().8h, \r1\().8h
trn2 \t0\().8h, \t0\().8h, \r1\().8h
trn1 \r0\().4s, \r3\().4s, \r4\().4s
trn2 \r4\().4s, \r3\().4s, \r4\().4s
trn1 \r1\().4s, \r5\().4s, \r6\().4s
trn2 \r5\().4s, \r5\().4s, \r6\().4s
trn2 \r6\().4s, \t0\().4s, \r2\().4s
trn1 \r2\().4s, \t0\().4s, \r2\().4s
trn1 \r3\().4s, \t1\().4s, \r7\().4s
trn2 \r7\().4s, \t1\().4s, \r7\().4s
.endm
/* Two trn1/trn2 stages over \r0..\r3 on full 128-bit registers: byte lanes
 * (.16b) into the scratch registers \t4..\t7, then halfword lanes (.8h)
 * back into \r0..\r3. */
.macro transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().16b, \r0\().16b, \r1\().16b
trn2 \t5\().16b, \r0\().16b, \r1\().16b
trn1 \t6\().16b, \r2\().16b, \r3\().16b
trn2 \t7\().16b, \r2\().16b, \r3\().16b
trn1 \r0\().8h, \t4\().8h, \t6\().8h
trn2 \r2\().8h, \t4\().8h, \t6\().8h
trn1 \r1\().8h, \t5\().8h, \t7\().8h
trn2 \r3\().8h, \t5\().8h, \t7\().8h
.endm
/* 64-bit variant of transpose_4x16.b: byte lanes (.8b) into the scratch
 * registers \t4..\t7, then halfword lanes (.4h) back into \r0..\r3. */
.macro transpose_4x8.b r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().8b, \r0\().8b, \r1\().8b
trn2 \t5\().8b, \r0\().8b, \r1\().8b
trn1 \t6\().8b, \r2\().8b, \r3\().8b
trn2 \t7\().8b, \r2\().8b, \r3\().8b
trn1 \r0\().4h, \t4\().4h, \t6\().4h
trn2 \r2\().4h, \t4\().4h, \t6\().4h
trn1 \r1\().4h, \t5\().4h, \t7\().4h
trn2 \r3\().4h, \t5\().4h, \t7\().4h
.endm

View File

@@ -0,0 +1,82 @@
/*****************************************************************************
* bitstream-a.S: aarch64 bitstream functions
*****************************************************************************
* Copyright (C) 2014-2025 x264 project
*
* Authors: Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
// Copies the byte range [x1, x2) to x0, inserting a 0x03 byte before any
// byte <= 3 that follows two zero bytes.
//   x0 = destination write pointer (returned, advanced past the output)
//   x1 = source read pointer
//   x2 = end of source
// Register roles:
//   v0 = previous 16 source/output bytes (only bytes 14 and 15 are consumed)
//   v4 = constant 4 (threshold for "byte <= 3"), w3 = 3 (the inserted byte)
//   x6 = negative count of remaining bytes / negative index into the chunk
//   w5 = rolling history of the most recently written bytes (scalar path)
function nal_escape_neon, export=1
// history starts as all 0xff so that no zero run is seen before the data
movi v0.16b, #0xff
movi v4.16b, #4
mov w3, #3
subs x6, x1, x2
// nothing to do when src == end
cbz x6, 99f
0:
// x6 < -15 means at least 16 bytes remain: take the vector path
cmn x6, #15
b.lt 16f
// fewer than 16 bytes left: index the tail relative to end with x6 < 0
mov x1, x2
b 100f
16:
ld1 {v1.16b}, [x1], #16
// v2/v3 = the bytes two and one positions before each byte of v1
ext v2.16b, v0.16b, v1.16b, #14
ext v3.16b, v0.16b, v1.16b, #15
// v7 = byte <= 3, v5 = byte[-2] == 0, v6 = byte[-1] == 0
cmhi v7.16b, v4.16b, v1.16b
cmeq v5.16b, v2.16b, #0
cmeq v6.16b, v3.16b, #0
and v5.16b, v5.16b, v7.16b
and v5.16b, v5.16b, v6.16b
// narrow the per-byte mask to 64 bits so it can be tested in a GPR
shrn v7.8b, v5.8h, #4
mov x7, v7.d[0]
// no position needs escaping: store the chunk unchanged (second 16: label)
cbz x7, 16f
// otherwise redo this chunk byte by byte; x1 already points past it
mov x6, #-16
100:
// rebuild the two-byte history from the tail of v0
umov w5, v0.b[14]
umov w4, v0.b[15]
orr w5, w4, w5, lsl #8
101:
ldrb w4, [x1, x6]
// w9 = (two previous bytes << 16) | current byte; <= 3 means escape needed
orr w9, w4, w5, lsl #16
cmp w9, #3
b.hi 102f
// emit the 0x03 byte and push it into the history
strb w3, [x0], #1
orr w5, w3, w5, lsl #8
102:
adds x6, x6, #1
strb w4, [x0], #1
orr w5, w4, w5, lsl #8
// flags still come from the adds above: loop while the index is negative
b.lt 101b
subs x6, x1, x2
// write the last two bytes of history back for the vector path
lsr w9, w5, #8
mov v0.b[14], w9
mov v0.b[15], w5
b.lt 0b
ret
16:
// fast path: chunk contains nothing to escape
subs x6, x1, x2
st1 {v1.16b}, [x0], #16
mov v0.16b, v1.16b
b.lt 0b
99:
ret
endfunc

View File

@@ -0,0 +1,32 @@
/*****************************************************************************
* bitstream.h: aarch64 bitstream functions
*****************************************************************************
* Copyright (C) 2017-2025 x264 project
*
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_AARCH64_BITSTREAM_H
#define X264_AARCH64_BITSTREAM_H
/* NEON implementation of NAL byte escaping (see bitstream-a.S).
 * Copies the bytes in [src, end) to dst, inserting a 0x03 byte before any
 * byte <= 3 that follows two zero bytes, and returns the advanced dst
 * pointer (one past the last byte written). */
#define x264_nal_escape_neon x264_template(nal_escape_neon)
uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end );
#endif

131
common/aarch64/cabac-a.S Normal file
View File

@@ -0,0 +1,131 @@
/*****************************************************************************
* cabac-a.S: aarch64 cabac
*****************************************************************************
* Copyright (C) 2014-2025 x264 project
*
* Authors: Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
#include "asm-offsets.h"
// w11 holds x264_cabac_t.i_low
// w12 holds x264_cabac_t.i_range
// Encodes one context-coded bin.
//   x0 = pointer to the cabac context
//   w1 = context index (byte offset added to CABAC_STATE)
//   w2 = bin value b
// Also hosts the shared labels cabac_encode_renorm and cabac_putbyte; the
// bypass and terminal encoders branch to cabac_putbyte with
//   w11 = i_low, w12 = i_range, w2 = updated i_queue, x0 = context.
function cabac_encode_decision_asm, export=1
add w10, w1, #CABAC_STATE
ldrb w3, [x0, w10, uxtw] // i_state
ldr w12, [x0, #CABAC_I_RANGE]
// table base is biased by -4 and then indexed with i_range>>6
// NOTE(review): presumably because i_range>>6 is never below 4 - confirm
movrel x8, X264(cabac_range_lps), -4
movrel x9, X264(cabac_transition)
// x4 = i_state >> 1, scaled by 4 below to select a 4-byte table row
ubfx x4, x3, #1, #7
asr w5, w12, #6
add x8, x8, x4, lsl #2
// w14 = i_state*2 | b, index into the transition table
orr w14, w2, w3, lsl #1
ldrb w4, [x8, w5, uxtw] // i_range_lps
ldr w11, [x0, #CABAC_I_LOW]
eor w6, w2, w3 // b ^ i_state
ldrb w9, [x9, w14, uxtw]
// i_range -= i_range_lps; w7 = i_low + reduced range
sub w12, w12, w4
add w7, w11, w12
tst w6, #1 // (b ^ i_state) & 1
// when b differs from the low bit of i_state:
// i_range = i_range_lps and i_low += reduced range
csel w12, w4, w12, ne
csel w11, w7, w11, ne
strb w9, [x0, w10, uxtw] // i_state
cabac_encode_renorm:
ldr w2, [x0, #CABAC_I_QUEUE]
// shift = clz(i_range) - 23, applied to both i_low and i_range
clz w5, w12
sub w5, w5, #23
lsl w11, w11, w5
lsl w12, w12, w5
// i_queue += shift; a non-negative queue means bytes must be flushed
adds w2, w2, w5
b.ge cabac_putbyte
stp w11, w12, [x0, #CABAC_I_LOW] // store i_low, i_range
str w2, [x0, #CABAC_I_QUEUE]
ret
.align 5
cabac_putbyte:
ldr w6, [x0, #CABAC_I_BYTES_OUTSTANDING]
add w14, w2, #10
mov w13, #-1
sub w2, w2, #8
asr w4, w11, w14 // out
// keep only the low (i_queue + 10) bits of i_low
lsl w13, w13, w14
subs w5, w4, #0xff
bic w11, w11, w13
// out == 0xff: just count it as outstanding and skip the write
cinc w6, w6, eq
b.eq 0f
1:
ldr x7, [x0, #CABAC_P]
asr w5, w4, #8 // carry
// propagate the carry into the previously written byte
ldurb w8, [x7, #-1]
add w8, w8, w5
// w5 = carry - 1: the fill byte for the outstanding bytes
sub w5, w5, #1
sturb w8, [x7, #-1]
cbz w6, 3f
2:
// emit the fill byte once per outstanding byte; w6 ends at 0
subs w6, w6, #1
strb w5, [x7], #1
b.gt 2b
3:
strb w4, [x7], #1
str x7, [x0, #CABAC_P]
0:
stp w11, w12, [x0, #CABAC_I_LOW] // store i_low, i_range
stp w2, w6, [x0, #CABAC_I_QUEUE] // store i_queue, i_bytes_outstanding
ret
endfunc
// Encodes one bypass bin.
//   x0 = pointer to the cabac context
//   w1 = bin mask that is ANDed with i_range before being added to i_low
// Computes i_low = (i_low << 1) + (w1 & i_range) and increments i_queue.
// When the queue becomes non-negative, control is handed to cabac_putbyte
// with w11 = i_low, w12 = i_range and w2 = i_queue; that code stores the
// state and returns to the caller of this function.
function cabac_encode_bypass_asm, export=1, align=5
    // fetch the three state words; the loads are independent of each other
    ldr         w2,  [x0, #CABAC_I_QUEUE]
    ldr         w11, [x0, #CABAC_I_LOW]
    ldr         w12, [x0, #CABAC_I_RANGE]

    and         w1,  w1,  w12
    add         w11, w1,  w11, lsl #1
    adds        w2,  w2,  #1
    b.ge        cabac_putbyte

    // no byte to flush: write back the queue counter and i_low only
    str         w2,  [x0, #CABAC_I_QUEUE]
    str         w11, [x0, #CABAC_I_LOW]
    ret
endfunc
// Encodes the terminal-bin path that only shrinks the range.
//   x0 = pointer to the cabac context
// i_range is reduced by 2. If bit 8 of the result is still set, only
// i_range is stored. Otherwise i_low and i_range are both shifted left by
// one, i_queue is incremented, and cabac_putbyte is entered when the queue
// becomes non-negative (with w11 = i_low, w12 = i_range, w2 = i_queue).
function cabac_encode_terminal_asm, export=1, align=5
ldr w12, [x0, #CABAC_I_RANGE]
sub w12, w12, #2
// bit 8 clear: a one-bit renormalisation is required
tbz w12, #8, 1f
str w12, [x0, #CABAC_I_RANGE]
ret
1:
ldr w2, [x0, #CABAC_I_QUEUE]
ldr w11, [x0, #CABAC_I_LOW]
lsl w12, w12, #1
adds w2, w2, #1
// lsl does not touch the flags set by adds above
lsl w11, w11, #1
b.ge cabac_putbyte
stp w11, w12, [x0, #CABAC_I_LOW] // store i_low, i_range
str w2, [x0, #CABAC_I_QUEUE]
ret
endfunc

View File

@@ -0,0 +1,40 @@
/****************************************************************************
* dct-a-common.S: aarch64 transform and zigzag
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
* David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
// This file contains the NEON macros that are intended to be used by
// the SVE/SVE2 functions as well
// One-dimensional 4-point forward transform.
//   \v0-\v3 = outputs, \v4-\v7 = inputs (all four inputs are clobbered)
// Arguments are full vector operands including the arrangement (e.g. v0.4h).
// Assuming SUMSUB_AB sum, diff, a, b (defined in asm.S, not shown here)
// yields a+b and a-b, the result is, with s12 = \v5+\v6, d12 = \v5-\v6,
// s03 = \v4+\v7, d03 = \v4-\v7:
//   \v0 = s03 + s12      \v1 = 2*d03 + d12
//   \v2 = s03 - s12      \v3 = d03 - 2*d12
.macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7
SUMSUB_AB \v1, \v6, \v5, \v6
SUMSUB_AB \v3, \v7, \v4, \v7
add \v0, \v3, \v1
// \v4 and \v5 are reused as temporaries for the doubled differences
add \v4, \v7, \v7
add \v5, \v6, \v6
sub \v2, \v3, \v1
add \v1, \v4, \v6
sub \v3, \v7, \v5
.endm

View File

@@ -0,0 +1,88 @@
/****************************************************************************
* dct-a-sve.S: aarch64 transform and zigzag
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
#include "dct-a-common.S"
ENABLE_SVE
// 4x4 forward transform of the difference between two pixel blocks (SVE
// loads, NEON arithmetic).
//   x0 = output, 16 halfword coefficients (32 bytes)
//   x1 = first pixel block, row stride FENC_STRIDE
//   x2 = second pixel block, row stride FDEC_STRIDE
// Each ld1b reads 4 bytes (predicate vl4) and widens them to halfword lanes,
// so the row differences can be formed with a plain 16-bit sub.
function sub4x4_dct_sve, export=1
mov x3, #FENC_STRIDE
mov x4, #FDEC_STRIDE
ptrue p0.h, vl4
ld1b {z0.h}, p0/z, [x1]
add x1, x1, x3
ld1b {z1.h}, p0/z, [x2]
add x2, x2, x4
ld1b {z2.h}, p0/z, [x1]
add x1, x1, x3
sub v16.4h, v0.4h, v1.4h
ld1b {z3.h}, p0/z, [x2]
add x2, x2, x4
ld1b {z4.h}, p0/z, [x1]
add x1, x1, x3
sub v17.4h, v2.4h, v3.4h
ld1b {z5.h}, p0/z, [x2]
add x2, x2, x4
ld1b {z6.h}, p0/z, [x1]
sub v18.4h, v4.4h, v5.4h
ld1b {z7.h}, p0/z, [x2]
sub v19.4h, v6.4h, v7.4h
// first 1-D pass, transpose, second 1-D pass
DCT_1D v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
DCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
st1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
ret
endfunc
// Splits 64 halfword coefficients into four groups of 16 (every fourth
// coefficient) and records which groups contain a non-zero value.
//   x0 = destination coefficients (128 bytes written)
//   x1 = source coefficients (128 bytes read)
//   x2 = flag bytes; offsets 0, 1, 8 and 9 are written with 0 or 1
// The ld4 loads de-interleave with a stride of 4, so v0/v4 hold group 0,
// v1/v5 group 1, v2/v6 group 2 and v3/v7 group 3.
function zigzag_interleave_8x8_cavlc_sve, export=1
mov z31.s, #1
ptrue p2.s, vl2
ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64
ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64
// unsigned max is sufficient: only zero versus non-zero matters below
umax v16.8h, v0.8h, v4.8h
umax v17.8h, v1.8h, v5.8h
umax v18.8h, v2.8h, v6.8h
umax v19.8h, v3.8h, v7.8h
st1 {v0.8h}, [x0], #16
st1 {v4.8h}, [x0], #16
umaxp v16.8h, v16.8h, v17.8h
umaxp v18.8h, v18.8h, v19.8h
st1 {v1.8h}, [x0], #16
st1 {v5.8h}, [x0], #16
// after this reduction each 32-bit lane of v16 covers one group
umaxp v16.8h, v16.8h, v18.8h
st1 {v2.8h}, [x0], #16
st1 {v6.8h}, [x0], #16
// lane >= 1 gives all-ones, masked down to the value 1
cmhs v16.4s, v16.4s, v31.4s
st1 {v3.8h}, [x0], #16
and v16.16b, v16.16b, v31.16b
st1 {v7.8h}, [x0], #16
// store the low byte of the first two 32-bit lanes at x2[0], x2[1]
st1b {z16.s}, p2, [x2]
add x2, x2, #8
// bring lanes 2 and 3 down and store them at the original x2[8], x2[9]
mov v16.d[0], v16.d[1]
st1b {z16.s}, p2, [x2]
ret
endfunc

View File

@@ -0,0 +1,90 @@
/****************************************************************************
* dct-a-sve2.S: aarch64 transform and zigzag
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
#include "dct-a-common.S"
ENABLE_SVE
ENABLE_SVE2
// 4x4 inverse transform added to a pixel block (SVE2 loads/stores/narrowing,
// NEON butterflies).
//   x0 = destination pixels, row stride FDEC_STRIDE (read and written)
//   x1 = 16 halfword coefficients
// x11 is a read cursor over the destination rows so that x0 stays at row 0
// for the final stores. p0 (8 halfword lanes) is used for the pixel loads,
// p1 (4 halfword lanes) for the rounding shifts and the stores.
function add4x4_idct_sve2, export=1
mov x2, #FDEC_STRIDE
mov x11, x0
ptrue p0.h, vl8
ptrue p1.h, vl4
// v0 = coefficient rows 0 and 1, v1 = rows 2 and 3
ld1 {v0.8h, v1.8h}, [x1]
SUMSUB_AB v4.8h, v5.8h, v0.8h, v1.8h
sshr v7.8h, v0.8h, #1
sshr v6.8h, v1.8h, #1
sub v7.8h, v7.8h, v1.8h
add v6.8h, v6.8h, v0.8h
// the odd-row terms were computed in the upper halves; move them down
mov v7.d[0], v7.d[1]
mov v6.d[0], v6.d[1]
ld1b {z28.h}, p0/z, [x11]
add x11, x11, x2
SUMSUB_AB v0.8h, v2.8h, v4.8h, v6.8h
SUMSUB_AB v1.8h, v3.8h, v5.8h, v7.8h
transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19
// second pass on the transposed data (note the v3/v2 operand order)
SUMSUB_AB v4.4h, v5.4h, v0.4h, v3.4h
sshr v7.4h, v1.4h, #1
sshr v6.4h, v2.4h, #1
sub v7.4h, v7.4h, v2.4h
add v6.4h, v6.4h, v1.4h
ld1b {z29.h}, p0/z, [x11]
add x11, x11, x2
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
// rounding shift by 6 on the four active lanes
srshr z0.h, p1/m, z0.h, #6
srshr z1.h, p1/m, z1.h, #6
// pixel row 2 goes to z31 and row 3 to z30: v3 holds output row 2 and v2
// output row 3, matching the store order below
ld1b {z31.h}, p0/z, [x11]
add x11, x11, x2
srshr z2.h, p1/m, z2.h, #6
srshr z3.h, p1/m, z3.h, #6
ld1b {z30.h}, p0/z, [x11]
add v0.8h, v0.8h, v28.8h
add v1.8h, v1.8h, v29.8h
add v2.8h, v2.8h, v30.8h
add v3.8h, v3.8h, v31.8h
// saturate to unsigned 8 bit, leaving the result in the low byte of each
// halfword so st1b with .h elements can store it
sqxtunb z0.b, z0.h
sqxtunb z1.b, z1.h
sqxtunb z2.b, z2.h
sqxtunb z3.b, z3.h
st1b {z0.h}, p1, [x0]
add x0, x0, x2
st1b {z1.h}, p1, [x0]
add x0, x0, x2
st1b {z3.h}, p1, [x0]
add x0, x0, x2
st1b {z2.h}, p1, [x0]
ret
endfunc

998
common/aarch64/dct-a.S Normal file
View File

@@ -0,0 +1,998 @@
/****************************************************************************
* dct-a.S: aarch64 transform and zigzag
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
#include "dct-a-common.S"
// tbl byte indices used by zigzag_scan_4x4_frame_neon. Each pair (2k, 2k+1)
// selects halfword coefficient k from a 32-byte source, giving the output
// order 0 4 1 2 5 8 12 9 6 3 7 10 13 14 11 15.
const scan4x4_frame, align=4
.byte 0,1, 8,9, 2,3, 4,5
.byte 10,11, 16,17, 24,25, 18,19
.byte 12,13, 6,7, 14,15, 20,21
.byte 26,27, 28,29, 22,23, 30,31
endconst
// tbl byte indices used by zigzag_scan_4x4_field_neon. Only the first eight
// halfword coefficients are permuted (output order 0 1 4 2 3 5 6 7); the
// caller stores the remaining eight unchanged.
const scan4x4_field, align=4
.byte 0,1, 2,3, 8,9, 4,5
.byte 6,7, 10,11, 12,13, 14,15
endconst
// Byte permutation loaded by the frame variants of zigzag_sub_4x4 and
// applied with tbl to the 16 pixels of a 4x4 block (entry i selects the
// source pixel that lands at scan position i).
const sub4x4_frame, align=4
.byte 0, 1, 4, 8
.byte 5, 2, 3, 6
.byte 9, 12, 13, 10
.byte 7, 11, 14, 15
endconst
// Byte permutation loaded by the field variants of zigzag_sub_4x4 and
// applied with tbl to the 16 pixels of a 4x4 block (entry i selects the
// source pixel that lands at scan position i).
const sub4x4_field, align=4
.byte 0, 4, 1, 8
.byte 12, 5, 9, 13
.byte 2, 6, 10, 14
.byte 3, 7, 11, 15
endconst
// sum = a + (b>>shift) sub = (a>>shift) - b
// \shift is an immediate, the remaining arguments are full vector operands.
// \t0 and \t1 are scratch and are clobbered.
// Both shifts are taken before either result is written, and \sum is
// written before \sub. A caller in this file passes the same register for
// \sub and \a, which is safe with this ordering; do not reorder.
.macro SUMSUB_SHR shift sum sub a b t0 t1
sshr \t0, \b, #\shift
sshr \t1, \a, #\shift
add \sum, \a, \t0
sub \sub, \t1, \b
.endm
// sum = (a>>shift) + b sub = a - (b>>shift)
// \shift is an immediate, the remaining arguments are full vector operands.
// \t0 and \t1 are scratch and are clobbered. \sum is written before \sub is
// computed, so \sum must not name the same register as \a or \t1.
.macro SUMSUB_SHR2 shift sum sub a b t0 t1
sshr \t0, \a, #\shift
sshr \t1, \b, #\shift
add \sum, \t0, \b
sub \sub, \a, \t1
.endm
// a += 1.5*ma b -= 1.5*mb
// The factor 1.5 is formed as x + (x >> 1) with an arithmetic shift.
// \a and \b are updated in place; \ma and \mb are only read.
// \t0 and \t1 are scratch and are clobbered.
.macro SUMSUB_15 a b ma mb t0 t1
sshr \t0, \ma, #1
sshr \t1, \mb, #1
add \t0, \t0, \ma
add \t1, \t1, \mb
add \a, \a, \t0
sub \b, \b, \t1
.endm
// In-place 4x4 transform of 16 halfword values built purely from add/sub
// butterflies, with a final halving step.
//   x0 = 16 halfwords, read and overwritten
// The last stage uses halving instructions instead of a separate shift:
//   srhadd gives (a + b + 1) >> 1
//   shsub on the +1 biased operand gives (a + 1 - b) >> 1
// NOTE(review): SUMSUB_AB and the two-output transpose macro come from
// asm.S and are not shown here; they are assumed to be a+b / a-b and a
// trn1/trn2 pair respectively.
function dct4x4dc_neon, export=1
ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
// v31 = 1, the rounding bias for the shsub results
movi v31.4h, #1
SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h
SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h
transpose v4.4h, v6.4h, v0.4h, v2.4h
transpose v5.4h, v7.4h, v1.4h, v3.4h
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
transpose v4.2s, v5.2s, v0.2s, v1.2s
transpose v6.2s, v7.2s, v2.2s, v3.2s
add v16.4h, v4.4h, v31.4h
add v17.4h, v6.4h, v31.4h
srhadd v0.4h, v4.4h, v5.4h
shsub v1.4h, v16.4h, v5.4h
shsub v2.4h, v17.4h, v7.4h
srhadd v3.4h, v6.4h, v7.4h
st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
ret
endfunc
// In-place 4x4 transform of 16 halfword values using the same butterfly and
// transpose sequence as dct4x4dc_neon, but finishing with plain
// SUMSUB_AB operations instead of the rounded halving step.
//   x0 = 16 halfwords, read and overwritten
function idct4x4dc_neon, export=1
ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h
SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h
transpose v4.4h, v6.4h, v0.4h, v2.4h
transpose v5.4h, v7.4h, v1.4h, v3.4h
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
transpose v4.2s, v5.2s, v0.2s, v1.2s
transpose v6.2s, v7.2s, v2.2s, v3.2s
// final stage: no scaling is applied
SUMSUB_AB v0.4h, v1.4h, v4.4h, v5.4h
SUMSUB_AB v3.4h, v2.4h, v6.4h, v7.4h
st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
ret
endfunc
// 4x4 forward transform of the difference between two pixel blocks.
//   x0 = output, 16 halfword coefficients (32 bytes)
//   x1 = first pixel block, row stride FENC_STRIDE
//   x2 = second pixel block, row stride FDEC_STRIDE
// Each row is loaded as one 32-bit lane (4 pixels); usubl widens the
// unsigned bytes and subtracts, so only the low four halfwords of
// v16-v19 carry data.
function sub4x4_dct_neon, export=1
mov x3, #FENC_STRIDE
mov x4, #FDEC_STRIDE
ld1 {v0.s}[0], [x1], x3
ld1 {v1.s}[0], [x2], x4
ld1 {v2.s}[0], [x1], x3
usubl v16.8h, v0.8b, v1.8b
ld1 {v3.s}[0], [x2], x4
ld1 {v4.s}[0], [x1], x3
usubl v17.8h, v2.8b, v3.8b
ld1 {v5.s}[0], [x2], x4
ld1 {v6.s}[0], [x1], x3
usubl v18.8h, v4.8b, v5.8b
ld1 {v7.s}[0], [x2], x4
usubl v19.8h, v6.8b, v7.8b
// first 1-D pass, transpose, second 1-D pass
DCT_1D v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
DCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
st1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
ret
endfunc
// File-local helper (not exported): forward transform of an 8-wide, 4-row
// pixel difference, producing 32 halfword coefficients.
//   x0 = output cursor, advanced by 64 bytes
//   x1 = first pixel block, advanced by 4 rows
//   x2 = second pixel block, advanced by 4 rows
//   x3 = row stride for x1, x4 = row stride for x2 (set up by the caller)
// Only x0-x4 and vector registers are touched, which lets the callers in
// this file keep the return address in x5 across bl.
function sub8x4_dct_neon
ld1 {v0.8b}, [x1], x3
ld1 {v1.8b}, [x2], x4
usubl v16.8h, v0.8b, v1.8b
ld1 {v2.8b}, [x1], x3
ld1 {v3.8b}, [x2], x4
usubl v17.8h, v2.8b, v3.8b
ld1 {v4.8b}, [x1], x3
ld1 {v5.8b}, [x2], x4
usubl v18.8h, v4.8b, v5.8b
ld1 {v6.8b}, [x1], x3
ld1 {v7.8b}, [x2], x4
usubl v19.8h, v6.8b, v7.8b
DCT_1D v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7
// second 1-D pass written out inline (same arithmetic as DCT_1D)
SUMSUB_AB v16.8h, v19.8h, v0.8h, v3.8h
SUMSUB_AB v17.8h, v18.8h, v1.8h, v2.8h
add v22.8h, v19.8h, v19.8h
add v21.8h, v18.8h, v18.8h
add v0.8h, v16.8h, v17.8h
sub v1.8h, v16.8h, v17.8h
add v2.8h, v22.8h, v18.8h
sub v3.8h, v19.8h, v21.8h
// regroup 64-bit halves: the low halves of v0,v2,v1,v3 are stored first
// (v4, v5), then the high halves (v6, v7)
zip1 v4.2d, v0.2d, v2.2d
zip2 v6.2d, v0.2d, v2.2d
zip1 v5.2d, v1.2d, v3.2d
zip2 v7.2d, v1.2d, v3.2d
st1 {v4.8h}, [x0], #16
st1 {v5.8h}, [x0], #16
st1 {v6.8h}, [x0], #16
st1 {v7.8h}, [x0], #16
ret
endfunc
// 8x8 pixel-difference transform, done as two 8x4 halves.
//   x0 = coefficient output, x1 = first pixel block (FENC_STRIDE rows),
//   x2 = second pixel block (FDEC_STRIDE rows)
// sub8x4_dct_neon advances x0, x1 and x2 itself, so the second half simply
// continues where the first one stopped.
function sub8x8_dct_neon, export=1
    // strides expected by the helper in x3 / x4
    mov         x4,  #FDEC_STRIDE
    mov         x3,  #FENC_STRIDE
    // keep the return address in x5 across the nested call
    mov         x5,  x30
    bl          sub8x4_dct_neon
    // restore the link register and tail-call for the lower half
    mov         x30, x5
    b           sub8x4_dct_neon
endfunc
// 16x16 pixel-difference transform built from eight sub8x4_dct_neon calls.
//   x0 = coefficient output (advanced by the helper)
//   x1 = first pixel block, row stride FENC_STRIDE
//   x2 = second pixel block, row stride FDEC_STRIDE
// Each pair of calls covers one 8x8 quadrant. The pointer adjustments
// between pairs visit the quadrants in the order top-left, top-right,
// bottom-left, bottom-right. The return address is kept in x5 and the last
// call is a tail call.
function sub16x16_dct_neon, export=1
mov x5, x30
mov x3, #FENC_STRIDE
mov x4, #FDEC_STRIDE
bl sub8x4_dct_neon
bl sub8x4_dct_neon
// back up 8 rows and move 8 pixels right
sub x1, x1, #8*FENC_STRIDE-8
sub x2, x2, #8*FDEC_STRIDE-8
bl sub8x4_dct_neon
bl sub8x4_dct_neon
// already 8 rows down: just return to column 0
sub x1, x1, #8
sub x2, x2, #8
bl sub8x4_dct_neon
bl sub8x4_dct_neon
// back up 8 rows and move 8 pixels right
sub x1, x1, #8*FENC_STRIDE-8
sub x2, x2, #8*FDEC_STRIDE-8
bl sub8x4_dct_neon
mov x30, x5
b sub8x4_dct_neon
endfunc
// One-dimensional 8-point forward transform on fixed registers.
//   input:  v0-v7 (.8h)
//   output: v0-v7 (.8h)
//   scratch: v16-v31 are clobbered
// The \type argument (callers pass row or col) is not referenced in the
// body; both passes expand to identical code.
.macro DCT8_1D type
SUMSUB_AB v18.8h, v17.8h, v3.8h, v4.8h // s34/d34
SUMSUB_AB v19.8h, v16.8h, v2.8h, v5.8h // s25/d25
SUMSUB_AB v22.8h, v21.8h, v1.8h, v6.8h // s16/d16
SUMSUB_AB v23.8h, v20.8h, v0.8h, v7.8h // s07/d07
SUMSUB_AB v24.8h, v26.8h, v23.8h, v18.8h // a0/a2
SUMSUB_AB v25.8h, v27.8h, v22.8h, v19.8h // a1/a3
SUMSUB_AB v30.8h, v29.8h, v20.8h, v17.8h // a6/a5
// subtract 1.5*d16 from a6 and 1.5*d25 from a5 (x + (x >> 1))
sshr v23.8h, v21.8h, #1
sshr v18.8h, v16.8h, #1
add v23.8h, v23.8h, v21.8h
add v18.8h, v18.8h, v16.8h
sub v30.8h, v30.8h, v23.8h
sub v29.8h, v29.8h, v18.8h
SUMSUB_AB v28.8h, v31.8h, v21.8h, v16.8h // a4/a7
// add 1.5*d07 to a4 (result in v22) and 1.5*d34 to a7
sshr v22.8h, v20.8h, #1
sshr v19.8h, v17.8h, #1
add v22.8h, v22.8h, v20.8h
add v19.8h, v19.8h, v17.8h
add v22.8h, v28.8h, v22.8h
add v31.8h, v31.8h, v19.8h
// final combination into the output registers
SUMSUB_AB v0.8h, v4.8h, v24.8h, v25.8h
SUMSUB_SHR 2, v1.8h, v7.8h, v22.8h, v31.8h, v16.8h, v17.8h
SUMSUB_SHR 1, v2.8h, v6.8h, v26.8h, v27.8h, v18.8h, v19.8h
SUMSUB_SHR2 2, v3.8h, v5.8h, v30.8h, v29.8h, v20.8h, v21.8h
.endm
// 8x8 forward transform (8-point) of the difference between two blocks.
//   x0 = output, 64 halfword coefficients; advanced by 128 bytes
//   x1 = first pixel block, row stride FENC_STRIDE; advanced by 8 rows
//   x2 = second pixel block, row stride FDEC_STRIDE; advanced by 8 rows
// Uses x3/x4 for the strides and all of v0-v7 and v16-v31. x7 is left
// untouched, which sub16x16_dct8_neon relies on to keep its return address.
function sub8x8_dct8_neon, export=1
mov x3, #FENC_STRIDE
mov x4, #FDEC_STRIDE
// load 8 rows from each block and form widened differences in v0-v7
ld1 {v16.8b}, [x1], x3
ld1 {v17.8b}, [x2], x4
ld1 {v18.8b}, [x1], x3
ld1 {v19.8b}, [x2], x4
usubl v0.8h, v16.8b, v17.8b
ld1 {v20.8b}, [x1], x3
ld1 {v21.8b}, [x2], x4
usubl v1.8h, v18.8b, v19.8b
ld1 {v22.8b}, [x1], x3
ld1 {v23.8b}, [x2], x4
usubl v2.8h, v20.8b, v21.8b
ld1 {v24.8b}, [x1], x3
ld1 {v25.8b}, [x2], x4
usubl v3.8h, v22.8b, v23.8b
ld1 {v26.8b}, [x1], x3
ld1 {v27.8b}, [x2], x4
usubl v4.8h, v24.8b, v25.8b
ld1 {v28.8b}, [x1], x3
ld1 {v29.8b}, [x2], x4
usubl v5.8h, v26.8b, v27.8b
ld1 {v30.8b}, [x1], x3
ld1 {v31.8b}, [x2], x4
usubl v6.8h, v28.8b, v29.8b
usubl v7.8h, v30.8b, v31.8b
// first pass, transpose, second pass
DCT8_1D row
transpose8x8.h v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
DCT8_1D col
st1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], #64
st1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], #64
ret
endfunc
// 16x16 pixel-difference transform built from four sub8x8_dct8_neon calls.
//   x0 = coefficient output (advanced by the callee)
//   x1 = first pixel block, row stride FENC_STRIDE
//   x2 = second pixel block, row stride FDEC_STRIDE
// The callee advances x1/x2 by 8 rows, so the adjustments below visit the
// quadrants top-left, top-right, bottom-left, bottom-right. The return
// address is parked in x7 and the last call is a tail call.
function sub16x16_dct8_neon, export=1
mov x7, x30
bl X(sub8x8_dct8_neon)
// back up 8 rows and move 8 pixels right
sub x1, x1, #FENC_STRIDE*8 - 8
sub x2, x2, #FDEC_STRIDE*8 - 8
bl X(sub8x8_dct8_neon)
// already 8 rows down: return to column 0
sub x1, x1, #8
sub x2, x2, #8
bl X(sub8x8_dct8_neon)
mov x30, x7
sub x1, x1, #FENC_STRIDE*8 - 8
sub x2, x2, #FDEC_STRIDE*8 - 8
b X(sub8x8_dct8_neon)
endfunc
// First part of the 4-point IDCT; callers finish it with two SUMSUB_AB
// operations on \d4/\d6 and \d5/\d7.
//   \d0-\d3 = inputs (only read), \d4-\d7 = outputs
//   \d4, \d5 = SUMSUB_AB of \d0 and \d2
//   \d6 = (\d3 >> 1) + \d1
//   \d7 = (\d1 >> 1) - \d3
.macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
SUMSUB_AB \d4, \d5, \d0, \d2
sshr \d7, \d1, #1
sshr \d6, \d3, #1
sub \d7, \d7, \d3
add \d6, \d6, \d1
.endm
// 4x4 inverse transform added to a pixel block.
//   x0 = destination pixels, row stride FDEC_STRIDE (read and written);
//        on return x0 points 4 rows below its initial value
//   x1 = 16 halfword coefficients
// Pixel rows are loaded between the arithmetic steps into v28-v31. Output
// rows end up in v0, v1, v3, v2 (in that order), which is why pixel row 2
// is loaded into v31, row 3 into v30, and v3 is stored before v2.
function add4x4_idct_neon, export=1
mov x2, #FDEC_STRIDE
ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
IDCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
ld1 {v28.s}[0], [x0], x2
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19
IDCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v3.4h, v2.4h
ld1 {v29.s}[0], [x0], x2
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
// rounding shift right by 6
srshr v0.4h, v0.4h, #6
srshr v1.4h, v1.4h, #6
ld1 {v31.s}[0], [x0], x2
srshr v2.4h, v2.4h, #6
srshr v3.4h, v3.4h, #6
ld1 {v30.s}[0], [x0], x2
// rewind the destination pointer by the 4 rows just read
sub x0, x0, x2, lsl #2
// add the widened pixels, then saturate back to unsigned 8 bit
uaddw v0.8h, v0.8h, v28.8b
uaddw v1.8h, v1.8h, v29.8b
uaddw v2.8h, v2.8h, v30.8b
uaddw v3.8h, v3.8h, v31.8b
sqxtun v0.8b, v0.8h
sqxtun v1.8b, v1.8h
sqxtun v2.8b, v2.8h
sqxtun v3.8b, v3.8h
st1 {v0.s}[0], [x0], x2
st1 {v1.s}[0], [x0], x2
st1 {v3.s}[0], [x0], x2
st1 {v2.s}[0], [x0], x2
ret
endfunc
// Inverse transform of two horizontally adjacent 4x4 coefficient blocks,
// added to an 8-wide, 4-row pixel area.
//   x0 = destination pixels (read and written); ends 4 rows further down
//   x1 = coefficients, 64 bytes consumed (pointer advanced)
//   x2 = destination row stride. It is NOT set here; the callers in this
//        file load FDEC_STRIDE into x2 before calling.
// Only x0-x2 and vector registers are touched, so callers can keep their
// return address in x5.
function add8x4_idct_neon, export=1
ld1 {v0.8h,v1.8h}, [x1], #32
ld1 {v2.8h,v3.8h}, [x1], #32
// regroup the 64-bit halves of the two blocks before the first pass
transpose v20.2d, v21.2d, v0.2d, v2.2d
transpose v22.2d, v23.2d, v1.2d, v3.2d
IDCT_1D v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
SUMSUB_AB v0.8h, v3.8h, v16.8h, v18.8h
SUMSUB_AB v1.8h, v2.8h, v17.8h, v19.8h
transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7
IDCT_1D v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h
SUMSUB_AB v0.8h, v3.8h, v16.8h, v18.8h
SUMSUB_AB v1.8h, v2.8h, v17.8h, v19.8h
// rounding shift by 6, interleaved with the pixel row loads
srshr v0.8h, v0.8h, #6
ld1 {v28.8b}, [x0], x2
srshr v1.8h, v1.8h, #6
ld1 {v29.8b}, [x0], x2
srshr v2.8h, v2.8h, #6
ld1 {v30.8b}, [x0], x2
srshr v3.8h, v3.8h, #6
ld1 {v31.8b}, [x0], x2
// rewind the destination pointer by the 4 rows just read
sub x0, x0, x2, lsl #2
uaddw v0.8h, v0.8h, v28.8b
uaddw v1.8h, v1.8h, v29.8b
uaddw v2.8h, v2.8h, v30.8b
uaddw v3.8h, v3.8h, v31.8b
// saturate to unsigned 8 bit and store
sqxtun v0.8b, v0.8h
sqxtun v1.8b, v1.8h
st1 {v0.8b}, [x0], x2
sqxtun v2.8b, v2.8h
st1 {v1.8b}, [x0], x2
sqxtun v3.8b, v3.8h
st1 {v2.8b}, [x0], x2
st1 {v3.8b}, [x0], x2
ret
endfunc
// Inverse transform of an 8x8 area, done as two 8x4 halves.
//   x0 = destination pixels (FDEC_STRIDE rows), x1 = coefficients
// add8x4_idct_neon advances x0 by four rows and x1 by 64 bytes, so the
// second half continues directly after the first.
function add8x8_idct_neon, export=1
    // keep the return address in x5 across the nested call
    mov         x5,  x30
    // destination stride expected by the helper in x2
    mov         x2,  #FDEC_STRIDE
    bl          X(add8x4_idct_neon)
    // restore the link register and tail-call for the lower half
    mov         x30, x5
    b           X(add8x4_idct_neon)
endfunc
// Inverse transform of a 16x16 area built from eight add8x4_idct_neon calls.
//   x0 = destination pixels, row stride FDEC_STRIDE
//   x1 = coefficients (advanced by the helper)
// Each pair of calls covers one 8x8 quadrant; the quadrants are visited in
// the order top-left, top-right, bottom-left, bottom-right. The return
// address is kept in x5 and the last call is a tail call.
function add16x16_idct_neon, export=1
mov x2, #FDEC_STRIDE
mov x5, x30
bl X(add8x4_idct_neon)
bl X(add8x4_idct_neon)
// back up 8 rows and move 8 pixels right
sub x0, x0, #8*FDEC_STRIDE-8
bl X(add8x4_idct_neon)
bl X(add8x4_idct_neon)
// already 8 rows down: return to column 0
sub x0, x0, #8
bl X(add8x4_idct_neon)
bl X(add8x4_idct_neon)
sub x0, x0, #8*FDEC_STRIDE-8
bl X(add8x4_idct_neon)
mov x30, x5
b X(add8x4_idct_neon)
endfunc
// One-dimensional 8-point inverse transform on fixed registers.
//   input:  v16-v23 (.8h)
//   output: v16-v23 (.8h)
//   scratch: v0-v3 are clobbered
// When \type is row, v22 and v23 are not expected to be loaded yet: they are
// fetched from [x1] (post-incremented by 32) inside the macro, after the
// first butterfly has been issued. For any other \type nothing is loaded.
// Several invocations below reuse input registers as temporaries, so the
// statement order must be kept.
.macro IDCT8_1D type
SUMSUB_AB v0.8h, v1.8h, v16.8h, v20.8h // a0/a2
.ifc \type, row
ld1 {v22.8h,v23.8h}, [x1], #32
.endif
SUMSUB_SHR 1, v2.8h, v3.8h, v18.8h, v22.8h, v16.8h, v20.8h // a6/a4
SUMSUB_AB v16.8h, v18.8h, v21.8h, v19.8h
SUMSUB_15 v16.8h, v18.8h, v17.8h, v23.8h, v20.8h, v22.8h // a7/a1
SUMSUB_AB v22.8h, v23.8h, v23.8h, v17.8h
SUMSUB_15 v23.8h, v22.8h, v21.8h, v19.8h, v20.8h, v17.8h // a5/a3
SUMSUB_SHR 2, v21.8h, v22.8h, v22.8h, v23.8h, v19.8h, v17.8h // b3/b5
SUMSUB_SHR2 2, v20.8h, v23.8h, v16.8h, v18.8h, v19.8h, v17.8h // b1/b7
SUMSUB_AB v18.8h, v2.8h, v0.8h, v2.8h // b0/b6
SUMSUB_AB v19.8h, v3.8h, v1.8h, v3.8h // b2/b4
// final butterflies producing the eight outputs
SUMSUB_AB v16.8h, v23.8h, v18.8h, v23.8h
SUMSUB_AB v17.8h, v22.8h, v19.8h, v22.8h
SUMSUB_AB v18.8h, v21.8h, v3.8h, v21.8h
SUMSUB_AB v19.8h, v20.8h, v2.8h, v20.8h
.endm
// 8x8 inverse transform (8-point) added to a pixel block.
//   x0 = destination pixels, row stride FDEC_STRIDE (read and written);
//        ends 8 rows below its initial value
//   x1 = 64 halfword coefficients; advanced by 128 bytes in total
// Only six coefficient rows are loaded here; the remaining two are loaded
// by the row invocation of IDCT8_1D. x7 is left untouched, which
// add16x16_idct8_neon relies on to keep its return address.
function add8x8_idct8_neon, export=1
mov x2, #FDEC_STRIDE
ld1 {v16.8h,v17.8h}, [x1], #32
ld1 {v18.8h,v19.8h}, [x1], #32
ld1 {v20.8h,v21.8h}, [x1], #32
IDCT8_1D row
transpose8x8.h v16, v17, v18, v19, v20, v21, v22, v23, v30, v31
IDCT8_1D col
// load the 8 pixel rows while applying the rounding shift by 6
ld1 {v0.8b}, [x0], x2
srshr v16.8h, v16.8h, #6
ld1 {v1.8b}, [x0], x2
srshr v17.8h, v17.8h, #6
ld1 {v2.8b}, [x0], x2
srshr v18.8h, v18.8h, #6
ld1 {v3.8b}, [x0], x2
srshr v19.8h, v19.8h, #6
ld1 {v4.8b}, [x0], x2
srshr v20.8h, v20.8h, #6
ld1 {v5.8b}, [x0], x2
srshr v21.8h, v21.8h, #6
ld1 {v6.8b}, [x0], x2
srshr v22.8h, v22.8h, #6
ld1 {v7.8b}, [x0], x2
srshr v23.8h, v23.8h, #6
// rewind the destination pointer by the 8 rows just read
sub x0, x0, x2, lsl #3
// add widened pixels, saturate to unsigned 8 bit, store
uaddw v16.8h, v16.8h, v0.8b
uaddw v17.8h, v17.8h, v1.8b
uaddw v18.8h, v18.8h, v2.8b
sqxtun v0.8b, v16.8h
sqxtun v1.8b, v17.8h
sqxtun v2.8b, v18.8h
uaddw v19.8h, v19.8h, v3.8b
st1 {v0.8b}, [x0], x2
uaddw v20.8h, v20.8h, v4.8b
st1 {v1.8b}, [x0], x2
uaddw v21.8h, v21.8h, v5.8b
st1 {v2.8b}, [x0], x2
sqxtun v3.8b, v19.8h
sqxtun v4.8b, v20.8h
uaddw v22.8h, v22.8h, v6.8b
uaddw v23.8h, v23.8h, v7.8b
st1 {v3.8b}, [x0], x2
sqxtun v5.8b, v21.8h
st1 {v4.8b}, [x0], x2
sqxtun v6.8b, v22.8h
sqxtun v7.8b, v23.8h
st1 {v5.8b}, [x0], x2
st1 {v6.8b}, [x0], x2
st1 {v7.8b}, [x0], x2
ret
endfunc
// Inverse transform of a 16x16 area built from four add8x8_idct8_neon calls.
//   x0 = destination pixels, row stride FDEC_STRIDE
//   x1 = coefficients (advanced by the callee)
// The callee leaves x0 8 rows further down, so the adjustments visit the
// quadrants top-left, top-right, bottom-left, bottom-right. The return
// address is parked in x7 and the last call is a tail call.
function add16x16_idct8_neon, export=1
mov x7, x30
bl X(add8x8_idct8_neon)
// back up 8 rows and move 8 pixels right
sub x0, x0, #8*FDEC_STRIDE-8
bl X(add8x8_idct8_neon)
// already 8 rows down: return to column 0
sub x0, x0, #8
bl X(add8x8_idct8_neon)
sub x0, x0, #8*FDEC_STRIDE-8
mov x30, x7
b X(add8x8_idct8_neon)
endfunc
// Adds four DC values to the four 4x4 quadrants of an 8x8 pixel block.
//   x0 = destination pixels, row stride FDEC_STRIDE (read and written);
//        ends 8 rows below its initial value
//   x1 = four halfword DC values
// Each DC value gets a rounding shift right by 6. The signed offset is then
// applied with unsigned saturating arithmetic: the offset and its negation
// are both narrowed with unsigned saturation (so one of them becomes 0),
// the first is added with uqadd and the second subtracted with uqsub.
function add8x8_idct_dc_neon, export=1
mov x2, #FDEC_STRIDE
ld1 {v16.4h}, [x1]
ld1 {v0.8b}, [x0], x2
srshr v16.4h, v16.4h, #6
ld1 {v1.8b}, [x0], x2
dup v20.8h, v16.h[0]
dup v21.8h, v16.h[1]
ld1 {v2.8b}, [x0], x2
dup v22.8h, v16.h[2]
dup v23.8h, v16.h[3]
ld1 {v3.8b}, [x0], x2
// v20 = dc0 x4 | dc1 x4 (rows 0-3), v21 = dc2 x4 | dc3 x4 (rows 4-7)
trn1 v20.2d, v20.2d, v21.2d
ld1 {v4.8b}, [x0], x2
trn1 v21.2d, v22.2d, v23.2d
ld1 {v5.8b}, [x0], x2
neg v22.8h, v20.8h
ld1 {v6.8b}, [x0], x2
neg v23.8h, v21.8h
ld1 {v7.8b}, [x0], x2
// rewind the destination pointer by the 8 rows just read
sub x0, x0, #8*FDEC_STRIDE
// positive parts in v20/v21, magnitudes of negative parts in v22/v23
sqxtun v20.8b, v20.8h
sqxtun v21.8b, v21.8h
sqxtun v22.8b, v22.8h
sqxtun v23.8b, v23.8h
uqadd v0.8b, v0.8b, v20.8b
uqadd v1.8b, v1.8b, v20.8b
uqadd v2.8b, v2.8b, v20.8b
uqadd v3.8b, v3.8b, v20.8b
uqadd v4.8b, v4.8b, v21.8b
uqadd v5.8b, v5.8b, v21.8b
uqadd v6.8b, v6.8b, v21.8b
uqadd v7.8b, v7.8b, v21.8b
uqsub v0.8b, v0.8b, v22.8b
uqsub v1.8b, v1.8b, v22.8b
uqsub v2.8b, v2.8b, v22.8b
uqsub v3.8b, v3.8b, v22.8b
uqsub v4.8b, v4.8b, v23.8b
uqsub v5.8b, v5.8b, v23.8b
uqsub v6.8b, v6.8b, v23.8b
uqsub v7.8b, v7.8b, v23.8b
st1 {v0.8b}, [x0], x2
st1 {v1.8b}, [x0], x2
st1 {v2.8b}, [x0], x2
st1 {v3.8b}, [x0], x2
st1 {v4.8b}, [x0], x2
st1 {v5.8b}, [x0], x2
st1 {v6.8b}, [x0], x2
st1 {v7.8b}, [x0], x2
ret
endfunc
// Adds four DC values to a 16-wide, 4-row strip of pixels; each DC value
// covers 4 adjacent columns.
//   \dc = register and element size holding the four values, e.g. v0.h
//         (the macro appends the lane index)
//   x0  = read pointer, x2 = write pointer, x3 = row stride; both pointers
//         are advanced by 4 rows
// Clobbers v4-v7, v20, v21 and v24-v27. The signed offset is applied as an
// unsigned saturating add of its positive part (v20) followed by an
// unsigned saturating subtract of the magnitude of its negative part (v21).
.macro ADD16x4_IDCT_DC dc
ld1 {v4.16b}, [x0], x3
dup v24.8h, \dc[0]
dup v25.8h, \dc[1]
ld1 {v5.16b}, [x0], x3
dup v26.8h, \dc[2]
dup v27.8h, \dc[3]
ld1 {v6.16b}, [x0], x3
// v24 = dc0 x4 | dc1 x4, v25 = dc2 x4 | dc3 x4
trn1 v24.2d, v24.2d, v25.2d
ld1 {v7.16b}, [x0], x3
trn1 v25.2d, v26.2d, v27.2d
neg v26.8h, v24.8h
neg v27.8h, v25.8h
sqxtun v20.8b, v24.8h
sqxtun v21.8b, v26.8h
sqxtun2 v20.16b, v25.8h
sqxtun2 v21.16b, v27.8h
uqadd v4.16b, v4.16b, v20.16b
uqadd v5.16b, v5.16b, v20.16b
uqadd v6.16b, v6.16b, v20.16b
uqadd v7.16b, v7.16b, v20.16b
uqsub v4.16b, v4.16b, v21.16b
uqsub v5.16b, v5.16b, v21.16b
uqsub v6.16b, v6.16b, v21.16b
st1 {v4.16b}, [x2], x3
uqsub v7.16b, v7.16b, v21.16b
st1 {v5.16b}, [x2], x3
st1 {v6.16b}, [x2], x3
st1 {v7.16b}, [x2], x3
.endm
// Adds 16 DC values to the sixteen 4x4 sub-blocks of a 16x16 pixel block.
//   x0 = destination pixels, row stride FDEC_STRIDE (used as read pointer)
//   x1 = sixteen halfword DC values, four per 4-row strip
// x2 is a copy of x0 used as the write pointer so that loads can run ahead
// of stores inside ADD16x4_IDCT_DC. Each group of four DC values gets a
// rounding shift right by 6 before its strip is processed.
function add16x16_idct_dc_neon, export=1
mov x2, x0
mov x3, #FDEC_STRIDE
ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
srshr v0.4h, v0.4h, #6
srshr v1.4h, v1.4h, #6
ADD16x4_IDCT_DC v0.h
srshr v2.4h, v2.4h, #6
ADD16x4_IDCT_DC v1.h
srshr v3.4h, v3.4h, #6
ADD16x4_IDCT_DC v2.h
ADD16x4_IDCT_DC v3.h
ret
endfunc
// Sums the pixel differences of an 8-wide, 4-row area column by column.
//   \dst    = result register (bare name); receives eight halfword column
//             sums of (row from x1) - (row from x2) over the 4 rows
//   \t0-\t7 = scratch registers (bare names), clobbered
//   x1, x2  = pixel pointers, advanced by 4 rows using strides x3 and x4
.macro sub4x4x2_dct_dc, dst, t0, t1, t2, t3, t4, t5, t6, t7
ld1 {\t0\().8b}, [x1], x3
ld1 {\t1\().8b}, [x2], x4
ld1 {\t2\().8b}, [x1], x3
ld1 {\t3\().8b}, [x2], x4
usubl \t0\().8h, \t0\().8b, \t1\().8b
ld1 {\t4\().8b}, [x1], x3
ld1 {\t5\().8b}, [x2], x4
usubl \t1\().8h, \t2\().8b, \t3\().8b
ld1 {\t6\().8b}, [x1], x3
ld1 {\t7\().8b}, [x2], x4
add \dst\().8h, \t0\().8h, \t1\().8h
usubl \t2\().8h, \t4\().8b, \t5\().8b
usubl \t3\().8h, \t6\().8b, \t7\().8b
add \dst\().8h, \dst\().8h, \t2\().8h
add \dst\().8h, \dst\().8h, \t3\().8h
.endm
// Computes four DC-only transform values for an 8x8 pixel difference.
//   x0 = output, four halfwords
//   x1 = first pixel block, row stride FENC_STRIDE
//   x2 = second pixel block, row stride FDEC_STRIDE
// v0 and v1 receive the column sums of the upper and lower four rows. They
// are then combined through two transpose/SUMSUB_AB butterfly stages and
// reduced with pairwise adds to the four stored values.
function sub8x8_dct_dc_neon, export=1
mov x3, #FENC_STRIDE
mov x4, #FDEC_STRIDE
sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23
sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31
transpose v2.2d, v3.2d, v0.2d, v1.2d
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
transpose v2.2d, v3.2d, v0.2d, v1.2d
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
transpose v2.2d, v3.2d, v0.2d, v1.2d
// horizontal reduction: 16 -> 8 -> 4 values, only the low 4 are stored
addp v0.8h, v2.8h, v3.8h
addp v0.8h, v0.8h, v0.8h
st1 {v0.4h}, [x0]
ret
endfunc
// Computes eight DC-only transform values for an 8-wide, 16-row pixel
// difference.
//   x0 = output, eight halfwords
//   x1 = first pixel block, row stride FENC_STRIDE
//   x2 = second pixel block, row stride FDEC_STRIDE
// v0-v3 receive the column sums of the four 4-row strips (the scratch
// register sets alternate between v16-v23 and v24-v31). The sums are
// reduced with pairwise adds and combined through three
// transpose/SUMSUB_AB butterfly stages.
function sub8x16_dct_dc_neon, export=1
mov x3, #FENC_STRIDE
mov x4, #FDEC_STRIDE
sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23
sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31
sub4x4x2_dct_dc v2, v16, v17, v18, v19, v20, v21, v22, v23
sub4x4x2_dct_dc v3, v24, v25, v26, v27, v28, v29, v30, v31
addp v4.8h, v0.8h, v2.8h
addp v5.8h, v1.8h, v3.8h
transpose v2.4s, v3.4s, v4.4s, v5.4s
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
transpose v2.4s, v3.4s, v0.4s, v1.4s
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
transpose v2.2d, v3.2d, v0.2d, v1.2d
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
// note the swapped operand order in trn2: it fixes the output ordering
trn1 v2.2d, v0.2d, v1.2d
trn2 v3.2d, v1.2d, v0.2d
addp v0.8h, v2.8h, v3.8h
st1 {v0.8h}, [x0]
ret
endfunc
// Splits 64 halfword coefficients into four groups of 16 (every fourth
// coefficient) and records which groups contain a non-zero value.
//   x0 = destination coefficients (128 bytes written)
//   x1 = source coefficients (128 bytes read)
//   x2 = flag bytes; offsets 0, 1, 8 and 9 are written with 0 or 1
// The ld4 loads de-interleave with a stride of 4, so v0/v4 hold group 0,
// v1/v5 group 1, v2/v6 group 2 and v3/v7 group 3.
function zigzag_interleave_8x8_cavlc_neon, export=1
// x3 = 7: post-increment that moves x2 from offset 1 to offset 8
mov x3, #7
movi v31.4s, #1
ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64
ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64
// unsigned max is sufficient: only zero versus non-zero matters below
umax v16.8h, v0.8h, v4.8h
umax v17.8h, v1.8h, v5.8h
umax v18.8h, v2.8h, v6.8h
umax v19.8h, v3.8h, v7.8h
st1 {v0.8h}, [x0], #16
st1 {v4.8h}, [x0], #16
umaxp v16.8h, v16.8h, v17.8h
umaxp v18.8h, v18.8h, v19.8h
st1 {v1.8h}, [x0], #16
st1 {v5.8h}, [x0], #16
// after this reduction each 32-bit lane of v16 covers one group
umaxp v16.8h, v16.8h, v18.8h
st1 {v2.8h}, [x0], #16
st1 {v6.8h}, [x0], #16
// lane >= 1 gives all-ones, masked down to the value 1
cmhs v16.4s, v16.4s, v31.4s
st1 {v3.8h}, [x0], #16
and v16.16b, v16.16b, v31.16b
st1 {v7.8h}, [x0], #16
// low byte of each 32-bit lane goes to x2[0], x2[1], x2[8], x2[9]
st1 {v16.b}[0], [x2], #1
st1 {v16.b}[4], [x2], x3
st1 {v16.b}[8], [x2], #1
st1 {v16.b}[12], [x2]
ret
endfunc
// Reorders 16 halfword coefficients according to the scan4x4_frame table.
//   x0 = destination (32 bytes written)
//   x1 = source (32 bytes read)
// The permutation is done with two 2-register tbl lookups over the 32
// source bytes.
function zigzag_scan_4x4_frame_neon, export=1
movrel x2, scan4x4_frame
ld1 {v0.16b,v1.16b}, [x1]
ld1 {v16.16b,v17.16b}, [x2]
tbl v2.16b, {v0.16b,v1.16b}, v16.16b
tbl v3.16b, {v0.16b,v1.16b}, v17.16b
st1 {v2.16b,v3.16b}, [x0]
ret
endfunc
// Generator for zigzag_sub_4x4[ac]_{field,frame}_neon.
//   \f  = field or frame; selects the sub4x4_\f permutation table
//   \ac = empty or ac; the ac variants split off the first coefficient
// Generated function:
//   x0 = output, 16 halfword differences in scan order
//   x1 = source pixels, row stride FENC_STRIDE
//   x2 = destination pixels, row stride FDEC_STRIDE
//   x3 = (ac variants only) receives the first difference as a halfword;
//        that position is written as zero in the x0 output
//   returns w0 = 1 if any stored difference is non-zero, else 0
// The 4x4 source pixels are also copied unchanged to the destination
// block (through x6, a copy of the original x2).
.macro zigzag_sub_4x4 f ac
function zigzag_sub_4x4\ac\()_\f\()_neon, export=1
mov x9, #FENC_STRIDE
mov x4, #FDEC_STRIDE
movrel x5, sub4x4_\f
mov x6, x2
// gather the 4x4 source block into v0, one row per 32-bit lane
ld1 {v0.s}[0], [x1], x9
ld1 {v0.s}[1], [x1], x9
ld1 {v0.s}[2], [x1], x9
ld1 {v0.s}[3], [x1], x9
ld1 {v16.16b}, [x5]
// gather the 4x4 destination block into v1
ld1 {v1.s}[0], [x2], x4
ld1 {v1.s}[1], [x2], x4
ld1 {v1.s}[2], [x2], x4
ld1 {v1.s}[3], [x2], x4
// permute both pixel blocks into scan order before subtracting
tbl v2.16b, {v0.16b}, v16.16b
tbl v3.16b, {v1.16b}, v16.16b
st1 {v0.s}[0], [x6], x4
usubl v4.8h, v2.8b, v3.8b
.ifc \ac, ac
// save the first difference, clear it in the output, store it to [x3]
dup h7, v4.h[0]
ins v4.h[0], wzr
fmov w5, s7
strh w5, [x3]
.endif
usubl2 v5.8h, v2.16b, v3.16b
st1 {v0.s}[1], [x6], x4
// unsigned max across all 16 values: zero only if every value is zero
umax v6.8h, v4.8h, v5.8h
umaxv h6, v6.8h
st1 {v0.s}[2], [x6], x4
fmov w7, s6
st1 {v0.s}[3], [x6], x4
cmp w7, #0
st1 {v4.8h,v5.8h}, [x0]
cset w0, ne
ret
endfunc
.endm
// instantiate the four variants
zigzag_sub_4x4 field
zigzag_sub_4x4 field, ac
zigzag_sub_4x4 frame
zigzag_sub_4x4 frame, ac
// void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[16] )
// x0 = level, x1 = dct.
// Only the first eight coefficients (v0) are permuted, using the 16-byte
// scan4x4_field table (defined outside this section); the last eight (v1)
// are stored back unchanged.
function zigzag_scan_4x4_field_neon, export=1
movrel x2, scan4x4_field
ld1 {v0.8h,v1.8h}, [x1]
ld1 {v16.16b}, [x2]
tbl v0.16b, {v0.16b}, v16.16b
st1 {v0.8h,v1.8h}, [x0]
ret
endfunc
// void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[64] )
// x0 = level, x1 = dct.
// Reorders 64 coefficients according to the scan8x8_frame table. tbl can only
// index a window of four registers (32 coefficients), so every output vector
// is looked up in one window and the few entries whose source lies outside
// that window (tbl yields zero for out-of-range indices) are patched
// afterwards with single-lane moves.
function zigzag_scan_8x8_frame_neon, export=1
movrel x2, scan8x8_frame
// v0-v7 = dct[0..63], eight coefficients per register
ld1 {v0.8h,v1.8h}, [x1], #32
ld1 {v2.8h,v3.8h}, [x1], #32
ld1 {v4.8h,v5.8h}, [x1], #32
ld1 {v6.8h,v7.8h}, [x1]
// v16-v23 = byte indices, one vector per output vector
ld1 {v16.16b,v17.16b}, [x2], #32
ld1 {v18.16b,v19.16b}, [x2], #32
ld1 {v20.16b,v21.16b}, [x2], #32
ld1 {v22.16b,v23.16b}, [x2], #32
tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
tbl v27.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v19.16b
tbl v28.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v20.16b
tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v21.16b
tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v22.16b
tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v23.16b
// patch the entries that were outside the tbl window of their vector
mov v25.h[6], v4.h[0]
mov v25.h[7], v5.h[0]
mov v26.h[0], v4.h[1]
mov v27.h[4], v7.h[0]
mov v28.h[7], v4.h[4]
mov v29.h[7], v3.h[6]
mov v30.h[0], v2.h[7]
mov v30.h[1], v3.h[7]
st1 {v24.8h,v25.8h}, [x0], #32
st1 {v26.8h,v27.8h}, [x0], #32
st1 {v28.8h,v29.8h}, [x0], #32
st1 {v30.8h,v31.8h}, [x0]
ret
endfunc
// Byte index table consumed by zigzag_scan_8x8_frame_neon: 8 vectors of 16
// bytes, one vector per output vector of 8 coefficients.
// Z(z) expands to the two byte offsets of 16-bit element z.
// T(x,y) expands to the byte offsets of element x*8+y; it is redefined below
// with a row offset so that indices are relative to the first register of the
// tbl window used for that output vector. Entries that end up outside the
// 64-byte window are the ones fixed up with lane moves in the function.
#define Z(z) 2*(z), 2*(z)+1
#define T(x,y) Z(x*8+y)
const scan8x8_frame, align=5
// output vectors 0-2, window starts at row 0
.byte T(0,0), T(1,0), T(0,1), T(0,2)
.byte T(1,1), T(2,0), T(3,0), T(2,1)
.byte T(1,2), T(0,3), T(0,4), T(1,3)
.byte T(2,2), T(3,1), T(4,0), T(5,0)
.byte T(4,1), T(3,2), T(2,3), T(1,4)
.byte T(0,5), T(0,6), T(1,5), T(2,4)
#undef T
// output vector 3, window starts at row 3
#define T(x,y) Z((x-3)*8+y)
.byte T(3,3), T(4,2), T(5,1), T(6,0)
.byte T(7,0), T(6,1), T(5,2), T(4,3)
#undef T
// output vector 4, window starts at row 0
#define T(x,y) Z((x-0)*8+y)
.byte T(3,4), T(2,5), T(1,6), T(0,7)
.byte T(1,7), T(2,6), T(3,5), T(4,4)
#undef T
// output vectors 5-7, window starts at row 4
#define T(x,y) Z((x-4)*8+y)
.byte T(5,3), T(6,2), T(7,1), T(7,2)
.byte T(6,3), T(5,4), T(4,5), T(3,6)
.byte T(2,7), T(3,7), T(4,6), T(5,5)
.byte T(6,4), T(7,3), T(7,4), T(6,5)
.byte T(5,6), T(4,7), T(5,7), T(6,6)
.byte T(7,5), T(7,6), T(6,7), T(7,7)
endconst
// void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[64] )
// x0 = level, x1 = dct.
// Reorders 64 coefficients. The first seven output vectors come from tbl
// lookups with the scan8x8_field table (7 index vectors), each over a window
// of two to four input registers; the last output vector is assembled with
// two ext instructions instead of a table lookup.
function zigzag_scan_8x8_field_neon, export=1
movrel x2, scan8x8_field
// v0-v7 = dct[0..63]
ld1 {v0.8h,v1.8h}, [x1], #32
ld1 {v2.8h,v3.8h}, [x1], #32
ld1 {v4.8h,v5.8h}, [x1], #32
ld1 {v6.8h,v7.8h}, [x1]
// v16-v22 = byte indices
ld1 {v16.16b,v17.16b}, [x2], #32
ld1 {v18.16b,v19.16b}, [x2], #32
ld1 {v20.16b,v21.16b}, [x2], #32
ld1 {v22.16b}, [x2]
// v31 = v7 rotated by two coefficients: v7.h[2..7], v7.h[0..1]
ext v31.16b, v7.16b, v7.16b, #4
tbl v24.16b, {v0.16b,v1.16b}, v16.16b
tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
tbl v26.16b, {v1.16b,v2.16b,v3.16b,v4.16b}, v18.16b
tbl v27.16b, {v2.16b,v3.16b,v4.16b,v5.16b}, v19.16b
tbl v28.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v20.16b
tbl v29.16b, {v4.16b,v5.16b,v6.16b}, v21.16b
tbl v30.16b, {v5.16b,v6.16b,v7.16b}, v22.16b
// v31 = v6.h[6], v6.h[7], v7.h[2..7]
ext v31.16b, v6.16b, v31.16b, #12
st1 {v24.8h,v25.8h}, [x0], #32
st1 {v26.8h,v27.8h}, [x0], #32
st1 {v28.8h,v29.8h}, [x0], #32
st1 {v30.8h,v31.8h}, [x0]
ret
endfunc
// Generates zigzag_sub_8x8_<f>_neon; f selects the 64-byte sub8x8_<f> index
// table.
//   int zigzag_sub_8x8_<f>( dctcoef *level, const pixel *p_src, pixel *p_dst )
// x0 = level, x1 = p_src (rows FENC_STRIDE apart), x2 = p_dst (rows
// FDEC_STRIDE apart).
// Stores the 64 16-bit differences src - dst of an 8x8 byte block to level in
// table order, copies the source block over the destination block, and
// returns 1 in w0 if any stored difference is non-zero, otherwise 0.
.macro zigzag_sub8x8 f
function zigzag_sub_8x8_\f\()_neon, export=1
movrel x4, sub8x8_\f
mov x5, #FENC_STRIDE
mov x6, #FDEC_STRIDE
// x7 keeps the original p_dst for the copy-back stores
mov x7, x2
// v0-v3 = 8x8 source block, two rows per register
ld1 {v0.d}[0], [x1], x5
ld1 {v0.d}[1], [x1], x5
ld1 {v1.d}[0], [x1], x5
ld1 {v1.d}[1], [x1], x5
ld1 {v2.d}[0], [x1], x5
ld1 {v2.d}[1], [x1], x5
ld1 {v3.d}[0], [x1], x5
ld1 {v3.d}[1], [x1]
// v4-v7 = 8x8 destination block, two rows per register
ld1 {v4.d}[0], [x2], x6
ld1 {v4.d}[1], [x2], x6
ld1 {v5.d}[0], [x2], x6
ld1 {v5.d}[1], [x2], x6
ld1 {v6.d}[0], [x2], x6
ld1 {v6.d}[1], [x2], x6
ld1 {v7.d}[0], [x2], x6
ld1 {v7.d}[1], [x2]
// v16-v19 = 64 byte indices
ld1 {v16.16b,v17.16b}, [x4], #32
ld1 {v18.16b,v19.16b}, [x4], #32
// v24-v27 = source pixels in table order
tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
tbl v27.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v19.16b
// v28-v31 = destination pixels in table order
tbl v28.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v16.16b
tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v17.16b
tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v18.16b
tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v19.16b
// widening differences src - dst (64 x 16 bit)
usubl v4.8h, v24.8b, v28.8b
usubl2 v5.8h, v24.16b, v28.16b
usubl v6.8h, v25.8b, v29.8b
usubl2 v7.8h, v25.16b, v29.16b
usubl v16.8h, v26.8b, v30.8b
usubl2 v17.8h, v26.16b, v30.16b
usubl v18.8h, v27.8b, v31.8b
usubl2 v19.8h, v27.16b, v31.16b
// unsigned max reduction: result is non-zero iff any difference is non-zero
umax v20.8h, v4.8h, v5.8h
umax v21.8h, v6.8h, v7.8h
umax v22.8h, v16.8h, v17.8h
umax v23.8h, v18.8h, v19.8h
umax v20.8h, v20.8h, v21.8h
umax v21.8h, v22.8h, v23.8h
umax v20.8h, v20.8h, v21.8h
umaxv h22, v20.8h
// copy the unpermuted source block over the destination block
st1 {v0.d}[0], [x7], x6
st1 {v0.d}[1], [x7], x6
st1 {v1.d}[0], [x7], x6
st1 {v1.d}[1], [x7], x6
st1 {v2.d}[0], [x7], x6
st1 {v2.d}[1], [x7], x6
st1 {v3.d}[0], [x7], x6
st1 {v3.d}[1], [x7]
// store the 64 differences
st1 {v4.8h,v5.8h}, [x0], #32
st1 {v6.8h,v7.8h}, [x0], #32
st1 {v16.8h,v17.8h}, [x0], #32
st1 {v18.8h,v19.8h}, [x0]
fmov w9, s22
cmp w9, #0
cset w0, ne
ret
endfunc
.endm
zigzag_sub8x8 field
zigzag_sub8x8 frame
// Byte index table consumed by zigzag_scan_8x8_field_neon: 7 vectors of 16
// bytes, one per tbl lookup there. T(x,y) expands (through Z) to the byte
// offsets of 16-bit element x*8+y and is redefined with a row offset so that
// each vector is relative to the first register of its tbl window.
#undef T
#define T(x,y) Z(x*8+y)
const scan8x8_field, align=5
// vectors 0-1, windows start at row 0
.byte T(0,0), T(0,1), T(0,2), T(1,0)
.byte T(1,1), T(0,3), T(0,4), T(1,2)
.byte T(2,0), T(1,3), T(0,5), T(0,6)
.byte T(0,7), T(1,4), T(2,1), T(3,0)
#undef T
// vector 2, window starts at row 1
#define T(x,y) Z((x-1)*8+y)
.byte T(2,2), T(1,5), T(1,6), T(1,7)
.byte T(2,3), T(3,1), T(4,0), T(3,2)
#undef T
// vector 3, window starts at row 2
#define T(x,y) Z((x-2)*8+y)
.byte T(2,4), T(2,5), T(2,6), T(2,7)
.byte T(3,3), T(4,1), T(5,0), T(4,2)
#undef T
// vector 4, window starts at row 3
#define T(x,y) Z((x-3)*8+y)
.byte T(3,4), T(3,5), T(3,6), T(3,7)
.byte T(4,3), T(5,1), T(6,0), T(5,2)
#undef T
// vector 5, window starts at row 4
#define T(x,y) Z((x-4)*8+y)
.byte T(4,4), T(4,5), T(4,6), T(4,7)
.byte T(5,3), T(6,1), T(6,2), T(5,4)
#undef T
// vector 6, window starts at row 5
#define T(x,y) Z((x-5)*8+y)
.byte T(5,5), T(5,6), T(5,7), T(6,3)
.byte T(7,0), T(7,1), T(6,4), T(6,5)
endconst
// From here on T(y,x) is a plain byte index x*8+y (second argument times 8
// plus the first) into a 64-byte pixel block held in four vector registers.
// sub8x8_frame: 64 byte indices used by zigzag_sub_8x8_frame_neon to permute
// both the source and the destination 8x8 block before subtracting.
#undef T
#define T(y,x) x*8+y
const sub8x8_frame, align=5
.byte T(0,0), T(1,0), T(0,1), T(0,2)
.byte T(1,1), T(2,0), T(3,0), T(2,1)
.byte T(1,2), T(0,3), T(0,4), T(1,3)
.byte T(2,2), T(3,1), T(4,0), T(5,0)
.byte T(4,1), T(3,2), T(2,3), T(1,4)
.byte T(0,5), T(0,6), T(1,5), T(2,4)
.byte T(3,3), T(4,2), T(5,1), T(6,0)
.byte T(7,0), T(6,1), T(5,2), T(4,3)
.byte T(3,4), T(2,5), T(1,6), T(0,7)
.byte T(1,7), T(2,6), T(3,5), T(4,4)
.byte T(5,3), T(6,2), T(7,1), T(7,2)
.byte T(6,3), T(5,4), T(4,5), T(3,6)
.byte T(2,7), T(3,7), T(4,6), T(5,5)
.byte T(6,4), T(7,3), T(7,4), T(6,5)
.byte T(5,6), T(4,7), T(5,7), T(6,6)
.byte T(7,5), T(7,6), T(6,7), T(7,7)
endconst
// sub8x8_field: 64 byte indices used by zigzag_sub_8x8_field_neon to permute
// both the source and the destination 8x8 block before subtracting. Uses the
// T(y,x) = x*8+y definition that is in effect at this point of the file.
const sub8x8_field, align=5
.byte T(0,0), T(0,1), T(0,2), T(1,0)
.byte T(1,1), T(0,3), T(0,4), T(1,2)
.byte T(2,0), T(1,3), T(0,5), T(0,6)
.byte T(0,7), T(1,4), T(2,1), T(3,0)
.byte T(2,2), T(1,5), T(1,6), T(1,7)
.byte T(2,3), T(3,1), T(4,0), T(3,2)
.byte T(2,4), T(2,5), T(2,6), T(2,7)
.byte T(3,3), T(4,1), T(5,0), T(4,2)
.byte T(3,4), T(3,5), T(3,6), T(3,7)
.byte T(4,3), T(5,1), T(6,0), T(5,2)
.byte T(4,4), T(4,5), T(4,6), T(4,7)
.byte T(5,3), T(6,1), T(6,2), T(5,4)
.byte T(5,5), T(5,6), T(5,7), T(6,3)
.byte T(7,0), T(7,1), T(6,4), T(6,5)
.byte T(6,6), T(6,7), T(7,2), T(7,3)
.byte T(7,4), T(7,5), T(7,6), T(7,7)
endconst

103
common/aarch64/dct.h Normal file
View File

@@ -0,0 +1,103 @@
/*****************************************************************************
* dct.h: aarch64 transform and zigzag
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
/* C prototypes for the AArch64 assembly transform and zigzag routines.
 * Every public name is first routed through x264_template() (defined
 * elsewhere in the project) and then declared. */
#ifndef X264_AARCH64_DCT_H
#define X264_AARCH64_DCT_H
/* 4x4 DC transforms, operating in place on 16 coefficients */
#define x264_dct4x4dc_neon x264_template(dct4x4dc_neon)
void x264_dct4x4dc_neon( int16_t d[16] );
#define x264_idct4x4dc_neon x264_template(idct4x4dc_neon)
void x264_idct4x4dc_neon( int16_t d[16] );
/* forward transforms of the difference pix1 - pix2, built from 4x4 blocks */
#define x264_sub4x4_dct_neon x264_template(sub4x4_dct_neon)
void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x8_dct_neon x264_template(sub8x8_dct_neon)
void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub16x16_dct_neon x264_template(sub16x16_dct_neon)
void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
/* inverse transforms added onto p_dst, built from 4x4 blocks */
#define x264_add4x4_idct_neon x264_template(add4x4_idct_neon)
void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
#define x264_add8x8_idct_neon x264_template(add8x8_idct_neon)
void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
#define x264_add16x16_idct_neon x264_template(add16x16_idct_neon)
void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
/* DC-only variants */
#define x264_add8x8_idct_dc_neon x264_template(add8x8_idct_dc_neon)
void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
#define x264_add16x16_idct_dc_neon x264_template(add16x16_idct_dc_neon)
void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
#define x264_sub8x8_dct_dc_neon x264_template(sub8x8_dct_dc_neon)
void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x16_dct_dc_neon x264_template(sub8x16_dct_dc_neon)
void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 );
/* 8x8 transform variants (64 coefficients per block) */
#define x264_sub8x8_dct8_neon x264_template(sub8x8_dct8_neon)
void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub16x16_dct8_neon x264_template(sub16x16_dct8_neon)
void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
#define x264_add8x8_idct8_neon x264_template(add8x8_idct8_neon)
void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
#define x264_add16x16_idct8_neon x264_template(add16x16_idct8_neon)
void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );
/* zigzag reordering of coefficients: level = reordered copy of dct */
#define x264_zigzag_scan_4x4_frame_neon x264_template(zigzag_scan_4x4_frame_neon)
void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
#define x264_zigzag_scan_4x4_field_neon x264_template(zigzag_scan_4x4_field_neon)
void x264_zigzag_scan_4x4_field_neon( int16_t level[16], int16_t dct[16] );
#define x264_zigzag_scan_8x8_frame_neon x264_template(zigzag_scan_8x8_frame_neon)
void x264_zigzag_scan_8x8_frame_neon( int16_t level[64], int16_t dct[64] );
#define x264_zigzag_scan_8x8_field_neon x264_template(zigzag_scan_8x8_field_neon)
void x264_zigzag_scan_8x8_field_neon( int16_t level[64], int16_t dct[64] );
/* zigzag-ordered difference p_src - p_dst of a 4x4 block; the assembly also
 * copies p_src over p_dst and returns non-zero if any stored difference is
 * non-zero. The ac variants additionally write the first difference to *dc
 * and store zero in its place. */
#define x264_zigzag_sub_4x4_field_neon x264_template(zigzag_sub_4x4_field_neon)
int x264_zigzag_sub_4x4_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
#define x264_zigzag_sub_4x4ac_field_neon x264_template(zigzag_sub_4x4ac_field_neon)
int x264_zigzag_sub_4x4ac_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
#define x264_zigzag_sub_4x4_frame_neon x264_template(zigzag_sub_4x4_frame_neon)
int x264_zigzag_sub_4x4_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
#define x264_zigzag_sub_4x4ac_frame_neon x264_template(zigzag_sub_4x4ac_frame_neon)
int x264_zigzag_sub_4x4ac_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
/* Zigzag-ordered difference p_src - p_dst of an 8x8 block. The assembly
 * writes 64 coefficients to level (four 32-byte stores), copies p_src over
 * p_dst and returns non-zero if any stored difference is non-zero.
 * The array bound is documentation only (the parameter decays to a pointer),
 * so it states the 64 elements that are really written. */
#define x264_zigzag_sub_8x8_field_neon x264_template(zigzag_sub_8x8_field_neon)
int x264_zigzag_sub_8x8_field_neon( dctcoef level[64], const pixel *p_src, pixel *p_dst );
#define x264_zigzag_sub_8x8_frame_neon x264_template(zigzag_sub_8x8_frame_neon)
int x264_zigzag_sub_8x8_frame_neon( dctcoef level[64], const pixel *p_src, pixel *p_dst );
/* De-interleave 64 coefficients by four into dst and write per-group
 * non-zero flags to nnz. */
#define x264_zigzag_interleave_8x8_cavlc_neon x264_template(zigzag_interleave_8x8_cavlc_neon)
void x264_zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz );
/* SVE / SVE2 variants; same prototypes as the corresponding NEON functions */
#define x264_sub4x4_dct_sve x264_template(sub4x4_dct_sve)
void x264_sub4x4_dct_sve( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
#define x264_add4x4_idct_sve2 x264_template(add4x4_idct_sve2)
void x264_add4x4_idct_sve2( uint8_t *p_dst, int16_t dct[16] );
#define x264_zigzag_interleave_8x8_cavlc_sve x264_template(zigzag_interleave_8x8_cavlc_sve)
void x264_zigzag_interleave_8x8_cavlc_sve( dctcoef *dst, dctcoef *src, uint8_t *nnz );
#endif

View File

@@ -0,0 +1,43 @@
/*****************************************************************************
* deblock-a-common.S: aarch64 deblocking
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: Mans Rullgard <mans@mansr.com>
* Janne Grunau <janne-x264@jannau.net>
* David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
// This file contains the NEON macros that are intended to be used by
// the SVE/SVE2 functions as well
// Common prologue of the non-intra loop filters.
// Inputs:  w2 = alpha, w3 = beta, x4 = address of four bytes
//          (presumably the tc0 values -- confirm against the C prototype).
// Effects: w6 = those four bytes, v24.s[0] = w6; w8 and the flags are
//          clobbered.
// Executes ret, i.e. returns from the function that expanded the macro, when
//   - alpha != 0 and beta == 0, or
//   - the top bit of all four loaded bytes is set (all negative as int8).
// Otherwise execution continues after the macro (label 2).
// NOTE(review): when alpha == 0 the ccmp forces NZCV to 0, so b.eq is not
// taken and that case gets no early return -- confirm this is intended.
.macro h264_loop_filter_start
cmp w2, #0
ldr w6, [x4]
// if alpha != 0: flags = compare(beta, 0), else NZCV = 0
ccmp w3, #0, #0, ne
mov v24.s[0], w6
// fold the sign bits of bytes 1 and 3 into bit 31 (and 0 and 2 into bit 23)
and w8, w6, w6, lsl #16
b.eq 1f
// bit 31 of the result = AND of the sign bits of all four bytes
ands w8, w8, w8, lsl #8
b.ge 2f
1:
ret
2:
.endm

View File

@@ -0,0 +1,98 @@
/*****************************************************************************
* deblock-a-sve.S: aarch64 deblocking
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
#include "deblock-a-common.S"
ENABLE_SVE
// Non-intra chroma filter core, SVE flavour.
// Inputs:  v18 = p1, v16 = p0, v0 = q0, v2 = q1, w2 = alpha, w3 = beta,
//          v24.s[0] = four tc bytes (set up by h264_loop_filter_start).
// Outputs: v16 = filtered p0, v0 = filtered q0 for ALL 16 lanes, and
//          p1 = predicate of the lanes that pass the alpha/beta tests.
// Unlike the NEON version the delta is not masked here; the user of the
// macro is expected to store v16/v0 through predicate p1.
// The z registers named in the cmphi instructions alias the v registers of
// the same number (low 128 bits).
.macro h264_loop_filter_chroma_sve
// p0 = first 16 byte lanes active
ptrue p0.b, vl16
dup v22.16b, w2 // alpha
uxtl v24.8h, v24.8b
uabd v26.16b, v16.16b, v0.16b // abs(p0 - q0)
uxtl v4.8h, v0.8b
uxtl2 v5.8h, v0.16b
uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
usubw v4.8h, v4.8h, v16.8b
usubw2 v5.8h, v5.8h, v16.16b
// uxtl/sli/uxtl/sli: replicate each tc byte into four consecutive lanes
sli v24.8h, v24.8h, #8
shl v4.8h, v4.8h, #2
shl v5.8h, v5.8h, #2
uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
uxtl v24.4s, v24.4h
uaddw v4.8h, v4.8h, v18.8b
uaddw2 v5.8h, v5.8h, v18.16b
// p1 = alpha > abs(p0 - q0)
cmphi p1.b, p0/z, z22.b, z26.b
usubw v4.8h, v4.8h, v2.8b
usubw2 v5.8h, v5.8h, v2.16b
sli v24.4s, v24.4s, #16
dup v22.16b, w3 // beta
// v4 = (((q0 - p0) << 2) + p1 - q1 + 4) >> 3
rshrn v4.8b, v4.8h, #3
rshrn2 v4.16b, v5.8h, #3
// p2 = beta > abs(p1 - p0), p3 = beta > abs(q1 - q0)
cmphi p2.b, p0/z, z22.b, z28.b
cmphi p3.b, p0/z, z22.b, z30.b
// clamp the delta to [-tc, tc]
smin v4.16b, v4.16b, v24.16b
neg v25.16b, v24.16b
and p1.b, p0/z, p1.b, p2.b
smax v4.16b, v4.16b, v25.16b
and p1.b, p0/z, p1.b, p3.b
uxtl v22.8h, v0.8b
uxtl2 v23.8h, v0.16b
uxtl v28.8h, v16.8b
uxtl2 v29.8h, v16.16b
// p0 + delta, q0 - delta, saturated back to unsigned bytes
saddw v28.8h, v28.8h, v4.8b
saddw2 v29.8h, v29.8h, v4.16b
ssubw v22.8h, v22.8h, v4.8b
ssubw2 v23.8h, v23.8h, v4.16b
sqxtun v16.8b, v28.8h
sqxtun v0.8b, v22.8h
sqxtun2 v16.16b, v29.8h
sqxtun2 v0.16b, v23.8h
.endm
// Filters a horizontal chroma edge (pixels above/below the edge), 16 bytes
// wide. x0 = address of the q0 row, x1 = row stride, w2 = alpha, w3 = beta,
// x4 = address of the four tc bytes read by h264_loop_filter_start.
// Only the p0 and q0 rows are written, and only in the lanes selected by
// predicate p1.
function deblock_v_chroma_sve, export=1
h264_loop_filter_start
// rewind to the p1 row
sub x0, x0, x1, lsl #1
// No performance improvement if sve load is used. So, continue using
// NEON load here
ld1 {v18.16b}, [x0], x1
ld1 {v16.16b}, [x0], x1
ld1 {v0.16b}, [x0], x1
ld1 {v2.16b}, [x0]
h264_loop_filter_chroma_sve
// x0 is at the q1 row; step back to the p0 row
sub x0, x0, x1, lsl #1
st1b {z16.b}, p1, [x0]
add x0, x0, x1
st1b {z0.b}, p1, [x0]
ret
endfunc

800
common/aarch64/deblock-a.S Normal file
View File

@@ -0,0 +1,800 @@
/*****************************************************************************
* deblock-a.S: aarch64 deblocking
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: Mans Rullgard <mans@mansr.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
#include "deblock-a-common.S"
// Non-intra luma filter core for 16 pixels across one edge.
// Inputs:  v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2,
//          w2 = alpha, w3 = beta, v24.s[0] = four tc0 bytes (set up by
//          h264_loop_filter_start).
// Outputs: v17 = new p1, v16 = new p0, v0 = new q0, v19 = new q1.
// Clobbers v4 and v20-v24, v28, v30.
.macro h264_loop_filter_luma
dup v22.16b, w2 // alpha
// uxtl/uxtl/sli/sli: replicate each tc0 byte into four consecutive lanes
uxtl v24.8h, v24.8b
uabd v21.16b, v16.16b, v0.16b // abs(p0 - q0)
uxtl v24.4s, v24.4h
uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
sli v24.8h, v24.8h, #8
uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
sli v24.4s, v24.4s, #16
cmhi v21.16b, v22.16b, v21.16b // < alpha
dup v22.16b, w3 // beta
// v23 = lanes with negative tc0; they are removed from the filter mask
cmlt v23.16b, v24.16b, #0
cmhi v28.16b, v22.16b, v28.16b // < beta
cmhi v30.16b, v22.16b, v30.16b // < beta
bic v21.16b, v21.16b, v23.16b
uabd v17.16b, v20.16b, v16.16b // abs(p2 - p0)
and v21.16b, v21.16b, v28.16b
uabd v19.16b, v4.16b, v0.16b // abs(q2 - q0)
cmhi v17.16b, v22.16b, v17.16b // < beta
// v21 = complete filter mask
and v21.16b, v21.16b, v30.16b
cmhi v19.16b, v22.16b, v19.16b // < beta
// v17 / v19 = lanes where p1 / q1 are modified as well
and v17.16b, v17.16b, v21.16b
and v19.16b, v19.16b, v21.16b
// v24 = tc0 in filtered lanes, 0 elsewhere
and v24.16b, v24.16b, v21.16b
urhadd v28.16b, v16.16b, v0.16b
// v21 = tc = tc0 + 1 per set p1/q1 mask (masks are -1, hence the sub)
sub v21.16b, v24.16b, v17.16b
uqadd v23.16b, v18.16b, v24.16b
uhadd v20.16b, v20.16b, v28.16b
sub v21.16b, v21.16b, v19.16b
uhadd v28.16b, v4.16b, v28.16b
// v23 = p1 candidate clamped to [p1 - tc0, p1 + tc0]
umin v23.16b, v23.16b, v20.16b
uqsub v22.16b, v18.16b, v24.16b
uqadd v4.16b, v2.16b, v24.16b
umax v23.16b, v23.16b, v22.16b
uqsub v22.16b, v2.16b, v24.16b
// v28 = q1 candidate clamped to [q1 - tc0, q1 + tc0]
umin v28.16b, v4.16b, v28.16b
uxtl v4.8h, v0.8b
umax v28.16b, v28.16b, v22.16b
uxtl2 v20.8h, v0.16b
// v4 / v20 = ((q0 - p0) << 2) + p1 - q1 in 16 bit
usubw v4.8h, v4.8h, v16.8b
usubw2 v20.8h, v20.8h, v16.16b
shl v4.8h, v4.8h, #2
shl v20.8h, v20.8h, #2
uaddw v4.8h, v4.8h, v18.8b
uaddw2 v20.8h, v20.8h, v18.16b
usubw v4.8h, v4.8h, v2.8b
usubw2 v20.8h, v20.8h, v2.16b
// delta = rounded >> 3, narrowed to bytes
rshrn v4.8b, v4.8h, #3
rshrn2 v4.16b, v20.8h, #3
// select the new p1 / q1 where their mask is set, else keep the old value
bsl v17.16b, v23.16b, v18.16b
bsl v19.16b, v28.16b, v2.16b
// clamp delta to [-tc, tc]
neg v23.16b, v21.16b
uxtl v28.8h, v16.8b
smin v4.16b, v4.16b, v21.16b
uxtl2 v21.8h, v16.16b
smax v4.16b, v4.16b, v23.16b
uxtl v22.8h, v0.8b
uxtl2 v24.8h, v0.16b
// p0 + delta, q0 - delta, saturated back to unsigned bytes
saddw v28.8h, v28.8h, v4.8b
saddw2 v21.8h, v21.8h, v4.16b
ssubw v22.8h, v22.8h, v4.8b
ssubw2 v24.8h, v24.8h, v4.16b
sqxtun v16.8b, v28.8h
sqxtun2 v16.16b, v21.8h
sqxtun v0.8b, v22.8h
sqxtun2 v0.16b, v24.8h
.endm
// Filters a horizontal luma edge, 16 pixels wide.
// x0 = address of the q0 row, x1 = row stride, w2 = alpha, w3 = beta,
// x4 = address of the four tc0 bytes read by h264_loop_filter_start.
// Reads rows p2..q2 and rewrites rows p1, p0, q0, q1.
function deblock_v_luma_neon, export=1
h264_loop_filter_start
// q0, q1, q2
ld1 {v0.16b}, [x0], x1
ld1 {v2.16b}, [x0], x1
ld1 {v4.16b}, [x0], x1
// from three rows below the edge to three rows above it
sub x0, x0, x1, lsl #2
sub x0, x0, x1, lsl #1
// p2, p1, p0
ld1 {v20.16b}, [x0], x1
ld1 {v18.16b}, [x0], x1
ld1 {v16.16b}, [x0], x1
h264_loop_filter_luma
// x0 is back at the q0 row; step up to the p1 row
sub x0, x0, x1, lsl #1
st1 {v17.16b}, [x0], x1
st1 {v16.16b}, [x0], x1
st1 {v0.16b}, [x0], x1
st1 {v19.16b}, [x0]
ret
endfunc
// Filters a vertical luma edge over 16 rows.
// x0 = address of the first pixel right of the edge in row 0, x1 = row
// stride, w2 = alpha, w3 = beta, x4 = address of the four tc0 bytes.
// Loads 8 bytes (4 on each side of the edge) from each of 16 rows, transposes
// them so that each register holds one pixel column, filters, transposes the
// four modified columns back and writes 4 bytes per row starting 2 bytes left
// of the edge. The transpose macros are defined outside this section.
function deblock_h_luma_neon, export=1
h264_loop_filter_start
sub x0, x0, #4
// rows 0-7 into the low halves
ld1 {v6.8b}, [x0], x1
ld1 {v20.8b}, [x0], x1
ld1 {v18.8b}, [x0], x1
ld1 {v16.8b}, [x0], x1
ld1 {v0.8b}, [x0], x1
ld1 {v2.8b}, [x0], x1
ld1 {v4.8b}, [x0], x1
ld1 {v26.8b}, [x0], x1
// rows 8-15 into the high halves
ld1 {v6.d}[1], [x0], x1
ld1 {v20.d}[1], [x0], x1
ld1 {v18.d}[1], [x0], x1
ld1 {v16.d}[1], [x0], x1
ld1 {v0.d}[1], [x0], x1
ld1 {v2.d}[1], [x0], x1
ld1 {v4.d}[1], [x0], x1
ld1 {v26.d}[1], [x0], x1
transpose_8x16.b v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
h264_loop_filter_luma
// v17, v16, v0, v19 = new p1, p0, q0, q1 columns
transpose_4x16.b v17, v16, v0, v19, v21, v23, v25, v27
// rewind the 16 rows and point at the p1 column
sub x0, x0, x1, lsl #4
add x0, x0, #2
st1 {v17.s}[0], [x0], x1
st1 {v16.s}[0], [x0], x1
st1 {v0.s}[0], [x0], x1
st1 {v19.s}[0], [x0], x1
st1 {v17.s}[1], [x0], x1
st1 {v16.s}[1], [x0], x1
st1 {v0.s}[1], [x0], x1
st1 {v19.s}[1], [x0], x1
st1 {v17.s}[2], [x0], x1
st1 {v16.s}[2], [x0], x1
st1 {v0.s}[2], [x0], x1
st1 {v19.s}[2], [x0], x1
st1 {v17.s}[3], [x0], x1
st1 {v16.s}[3], [x0], x1
st1 {v0.s}[3], [x0], x1
st1 {v19.s}[3], [x0], x1
ret
endfunc
// Common prologue of the intra loop filters.
// Inputs:  w2 = alpha, w3 = beta.
// Returns from the enclosing function when alpha and beta are both zero;
// otherwise broadcasts alpha to every byte of v30 and beta to every byte of
// v31. Clobbers w4 and the flags.
.macro h264_loop_filter_start_intra
orr w4, w2, w3
cmp w4, #0
b.ne 1f
ret
1:
dup v30.16b, w2 // alpha
dup v31.16b, w3 // beta
.endm
// Intra luma filter core for 16 pixels across one edge.
// Inputs:  v4 = p3, v5 = p2, v6 = p1, v7 = p0, v0 = q0, v1 = q1, v2 = q2,
//          v3 = q3, v30 = alpha (all bytes), v31 = beta (all bytes).
// Outputs: v5, v6, v7, v0, v1, v2 updated in place.
// Branches to local label 9 (which the user of the macro must provide) when
// no lane passes the basic alpha/beta tests. Clobbers v16-v31 and x4.
// Condition names used in the comments below:
//   if_1 = |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta
//   if_2 = |p0-q0| < (alpha >> 2) + 2
//   if_3 = |p2-p0| < beta,  if_4 = |q2-q0| < beta
.macro h264_loop_filter_luma_intra
uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0)
uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0)
uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0)
cmhi v19.16b, v30.16b, v16.16b // < alpha
cmhi v17.16b, v31.16b, v17.16b // < beta
cmhi v18.16b, v31.16b, v18.16b // < beta
movi v29.16b, #2
ushr v30.16b, v30.16b, #2 // alpha >> 2
add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2
cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2
// v19 = if_1
and v19.16b, v19.16b, v17.16b
and v19.16b, v19.16b, v18.16b
// narrow the 16-byte mask to 64 bits so it can be tested in a GPR
shrn v20.8b, v19.8h, #4
mov x4, v20.d[0]
cbz x4, 9f
// v20/v21 = 2*p1 + p0 + q1, v22/v23 = 2*q1 + q0 + p1 (16 bit)
ushll v20.8h, v6.8b, #1
ushll v22.8h, v1.8b, #1
ushll2 v21.8h, v6.16b, #1
ushll2 v23.8h, v1.16b, #1
uaddw v20.8h, v20.8h, v7.8b
uaddw v22.8h, v22.8h, v0.8b
uaddw2 v21.8h, v21.8h, v7.16b
uaddw2 v23.8h, v23.8h, v0.16b
uaddw v20.8h, v20.8h, v1.8b
uaddw v22.8h, v22.8h, v6.8b
uaddw2 v21.8h, v21.8h, v1.16b
uaddw2 v23.8h, v23.8h, v6.16b
rshrn v24.8b, v20.8h, #2 // p0'_1
rshrn v25.8b, v22.8h, #2 // q0'_1
rshrn2 v24.16b, v21.8h, #2 // p0'_1
rshrn2 v25.16b, v23.8h, #2 // q0'_1
uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0)
uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0)
cmhi v17.16b, v31.16b, v17.16b // < beta
cmhi v18.16b, v31.16b, v18.16b // < beta
and v17.16b, v16.16b, v17.16b // if_2 && if_3
and v18.16b, v16.16b, v18.16b // if_2 && if_4
not v30.16b, v17.16b
not v31.16b, v18.16b
and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3)
and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4)
and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3
and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4
//calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4
// v26/v27 = p2 + p0 + q0
uaddl v26.8h, v5.8b, v7.8b
uaddl2 v27.8h, v5.16b, v7.16b
uaddw v26.8h, v26.8h, v0.8b
uaddw2 v27.8h, v27.8h, v0.16b
// v20/v21 = p2 + 2*p1 + 2*p0 + 2*q0 + q1
add v20.8h, v20.8h, v26.8h
add v21.8h, v21.8h, v27.8h
uaddw v20.8h, v20.8h, v0.8b
uaddw2 v21.8h, v21.8h, v0.16b
rshrn v20.8b, v20.8h, #3 // p0'_2
rshrn2 v20.16b, v21.8h, #3 // p0'_2
// v26/v27 = p2 + p1 + p0 + q0
uaddw v26.8h, v26.8h, v6.8b
uaddw2 v27.8h, v27.8h, v6.16b
rshrn v21.8b, v26.8h, #2 // p1'_2
rshrn2 v21.16b, v27.8h, #2 // p1'_2
// v28/v29 = 2*(p3 + p2) + p2 + p1 + p0 + q0
uaddl v28.8h, v4.8b, v5.8b
uaddl2 v29.8h, v4.16b, v5.16b
shl v28.8h, v28.8h, #1
shl v29.8h, v29.8h, #1
add v28.8h, v28.8h, v26.8h
add v29.8h, v29.8h, v27.8h
rshrn v19.8b, v28.8h, #3 // p2'_2
rshrn2 v19.16b, v29.8h, #3 // p2'_2
//calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3
// same computation mirrored for the q side
uaddl v26.8h, v2.8b, v0.8b
uaddl2 v27.8h, v2.16b, v0.16b
uaddw v26.8h, v26.8h, v7.8b
uaddw2 v27.8h, v27.8h, v7.16b
add v22.8h, v22.8h, v26.8h
add v23.8h, v23.8h, v27.8h
uaddw v22.8h, v22.8h, v7.8b
uaddw2 v23.8h, v23.8h, v7.16b
rshrn v22.8b, v22.8h, #3 // q0'_2
rshrn2 v22.16b, v23.8h, #3 // q0'_2
uaddw v26.8h, v26.8h, v1.8b
uaddw2 v27.8h, v27.8h, v1.16b
rshrn v23.8b, v26.8h, #2 // q1'_2
rshrn2 v23.16b, v27.8h, #2 // q1'_2
uaddl v28.8h, v2.8b, v3.8b
uaddl2 v29.8h, v2.16b, v3.16b
shl v28.8h, v28.8h, #1
shl v29.8h, v29.8h, #1
add v28.8h, v28.8h, v26.8h
add v29.8h, v29.8h, v27.8h
rshrn v26.8b, v28.8h, #3 // q2'_2
rshrn2 v26.16b, v29.8h, #3 // q2'_2
// insert the results under their masks; unselected lanes keep their value
bit v7.16b, v24.16b, v30.16b // p0'_1
bit v0.16b, v25.16b, v31.16b // q0'_1
bit v7.16b, v20.16b, v17.16b // p0'_2
bit v6.16b, v21.16b, v17.16b // p1'_2
bit v5.16b, v19.16b, v17.16b // p2'_2
bit v0.16b, v22.16b, v18.16b // q0'_2
bit v1.16b, v23.16b, v18.16b // q1'_2
bit v2.16b, v26.16b, v18.16b // q2'_2
.endm
// Intra filter of a horizontal luma edge, 16 pixels wide.
// x0 = address of the q0 row, x1 = row stride, w2 = alpha, w3 = beta.
// Reads rows p3..q3 and rewrites rows p2..q2. Label 9 is the early-exit
// target used inside h264_loop_filter_luma_intra.
function deblock_v_luma_intra_neon, export=1
h264_loop_filter_start_intra
ld1 {v0.16b}, [x0], x1 // q0
ld1 {v1.16b}, [x0], x1 // q1
ld1 {v2.16b}, [x0], x1 // q2
ld1 {v3.16b}, [x0], x1 // q3
// from four rows below the edge to four rows above it
sub x0, x0, x1, lsl #3
ld1 {v4.16b}, [x0], x1 // p3
ld1 {v5.16b}, [x0], x1 // p2
ld1 {v6.16b}, [x0], x1 // p1
ld1 {v7.16b}, [x0] // p0
h264_loop_filter_luma_intra
// x0 is at the p0 row; step up to the p2 row
sub x0, x0, x1, lsl #1
st1 {v5.16b}, [x0], x1 // p2
st1 {v6.16b}, [x0], x1 // p1
st1 {v7.16b}, [x0], x1 // p0
st1 {v0.16b}, [x0], x1 // q0
st1 {v1.16b}, [x0], x1 // q1
st1 {v2.16b}, [x0] // q2
9:
ret
endfunc
// Intra filter of a vertical luma edge over 16 rows.
// x0 = address of the first pixel right of the edge in row 0, x1 = row
// stride, w2 = alpha, w3 = beta.
// Loads 8 bytes (4 on each side of the edge) from each of 16 rows, transposes
// so that v4..v7 = p3..p0 and v0..v3 = q0..q3 columns, filters, transposes
// back and rewrites all 8 bytes of every row. Label 9 is the early-exit
// target used inside h264_loop_filter_luma_intra. The transpose macro is
// defined outside this section.
function deblock_h_luma_intra_neon, export=1
h264_loop_filter_start_intra
sub x0, x0, #4
// rows 0-7 into the low halves
ld1 {v4.8b}, [x0], x1
ld1 {v5.8b}, [x0], x1
ld1 {v6.8b}, [x0], x1
ld1 {v7.8b}, [x0], x1
ld1 {v0.8b}, [x0], x1
ld1 {v1.8b}, [x0], x1
ld1 {v2.8b}, [x0], x1
ld1 {v3.8b}, [x0], x1
// rows 8-15 into the high halves
ld1 {v4.d}[1], [x0], x1
ld1 {v5.d}[1], [x0], x1
ld1 {v6.d}[1], [x0], x1
ld1 {v7.d}[1], [x0], x1
ld1 {v0.d}[1], [x0], x1
ld1 {v1.d}[1], [x0], x1
ld1 {v2.d}[1], [x0], x1
ld1 {v3.d}[1], [x0], x1
transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
h264_loop_filter_luma_intra
transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
// rewind the 16 rows
sub x0, x0, x1, lsl #4
st1 {v4.8b}, [x0], x1
st1 {v5.8b}, [x0], x1
st1 {v6.8b}, [x0], x1
st1 {v7.8b}, [x0], x1
st1 {v0.8b}, [x0], x1
st1 {v1.8b}, [x0], x1
st1 {v2.8b}, [x0], x1
st1 {v3.8b}, [x0], x1
st1 {v4.d}[1], [x0], x1
st1 {v5.d}[1], [x0], x1
st1 {v6.d}[1], [x0], x1
st1 {v7.d}[1], [x0], x1
st1 {v0.d}[1], [x0], x1
st1 {v1.d}[1], [x0], x1
st1 {v2.d}[1], [x0], x1
st1 {v3.d}[1], [x0], x1
9:
ret
endfunc
// Non-intra chroma filter core for 16 bytes across one edge.
// Inputs:  v18 = p1, v16 = p0, v0 = q0, v2 = q1, w2 = alpha, w3 = beta,
//          v24.s[0] = four tc bytes (set up by h264_loop_filter_start).
// Outputs: v16 = new p0, v0 = new q0 (unchanged in lanes that fail the
//          alpha/beta tests, because the delta is masked to zero there).
// Clobbers v4, v5, v22-v26, v28-v30.
.macro h264_loop_filter_chroma
dup v22.16b, w2 // alpha
// uxtl/sli/uxtl/sli: replicate each tc byte into four consecutive lanes
uxtl v24.8h, v24.8b
uabd v26.16b, v16.16b, v0.16b // abs(p0 - q0)
uxtl v4.8h, v0.8b
uxtl2 v5.8h, v0.16b
uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
usubw v4.8h, v4.8h, v16.8b
usubw2 v5.8h, v5.8h, v16.16b
sli v24.8h, v24.8h, #8
shl v4.8h, v4.8h, #2
shl v5.8h, v5.8h, #2
uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
uxtl v24.4s, v24.4h
uaddw v4.8h, v4.8h, v18.8b
uaddw2 v5.8h, v5.8h, v18.16b
cmhi v26.16b, v22.16b, v26.16b // < alpha
usubw v4.8h, v4.8h, v2.8b
usubw2 v5.8h, v5.8h, v2.16b
sli v24.4s, v24.4s, #16
dup v22.16b, w3 // beta
// v4 = (((q0 - p0) << 2) + p1 - q1 + 4) >> 3
rshrn v4.8b, v4.8h, #3
rshrn2 v4.16b, v5.8h, #3
cmhi v28.16b, v22.16b, v28.16b // < beta
cmhi v30.16b, v22.16b, v30.16b // < beta
// clamp the delta to [-tc, tc]
smin v4.16b, v4.16b, v24.16b
neg v25.16b, v24.16b
and v26.16b, v26.16b, v28.16b
smax v4.16b, v4.16b, v25.16b
// v26 = complete filter mask
and v26.16b, v26.16b, v30.16b
uxtl v22.8h, v0.8b
uxtl2 v23.8h, v0.16b
// zero the delta in lanes that are not filtered
and v4.16b, v4.16b, v26.16b
uxtl v28.8h, v16.8b
uxtl2 v29.8h, v16.16b
// p0 + delta, q0 - delta, saturated back to unsigned bytes
saddw v28.8h, v28.8h, v4.8b
saddw2 v29.8h, v29.8h, v4.16b
ssubw v22.8h, v22.8h, v4.8b
ssubw2 v23.8h, v23.8h, v4.16b
sqxtun v16.8b, v28.8h
sqxtun v0.8b, v22.8h
sqxtun2 v16.16b, v29.8h
sqxtun2 v0.16b, v23.8h
.endm
// Filters a horizontal chroma edge, 16 bytes wide.
// x0 = address of the q0 row, x1 = row stride, w2 = alpha, w3 = beta,
// x4 = address of the four tc bytes read by h264_loop_filter_start.
// Reads rows p1..q1 and rewrites rows p0 and q0.
function deblock_v_chroma_neon, export=1
h264_loop_filter_start
// rewind to the p1 row
sub x0, x0, x1, lsl #1
ld1 {v18.16b}, [x0], x1
ld1 {v16.16b}, [x0], x1
ld1 {v0.16b}, [x0], x1
ld1 {v2.16b}, [x0]
h264_loop_filter_chroma
// x0 is at the q1 row; step back to the p0 row
sub x0, x0, x1, lsl #1
st1 {v16.16b}, [x0], x1
st1 {v0.16b}, [x0], x1
ret
endfunc
// Filters a vertical chroma edge over 8 rows.
// x0 = address of the first byte right of the edge in row 0, x1 = row stride,
// w2 = alpha, w3 = beta, x4 = address of the four tc bytes.
// Each row contributes 8 bytes (4 on each side of the edge), handled as four
// 16-bit units by transpose4x8.h (defined outside this section; presumably
// each unit is one interleaved chroma sample pair -- confirm).
// The local label deblock_h_chroma is a second entry point used by
// deblock_h_chroma_422_neon; it expects x0 already moved 4 bytes left and
// v24.s[0] holding the tc bytes.
function deblock_h_chroma_neon, export=1
h264_loop_filter_start
sub x0, x0, #4
deblock_h_chroma:
// rows 0-3 into the low halves, rows 4-7 into the high halves
ld1 {v18.d}[0], [x0], x1
ld1 {v16.d}[0], [x0], x1
ld1 {v0.d}[0], [x0], x1
ld1 {v2.d}[0], [x0], x1
ld1 {v18.d}[1], [x0], x1
ld1 {v16.d}[1], [x0], x1
ld1 {v0.d}[1], [x0], x1
ld1 {v2.d}[1], [x0], x1
// afterwards v18 = p1, v16 = p0, v0 = q0, v2 = q1 as the filter expects
transpose4x8.h v18, v16, v0, v2, v28, v29, v30, v31
h264_loop_filter_chroma
transpose4x8.h v18, v16, v0, v2, v28, v29, v30, v31
// rewind the 8 rows and write every row back in full
sub x0, x0, x1, lsl #3
st1 {v18.d}[0], [x0], x1
st1 {v16.d}[0], [x0], x1
st1 {v0.d}[0], [x0], x1
st1 {v2.d}[0], [x0], x1
st1 {v18.d}[1], [x0], x1
st1 {v16.d}[1], [x0], x1
st1 {v0.d}[1], [x0], x1
st1 {v2.d}[1], [x0], x1
ret
endfunc
// Filters a vertical chroma edge over 16 rows by running the 8-row body at
// deblock_h_chroma twice with a doubled stride: first on rows 0, 2, 4, ...
// and then on rows 1, 3, 5, ...
// x0 = address of the first byte right of the edge in row 0, x1 = row stride,
// w2 = alpha, w3 = beta, x4 = address of the four tc bytes.
function deblock_h_chroma_422_neon, export=1
// x5 = start of row 1, kept for the second pass
add x5, x0, x1
sub x0, x0, #4
add x1, x1, x1
h264_loop_filter_start
// bl overwrites the link register, so keep the return address in x7
mov x7, x30
bl deblock_h_chroma
mov x30, x7
sub x0, x5, #4
// the filter clobbered v24; reload the tc bytes still held in w6
mov v24.s[0], w6
// tail call: the ret inside deblock_h_chroma returns to our caller
b deblock_h_chroma
endfunc
// Non-intra chroma filter core for 8 bytes across one edge.
// Inputs:  v18 = p1, v16 = p0, v17 = q0, v19 = q1 (low 8 bytes each),
//          w2 = alpha, w3 = beta, v24.s[0] = four tc bytes.
// Outputs: v16 = new p0, v17 = new q0 (low 8 bytes).
// Clobbers v4, v22, v24-v26, v28, v30.
.macro h264_loop_filter_chroma8
dup v22.8b, w2 // alpha
// uxtl + sli: replicate each tc byte into two consecutive lanes
uxtl v24.8h, v24.8b
uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0)
uxtl v4.8h, v17.8b
uabd v28.8b, v18.8b, v16.8b // abs(p1 - p0)
usubw v4.8h, v4.8h, v16.8b
sli v24.8h, v24.8h, #8
shl v4.8h, v4.8h, #2
uabd v30.8b, v19.8b, v17.8b // abs(q1 - q0)
uaddw v4.8h, v4.8h, v18.8b
cmhi v26.8b, v22.8b, v26.8b // < alpha
usubw v4.8h, v4.8h, v19.8b
dup v22.8b, w3 // beta
// v4 = (((q0 - p0) << 2) + p1 - q1 + 4) >> 3
rshrn v4.8b, v4.8h, #3
cmhi v28.8b, v22.8b, v28.8b // < beta
cmhi v30.8b, v22.8b, v30.8b // < beta
// clamp the delta to [-tc, tc]
smin v4.8b, v4.8b, v24.8b
neg v25.8b, v24.8b
and v26.8b, v26.8b, v28.8b
smax v4.8b, v4.8b, v25.8b
and v26.8b, v26.8b, v30.8b
uxtl v22.8h, v17.8b
// zero the delta in lanes that are not filtered
and v4.8b, v4.8b, v26.8b
uxtl v28.8h, v16.8b
// p0 + delta, q0 - delta, saturated back to unsigned bytes
saddw v28.8h, v28.8h, v4.8b
ssubw v22.8h, v22.8h, v4.8b
sqxtun v16.8b, v28.8h
sqxtun v17.8b, v22.8h
.endm
// Filters a vertical chroma edge over 4 rows.
// x0 = address of the first byte right of the edge in row 0, x1 = row stride,
// w2 = alpha, w3 = beta, x4 = address of the four tc bytes.
// Loads 8 bytes per row starting 4 bytes left of the edge, and writes back
// only the 4 bytes around the edge (p0 and q0 units) per row.
// transpose4x4.h is defined outside this section.
function deblock_h_chroma_mbaff_neon, export=1
h264_loop_filter_start
// x4 = load pointer (edge - 4), x0 = store pointer (edge - 2)
sub x4, x0, #4
sub x0, x0, #2
ld1 {v18.8b}, [x4], x1
ld1 {v16.8b}, [x4], x1
ld1 {v17.8b}, [x4], x1
ld1 {v19.8b}, [x4]
// afterwards v18 = p1, v16 = p0, v17 = q0, v19 = q1 as the filter expects
transpose4x4.h v18, v16, v17, v19, v28, v29, v30, v31
h264_loop_filter_chroma8
// st2 interleaves one 16-bit lane of p0 and q0 per row
st2 {v16.h,v17.h}[0], [x0], x1
st2 {v16.h,v17.h}[1], [x0], x1
st2 {v16.h,v17.h}[2], [x0], x1
st2 {v16.h,v17.h}[3], [x0]
ret
endfunc
// Intra chroma filter core.
// Inputs:  v18 = p1, v16 = p0, v17 = q0, v19 = q1, v30 = alpha (all bytes),
//          v31 = beta (all bytes).
// Outputs: v16 = new p0 = (2*p1 + p0 + q1 + 2) >> 2 and
//          v17 = new q0 = (2*q1 + q0 + p1 + 2) >> 2 in the lanes where
//          |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta; other lanes
//          keep their value.
// width selects how many byte lanes get the arithmetic: with the default 16
// both register halves are computed, with any other value (callers pass 8)
// only the low 8 bytes are.
// Clobbers v4-v7 and v20-v28 (subset for width 8).
.macro h264_loop_filter_chroma_intra width=16
uabd v26.16b, v16.16b, v17.16b // abs(p0 - q0)
uabd v27.16b, v18.16b, v16.16b // abs(p1 - p0)
uabd v28.16b, v19.16b, v17.16b // abs(q1 - q0)
cmhi v26.16b, v30.16b, v26.16b // < alpha
cmhi v27.16b, v31.16b, v27.16b // < beta
cmhi v28.16b, v31.16b, v28.16b // < beta
// v26 = complete filter mask
and v26.16b, v26.16b, v27.16b
and v26.16b, v26.16b, v28.16b
// 2*p1 and 2*q1 widened to 16 bit
ushll v4.8h, v18.8b, #1
ushll v6.8h, v19.8b, #1
.ifc \width, 16
ushll2 v5.8h, v18.16b, #1
ushll2 v7.8h, v19.16b, #1
uaddl2 v21.8h, v16.16b, v19.16b
uaddl2 v23.8h, v17.16b, v18.16b
.endif
// p0 + q1 and q0 + p1
uaddl v20.8h, v16.8b, v19.8b
uaddl v22.8h, v17.8b, v18.8b
add v20.8h, v20.8h, v4.8h // mlal?
add v22.8h, v22.8h, v6.8h
.ifc \width, 16
add v21.8h, v21.8h, v5.8h
add v23.8h, v23.8h, v7.8h
.endif
// rounded >> 2 with unsigned saturation
uqrshrn v24.8b, v20.8h, #2
uqrshrn v25.8b, v22.8h, #2
.ifc \width, 16
uqrshrn2 v24.16b, v21.8h, #2
uqrshrn2 v25.16b, v23.8h, #2
.endif
// insert the results only where the mask is set
bit v16.16b, v24.16b, v26.16b
bit v17.16b, v25.16b, v26.16b
.endm
// Intra filter of a horizontal chroma edge, 16 bytes wide.
// x0 = address of the q0 row, x1 = row stride, w2 = alpha, w3 = beta.
// Reads rows p1..q1 and rewrites rows p0 and q0.
function deblock_v_chroma_intra_neon, export=1
h264_loop_filter_start_intra
// rewind to the p1 row
sub x0, x0, x1, lsl #1
ld1 {v18.16b}, [x0], x1
ld1 {v16.16b}, [x0], x1
ld1 {v17.16b}, [x0], x1
ld1 {v19.16b}, [x0]
h264_loop_filter_chroma_intra
// x0 is at the q1 row; step back to the p0 row
sub x0, x0, x1, lsl #1
st1 {v16.16b}, [x0], x1
st1 {v17.16b}, [x0], x1
ret
endfunc
// Intra filter of a vertical chroma edge over 4 rows.
// x0 = address of the first byte right of the edge in row 0, x1 = row stride,
// w2 = alpha, w3 = beta.
// Loads 8 bytes per row starting 4 bytes left of the edge and writes back the
// 4 bytes around the edge (p0 and q0 units) per row.
// transpose4x4.h is defined outside this section.
function deblock_h_chroma_intra_mbaff_neon, export=1
h264_loop_filter_start_intra
// x4 = load pointer (edge - 4), x0 = store pointer (edge - 2)
sub x4, x0, #4
sub x0, x0, #2
ld1 {v18.8b}, [x4], x1
ld1 {v16.8b}, [x4], x1
ld1 {v17.8b}, [x4], x1
ld1 {v19.8b}, [x4], x1
// afterwards v18 = p1, v16 = p0, v17 = q0, v19 = q1 as the filter expects
transpose4x4.h v18, v16, v17, v19, v26, v27, v28, v29
// only the low 8 bytes hold data, so use the 8-lane arithmetic
h264_loop_filter_chroma_intra width=8
st2 {v16.h,v17.h}[0], [x0], x1
st2 {v16.h,v17.h}[1], [x0], x1
st2 {v16.h,v17.h}[2], [x0], x1
st2 {v16.h,v17.h}[3], [x0], x1
ret
endfunc
// void deblock_h_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
// (prototype as declared in deblock.h). Vertical-edge intra chroma filter
// over 8 rows. x0 = pix, x1 = stride.
function deblock_h_chroma_intra_neon, export=1
h264_loop_filter_start_intra
// x4 = load pointer (pix - 4), x0 = store pointer (pix - 2)
sub x4, x0, #4
sub x0, x0, #2
// Rows 0-3 fill the low 64 bits, rows 4-7 the high 64 bits of v18/v16/v17/v19
ld1 {v18.d}[0], [x4], x1
ld1 {v16.d}[0], [x4], x1
ld1 {v17.d}[0], [x4], x1
ld1 {v19.d}[0], [x4], x1
ld1 {v18.d}[1], [x4], x1
ld1 {v16.d}[1], [x4], x1
ld1 {v17.d}[1], [x4], x1
ld1 {v19.d}[1], [x4], x1
// Transpose 16-bit units so that v18/v16/v17/v19 hold p1/p0/q0/q1
transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29
h264_loop_filter_chroma_intra
// Write back one p0 unit and one q0 unit (4 bytes) per row, 8 rows
st2 {v16.h,v17.h}[0], [x0], x1
st2 {v16.h,v17.h}[1], [x0], x1
st2 {v16.h,v17.h}[2], [x0], x1
st2 {v16.h,v17.h}[3], [x0], x1
st2 {v16.h,v17.h}[4], [x0], x1
st2 {v16.h,v17.h}[5], [x0], x1
st2 {v16.h,v17.h}[6], [x0], x1
st2 {v16.h,v17.h}[7], [x0], x1
ret
endfunc
// void deblock_h_chroma_422_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
// (prototype as declared in deblock.h). Same processing as
// deblock_h_chroma_intra_neon, but applied to 16 rows as two consecutive
// batches of 8. x4 (load pointer) and x0 (store pointer) keep advancing by
// stride through both batches because every access post-increments.
function deblock_h_chroma_422_intra_neon, export=1
h264_loop_filter_start_intra
// x4 = load pointer (pix - 4), x0 = store pointer (pix - 2)
sub x4, x0, #4
sub x0, x0, #2
// ---- rows 0-7 ----
ld1 {v18.d}[0], [x4], x1
ld1 {v16.d}[0], [x4], x1
ld1 {v17.d}[0], [x4], x1
ld1 {v19.d}[0], [x4], x1
ld1 {v18.d}[1], [x4], x1
ld1 {v16.d}[1], [x4], x1
ld1 {v17.d}[1], [x4], x1
ld1 {v19.d}[1], [x4], x1
transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29
h264_loop_filter_chroma_intra
st2 {v16.h,v17.h}[0], [x0], x1
st2 {v16.h,v17.h}[1], [x0], x1
st2 {v16.h,v17.h}[2], [x0], x1
st2 {v16.h,v17.h}[3], [x0], x1
st2 {v16.h,v17.h}[4], [x0], x1
st2 {v16.h,v17.h}[5], [x0], x1
st2 {v16.h,v17.h}[6], [x0], x1
st2 {v16.h,v17.h}[7], [x0], x1
// ---- rows 8-15 ----
ld1 {v18.d}[0], [x4], x1
ld1 {v16.d}[0], [x4], x1
ld1 {v17.d}[0], [x4], x1
ld1 {v19.d}[0], [x4], x1
ld1 {v18.d}[1], [x4], x1
ld1 {v16.d}[1], [x4], x1
ld1 {v17.d}[1], [x4], x1
ld1 {v19.d}[1], [x4], x1
transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29
h264_loop_filter_chroma_intra
st2 {v16.h,v17.h}[0], [x0], x1
st2 {v16.h,v17.h}[1], [x0], x1
st2 {v16.h,v17.h}[2], [x0], x1
st2 {v16.h,v17.h}[3], [x0], x1
st2 {v16.h,v17.h}[4], [x0], x1
st2 {v16.h,v17.h}[5], [x0], x1
st2 {v16.h,v17.h}[6], [x0], x1
st2 {v16.h,v17.h}[7], [x0], x1
ret
endfunc
// void deblock_strength( uint8_t nnz[X264_SCAN8_SIZE],
// int8_t ref[2][X264_SCAN8_LUMA_SIZE],
// int16_t mv[2][X264_SCAN8_LUMA_SIZE][2],
// uint8_t bs[2][8][4], int mvy_limit,
// int bframe )
//
// Register use (AAPCS64 argument order of the prototype above):
//   x0 = nnz, x1 = ref, x2 = mv, x3 = bs, w4 = mvy_limit, w5 = bframe
// v4 / v5 accumulate "reference or motion differs" flags for the two edge
// directions; at the end they are merged with the nnz flags and stored as
// bs[0] (v4, after a 4x4 byte transpose) and bs[1] (v5). Resulting strength
// per byte: 2 if either neighbouring block has non-zero coefficients, else 1
// if ref/mv differ, else 0.
function deblock_strength_neon, export=1
movi v4.16b, #0
lsl w4, w4, #8
// x3 -> bs[1] (32 bytes past bs[0]); x6 = -32 steps back to bs[0] after the store
add x3, x3, #32
// w4 = (mvy_limit << 8) - 253 = ((mvy_limit - 1) << 8) + 3, so every 16-bit
// lane of v6 holds the byte pair { 3, mvy_limit - 1 }. It is subtracted with
// saturation from byte pairs of absolute mv differences: a non-zero result
// means the first component differs by >= 4 or the second by >= mvy_limit
// (assuming mv[..][0] is the horizontal component - confirm against callers).
sub w4, w4, #(1<<8)-3
movi v5.16b, #0
dup v6.8h, w4
mov x6, #-32
// One pass per reference list; each pass advances x1 by 40 bytes and x2 by
// 160 bytes in total.
bframe:
// load bytes ref
add x2, x2, #16
ld1 {v31.d}[1], [x1], #8
ld1 {v1.16b}, [x1], #16
movi v0.16b, #0
ld1 {v2.16b}, [x1], #16
// Build copies shifted by one byte (left neighbours) and regroup the rows
ext v3.16b, v0.16b, v1.16b, #15
ext v0.16b, v0.16b, v2.16b, #15
unzip v21.4s, v22.4s, v1.4s, v2.4s
unzip v23.4s, v20.4s, v3.4s, v0.4s
ext v21.16b, v31.16b, v22.16b, #12
// Non-zero XOR <=> reference differs from the neighbour (v20: left, v21: top)
eor v0.16b, v20.16b, v22.16b
eor v1.16b, v21.16b, v22.16b
orr v4.16b, v4.16b, v0.16b
orr v5.16b, v5.16b, v1.16b
ld1 {v21.8h}, [x2], #16 // mv + 0x10
ld1 {v19.8h}, [x2], #16 // mv + 0x20
ld1 {v22.8h}, [x2], #16 // mv + 0x30
ld1 {v18.8h}, [x2], #16 // mv + 0x40
ld1 {v23.8h}, [x2], #16 // mv + 0x50
// Differences against the left neighbour (each row shifted by one mv = 4 bytes)
ext v19.16b, v19.16b, v22.16b, #12
ext v18.16b, v18.16b, v23.16b, #12
sabd v0.8h, v22.8h, v19.8h
ld1 {v19.8h}, [x2], #16 // mv + 0x60
sabd v1.8h, v23.8h, v18.8h
ld1 {v24.8h}, [x2], #16 // mv + 0x70
uqxtn v0.8b, v0.8h
ld1 {v18.8h}, [x2], #16 // mv + 0x80
ld1 {v25.8h}, [x2], #16 // mv + 0x90
uqxtn2 v0.16b, v1.8h
ext v19.16b, v19.16b, v24.16b, #12
ext v18.16b, v18.16b, v25.16b, #12
sabd v1.8h, v24.8h, v19.8h
sabd v2.8h, v25.8h, v18.8h
uqxtn v1.8b, v1.8h
uqxtn2 v1.16b, v2.8h
// Apply the per-component limits, then collapse each mv (byte pair) to one byte
uqsub v0.16b, v0.16b, v6.16b
uqsub v1.16b, v1.16b, v6.16b
uqxtn v0.8b, v0.8h
uqxtn2 v0.16b, v1.8h
sabd v1.8h, v22.8h, v23.8h
orr v4.16b, v4.16b, v0.16b
// Differences against the row above
sabd v0.8h, v21.8h, v22.8h
sabd v2.8h, v23.8h, v24.8h
sabd v3.8h, v24.8h, v25.8h
uqxtn v0.8b, v0.8h
uqxtn2 v0.16b, v1.8h
uqxtn v1.8b, v2.8h
uqxtn2 v1.16b, v3.8h
uqsub v0.16b, v0.16b, v6.16b
uqsub v1.16b, v1.16b, v6.16b
uqxtn v0.8b, v0.8h
uqxtn2 v0.16b, v1.8h
// Run a second pass (next list) only when bframe was 1 on entry
subs w5, w5, #1
orr v5.16b, v5.16b, v0.16b
b.eq bframe
movi v6.16b, #1
// load bytes nnz
ld1 {v31.d}[1], [x0], #8
ld1 {v1.16b}, [x0], #16
movi v0.16b, #0
ld1 {v2.16b}, [x0], #16
ext v3.16b, v0.16b, v1.16b, #15
ext v0.16b, v0.16b, v2.16b, #15
unzip v21.4s, v22.4s, v1.4s, v2.4s
unzip v23.4s, v20.4s, v3.4s, v0.4s
ext v21.16b, v31.16b, v22.16b, #12
movrel x7, transpose_table
ld1 {v7.16b}, [x7]
// nnz of the current block OR its neighbour (v20: left, v21: top)
orr v0.16b, v20.16b, v22.16b
orr v1.16b, v21.16b, v22.16b
umin v0.16b, v0.16b, v6.16b
umin v1.16b, v1.16b, v6.16b
umin v4.16b, v4.16b, v6.16b // mv ? 1 : 0
umin v5.16b, v5.16b, v6.16b
add v0.16b, v0.16b, v0.16b // nnz ? 2 : 0
add v1.16b, v1.16b, v1.16b
umax v4.16b, v4.16b, v0.16b
umax v5.16b, v5.16b, v1.16b
// bs[0] is stored transposed (byte permutation from transpose_table)
tbl v6.16b, {v4.16b}, v7.16b
st1 {v5.16b}, [x3], x6 // bs[1]
st1 {v6.16b}, [x3] // bs[0]
ret
endfunc
// TBL index vector that transposes a 4x4 byte matrix: output byte 4*r+c is
// taken from input byte 4*c+r. Loaded by deblock_strength_neon before it
// stores bs[0].
const transpose_table
.byte 0, 4, 8, 12
.byte 1, 5, 9, 13
.byte 2, 6, 10, 14
.byte 3, 7, 11, 15
endconst

61
common/aarch64/deblock.h Normal file
View File

@@ -0,0 +1,61 @@
/*****************************************************************************
* deblock.h: aarch64 deblocking
*****************************************************************************
* Copyright (C) 2017-2025 x264 project
*
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_AARCH64_DEBLOCK_H
#define X264_AARCH64_DEBLOCK_H
/* Prototypes of the AArch64 assembly deblocking routines. Each public name is
 * routed through x264_template() (defined elsewhere; presumably it adds a
 * bit-depth specific prefix - confirm in the common headers). */

/* Inter-edge filters: take a tc0 array in addition to alpha/beta. */
#define x264_deblock_v_luma_neon x264_template(deblock_v_luma_neon)
void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_luma_neon x264_template(deblock_h_luma_neon)
void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_v_chroma_neon x264_template(deblock_v_chroma_neon)
void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_neon x264_template(deblock_h_chroma_neon)
void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
/* Boundary-strength computation; writes bs[2][8][4]. */
#define x264_deblock_strength_neon x264_template(deblock_strength_neon)
void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
#define x264_deblock_h_chroma_422_neon x264_template(deblock_h_chroma_422_neon)
void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_mbaff_neon x264_template(deblock_h_chroma_mbaff_neon)
void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
/* Intra-edge filters: no tc0 argument. */
#define x264_deblock_h_chroma_intra_mbaff_neon x264_template(deblock_h_chroma_intra_mbaff_neon)
void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_chroma_intra_neon x264_template(deblock_h_chroma_intra_neon)
void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_chroma_422_intra_neon x264_template(deblock_h_chroma_422_intra_neon)
void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_v_chroma_intra_neon x264_template(deblock_v_chroma_intra_neon)
void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_luma_intra_neon x264_template(deblock_h_luma_intra_neon)
void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_v_luma_intra_neon x264_template(deblock_v_luma_intra_neon)
void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
/* SVE implementation of the vertical chroma inter-edge filter. */
#define x264_deblock_v_chroma_sve x264_template(deblock_v_chroma_sve)
void x264_deblock_v_chroma_sve( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#endif

View File

@@ -0,0 +1,66 @@
/****************************************************************************
* mc-a-common.S: aarch64 motion compensation
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
* Mans Rullgard <mans@mansr.com>
* Stefan Groenroos <stefan.gronroos@gmail.com>
* David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
// This file contains the NEON macros and functions that are intended to be used by
// the SVE/SVE2 functions as well
#if BIT_DEPTH == 8
// 0 < weight < 64
// Weight preparation for the "add, add" case: both factors (w6 = weight,
// w7 = 64 - weight as computed by the pixel_avg entry points) are already
// non-negative, so nothing has to change; the mov leaves the value of w6 as is.
.macro load_weights_add_add
mov w6, w6
.endm
// weight > 64
// Here w7 = 64 - weight is negative; negate it so the second term can be
// applied with a multiply-subtract on a positive factor.
.macro load_weights_add_sub
neg w7, w7
.endm
// weight < 0
// Here w6 = weight is negative; negate it so the first term can be applied
// with a multiply-subtract on a positive factor.
.macro load_weights_sub_add
neg w6, w6
.endm
// Unweighted rounded average of two 4-pixel-wide blocks, two rows per
// iteration: dst = (src1 + src2 + 1) >> 1.
// Not exported; reached by a branch from the pixel_avg_4xN entry points with
//   x0 = dst, x1 = dst_stride, x2 = src1, x3 = src1_stride,
//   x4 = src2, x5 = src2_stride, w9 = number of rows (even, > 0).
function pixel_avg_w4_neon
1: subs w9, w9, #2
// Row n: 4 bytes from each source
ld1 {v0.s}[0], [x2], x3
ld1 {v2.s}[0], [x4], x5
urhadd v0.8b, v0.8b, v2.8b
// Row n+1
ld1 {v1.s}[0], [x2], x3
ld1 {v3.s}[0], [x4], x5
urhadd v1.8b, v1.8b, v3.8b
st1 {v0.s}[0], [x0], x1
st1 {v1.s}[0], [x0], x1
// Flags still come from the subs above: loop while rows remain
b.gt 1b
ret
endfunc
#else // BIT_DEPTH == 10
#endif

108
common/aarch64/mc-a-sve.S Normal file
View File

@@ -0,0 +1,108 @@
/*****************************************************************************
* mc-a-sve.S: aarch64 motion compensation
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
#include "mc-a-common.S"
ENABLE_SVE
#if BIT_DEPTH == 8
// void pixel_avg( uint8_t *dst, intptr_t dst_stride,
// uint8_t *src1, intptr_t src1_stride,
// uint8_t *src2, intptr_t src2_stride, int weight );
//
// AVGH_SVE w h emits the exported entry point pixel_avg_<w>x<h>_sve. It only
// selects an implementation and tail-branches to it with
//   w9 = h (row count), w6 = weight, w7 = 64 - weight:
//   weight == 32                      -> pixel_avg_w<w>_neon (plain average)
//   weight  > 64  (64 - weight < 0)   -> pixel_avg_weight_w<w>_add_sub_sve
//   0 <= weight <= 64, weight != 32   -> pixel_avg_weight_w<w>_add_add_sve
//   weight  < 0                       -> pixel_avg_weight_w<w>_sub_add_sve
.macro AVGH_SVE w h
function pixel_avg_\w\()x\h\()_sve, export=1
mov w10, #64
cmp w6, #32
mov w9, #\h
b.eq pixel_avg_w\w\()_neon
subs w7, w10, w6
b.lt pixel_avg_weight_w\w\()_add_sub_sve // weight > 64
cmp w6, #0
b.ge pixel_avg_weight_w\w\()_add_add_sve
b pixel_avg_weight_w\w\()_sub_add_sve // weight < 0
endfunc
.endm
// Instantiate the 4-pixel-wide entry points
AVGH_SVE 4, 2
AVGH_SVE 4, 4
AVGH_SVE 4, 8
AVGH_SVE 4, 16
// 0 < weight < 64
// dst = s1 * v30 + s2 * v31 (16-bit lanes). The h argument is accepted but
// not used by this macro.
.macro weight_add_add_sve dst, s1, s2, h=
mul \dst, \s1, v30.8h
mla \dst, \s2, v31.8h
.endm
// weight > 64
// dst = s1 * v30 - s2 * v31 (16-bit lanes); v31 is expected to hold the
// negated (positive) second factor. The h argument is accepted but not used.
.macro weight_add_sub_sve dst, s1, s2, h=
mul \dst, \s1, v30.8h
mls \dst, \s2, v31.8h
.endm
// weight < 0
// dst = s2 * v31 - s1 * v30 (16-bit lanes); v30 is expected to hold the
// negated (positive) first factor. The h argument is accepted but not used.
.macro weight_sub_add_sve dst, s1, s2, h=
mul \dst, \s2, v31.8h
mls \dst, \s1, v30.8h
.endm
// AVG_WEIGHT_SVE ext emits pixel_avg_weight_w4_<ext>_sve, the weighted
// 4-pixel-wide average used by the pixel_avg_4xN_sve entry points.
// On entry: x0 = dst, x1 = dst_stride, x2 = src1, x3 = src1_stride,
//           x4 = src2, x5 = src2_stride, w6 = weight, w7 = 64 - weight,
//           w9 = number of rows (processed two per iteration).
// Result per pixel: the weighted sum from weight_<ext>_sve, rounded, shifted
// right by 6 and saturated to 8 bits (sqrshrun).
// The code mixes SVE loads (z0-z3) with NEON arithmetic on v0-v5; it relies on
// each Vn being the low 128 bits of Zn.
.macro AVG_WEIGHT_SVE ext
function pixel_avg_weight_w4_\ext\()_sve
load_weights_\ext
// NOTE(review): a byte-granular vl8 predicate used with .h elements should
// enable the first four halfword lanes - confirm against the SVE predicate
// layout.
ptrue p0.b, vl8
dup v30.8h, w6
dup v31.8h, w7
1: // height loop
subs w9, w9, #2
// ld1b into .h elements widens each loaded byte to 16 bits
ld1b {z0.h}, p0/z, [x2]
add x2, x2, x3
ld1b {z1.h}, p0/z, [x4]
add x4, x4, x5
weight_\ext\()_sve v4.8h, v0.8h, v1.8h
ld1b {z2.h}, p0/z, [x2]
add x2, x2, x3
ld1b {z3.h}, p0/z, [x4]
add x4, x4, x5
sqrshrun v0.8b, v4.8h, #6
weight_\ext\()_sve v5.8h, v2.8h, v3.8h
// Only the first 4 result bytes of each row are stored
st1 {v0.s}[0], [x0], x1
sqrshrun v1.8b, v5.8h, #6
st1 {v1.s}[0], [x0], x1
// Flags still come from the subs at the top of the loop
b.gt 1b
ret
endfunc
.endm
// One function per weight range (see the AVGH_SVE dispatch)
AVG_WEIGHT_SVE add_add
AVG_WEIGHT_SVE add_sub
AVG_WEIGHT_SVE sub_add
#else // BIT_DEPTH == 10
#endif

3935
common/aarch64/mc-a.S Normal file

File diff suppressed because it is too large Load Diff

371
common/aarch64/mc-c.c Normal file
View File

@@ -0,0 +1,371 @@
/*****************************************************************************
* mc-c.c: aarch64 motion compensation
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "mc.h"
/* Prototypes of the assembly routines used by this file. Every symbol is
 * routed through x264_template() (defined elsewhere) before it is declared. */

/* Cache prefetch helpers */
#define x264_prefetch_ref_aarch64 x264_template(prefetch_ref_aarch64)
void x264_prefetch_ref_aarch64( pixel *, intptr_t, int );
#define x264_prefetch_fenc_420_aarch64 x264_template(prefetch_fenc_420_aarch64)
void x264_prefetch_fenc_420_aarch64( pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_prefetch_fenc_422_aarch64 x264_template(prefetch_fenc_422_aarch64)
void x264_prefetch_fenc_422_aarch64( pixel *, intptr_t, pixel *, intptr_t, int );
/* Aligned memory helpers */
#define x264_memcpy_aligned_neon x264_template(memcpy_aligned_neon)
void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
#define x264_memzero_aligned_neon x264_template(memzero_aligned_neon)
void x264_memzero_aligned_neon( void *dst, size_t n );
/* Block averaging, NEON: (dst, dst_stride, src1, src1_stride, src2, src2_stride, weight) */
#define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
void x264_pixel_avg_16x16_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
void x264_pixel_avg_16x8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
void x264_pixel_avg_8x16_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
void x264_pixel_avg_8x8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
void x264_pixel_avg_8x4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
void x264_pixel_avg_4x16_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
void x264_pixel_avg_4x8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
void x264_pixel_avg_4x4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
void x264_pixel_avg_4x2_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
/* Block averaging, SVE (4-pixel-wide sizes only) */
#define x264_pixel_avg_4x16_sve x264_template(pixel_avg_4x16_sve)
void x264_pixel_avg_4x16_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_4x8_sve x264_template(pixel_avg_4x8_sve)
void x264_pixel_avg_4x8_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_4x4_sve x264_template(pixel_avg_4x4_sve)
void x264_pixel_avg_4x4_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_4x2_sve x264_template(pixel_avg_4x2_sve)
void x264_pixel_avg_4x2_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
/* Two-source averaging by width; entries of pixel_avg_wtab_neon below */
#define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
void x264_pixel_avg2_w4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
#define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
void x264_pixel_avg2_w8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
#define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon)
void x264_pixel_avg2_w16_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
#define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
void x264_pixel_avg2_w20_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
/* Plane copy / (de)interleave cores */
#define x264_plane_copy_core_neon x264_template(plane_copy_core_neon)
void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
#define x264_plane_copy_swap_core_neon x264_template(plane_copy_swap_core_neon)
void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_neon x264_template(plane_copy_deinterleave_neon)
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_rgb_neon x264_template(plane_copy_deinterleave_rgb_neon)
void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc,
pixel *src, intptr_t i_src, int pw, int w, int h );
#define x264_plane_copy_interleave_core_neon x264_template(plane_copy_interleave_core_neon)
void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
/* Chroma (de)interleave for macroblock buffers */
#define x264_store_interleave_chroma_neon x264_template(store_interleave_chroma_neon)
void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
#define x264_load_deinterleave_chroma_fdec_neon x264_template(load_deinterleave_chroma_fdec_neon)
void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fenc_neon x264_template(load_deinterleave_chroma_fenc_neon)
void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
/* Template names for the weighted-prediction kernels. The four variants are
 * generic, _nodenom, _offsetadd and _offsetsub, each for widths 4/8/16/20. */
#define x264_mc_weight_w16_neon x264_template(mc_weight_w16_neon)
#define x264_mc_weight_w16_nodenom_neon x264_template(mc_weight_w16_nodenom_neon)
#define x264_mc_weight_w16_offsetadd_neon x264_template(mc_weight_w16_offsetadd_neon)
#define x264_mc_weight_w16_offsetsub_neon x264_template(mc_weight_w16_offsetsub_neon)
#define x264_mc_weight_w20_neon x264_template(mc_weight_w20_neon)
#define x264_mc_weight_w20_nodenom_neon x264_template(mc_weight_w20_nodenom_neon)
#define x264_mc_weight_w20_offsetadd_neon x264_template(mc_weight_w20_offsetadd_neon)
#define x264_mc_weight_w20_offsetsub_neon x264_template(mc_weight_w20_offsetsub_neon)
#define x264_mc_weight_w4_neon x264_template(mc_weight_w4_neon)
#define x264_mc_weight_w4_nodenom_neon x264_template(mc_weight_w4_nodenom_neon)
#define x264_mc_weight_w4_offsetadd_neon x264_template(mc_weight_w4_offsetadd_neon)
#define x264_mc_weight_w4_offsetsub_neon x264_template(mc_weight_w4_offsetsub_neon)
#define x264_mc_weight_w8_neon x264_template(mc_weight_w8_neon)
#define x264_mc_weight_w8_nodenom_neon x264_template(mc_weight_w8_nodenom_neon)
#define x264_mc_weight_w8_offsetadd_neon x264_template(mc_weight_w8_offsetadd_neon)
#define x264_mc_weight_w8_offsetsub_neon x264_template(mc_weight_w8_offsetsub_neon)
/* MC_WEIGHT(func) declares the four width-specific kernels of one variant and
 * defines the dispatch table mc<func>_wtab_neon[6]. The table is indexed by
 * (width >> 2) in mc_luma_neon / get_ref_neon: slots 0 and 1 use the w4
 * kernel, slot 2 the w8 kernel, slots 3 and 4 the w16 kernel and slot 5 the
 * w20 kernel. */
#define MC_WEIGHT(func)\
void x264_mc_weight_w20##func##_neon( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w16##func##_neon( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w8##func##_neon ( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w4##func##_neon ( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
\
static void (* mc##func##_wtab_neon[6])( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ) =\
{\
x264_mc_weight_w4##func##_neon,\
x264_mc_weight_w4##func##_neon,\
x264_mc_weight_w8##func##_neon,\
x264_mc_weight_w16##func##_neon,\
x264_mc_weight_w16##func##_neon,\
x264_mc_weight_w20##func##_neon,\
};
/* Instantiates mc_wtab_neon, mc_nodenom_wtab_neon, mc_offsetadd_wtab_neon and
 * mc_offsetsub_wtab_neon. */
MC_WEIGHT()
MC_WEIGHT(_nodenom)
MC_WEIGHT(_offsetadd)
MC_WEIGHT(_offsetsub)
/* Plain block copies by width; entries of mc_copy_wtab_neon below */
#define x264_mc_copy_w4_neon x264_template(mc_copy_w4_neon)
void x264_mc_copy_w4_neon ( pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_mc_copy_w8_neon x264_template(mc_copy_w8_neon)
void x264_mc_copy_w8_neon ( pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_mc_copy_w16_neon x264_template(mc_copy_w16_neon)
void x264_mc_copy_w16_neon( pixel *, intptr_t, pixel *, intptr_t, int );
/* Chroma motion compensation */
#define x264_mc_chroma_neon x264_template(mc_chroma_neon)
void x264_mc_chroma_neon( pixel *, pixel *, intptr_t, pixel *, intptr_t, int, int, int, int );
/* Integral image initialisation */
#define x264_integral_init4h_neon x264_template(integral_init4h_neon)
void x264_integral_init4h_neon( uint16_t *, pixel *, intptr_t );
#define x264_integral_init4v_neon x264_template(integral_init4v_neon)
void x264_integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
#define x264_integral_init8h_neon x264_template(integral_init8h_neon)
void x264_integral_init8h_neon( uint16_t *, pixel *, intptr_t );
#define x264_integral_init8v_neon x264_template(integral_init8v_neon)
void x264_integral_init8v_neon( uint16_t *, intptr_t );
/* Low-resolution frame generation */
#define x264_frame_init_lowres_core_neon x264_template(frame_init_lowres_core_neon)
void x264_frame_init_lowres_core_neon( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, intptr_t, int, int );
/* Macroblock-tree helpers */
#define x264_mbtree_propagate_cost_neon x264_template(mbtree_propagate_cost_neon)
void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
#define x264_mbtree_fix8_pack_neon x264_template(mbtree_fix8_pack_neon)
void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
#define x264_mbtree_fix8_unpack_neon x264_template(mbtree_fix8_unpack_neon)
void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
/* Two-source averaging kernels, indexed by (block width >> 2).
 * Slot 0 has no kernel. */
static void (* const pixel_avg_wtab_neon[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, int ) =
{
    [0] = NULL,
    [1] = x264_pixel_avg2_w4_neon,
    [2] = x264_pixel_avg2_w8_neon,
    [3] = x264_pixel_avg2_w16_neon,   /* w12: the w16 kernel is no slower, so no separate function */
    [4] = x264_pixel_avg2_w16_neon,
    [5] = x264_pixel_avg2_w20_neon,
};
/* Plain copy kernels, indexed by (block width >> 2).
 * Only widths 4, 8 and 16 have a kernel; slots 0 and 3 stay NULL. */
static void (* const mc_copy_wtab_neon[5])( pixel *, intptr_t, pixel *, intptr_t, int ) =
{
    [0] = NULL,
    [1] = x264_mc_copy_w4_neon,
    [2] = x264_mc_copy_w8_neon,
    [3] = NULL,
    [4] = x264_mc_copy_w16_neon,
};
/* Select the weighted-prediction kernel table that matches the weight
 * parameters in *w and cache the value the offset-only kernels need.
 *   h : encoder context (not used by this implementation)
 *   w : weight description; weightfn is always written, cachea[0] is written
 *       only for the offset-only case. */
static void weight_cache_neon( x264_t *h, x264_weight_t *w )
{
    /* scale == 2^denom: presumably a multiplier of exactly one, so only the
     * offset remains and the add/sub kernels are used with its magnitude. */
    if( w->i_scale == (1 << w->i_denom) )
    {
        const int offset_is_negative = w->i_offset < 0;
        w->weightfn  = offset_is_negative ? mc_offsetsub_wtab_neon : mc_offsetadd_wtab_neon;
        w->cachea[0] = offset_is_negative ? -w->i_offset : w->i_offset;
        return;
    }

    /* General case: a zero denominator gets the dedicated "nodenom" kernels. */
    w->weightfn = w->i_denom ? mc_wtab_neon : mc_nodenom_wtab_neon;
}
/* Luma motion compensation into dst.
 *   dst, i_dst_stride : destination block and its stride
 *   src, i_src_stride : four source planes (selected through x264_hpel_ref0 /
 *                       x264_hpel_ref1) and their common stride
 *   mvx, mvy          : motion vector; the low two bits of each component
 *                       select the quarter-sample position
 *   i_width, i_height : block size; i_width >> 2 indexes the kernel tables
 *   weight            : optional weighted prediction (applied when
 *                       weight->weightfn is non-NULL) */
static void mc_luma_neon( pixel *dst, intptr_t i_dst_stride,
                          pixel *src[4], intptr_t i_src_stride,
                          int mvx, int mvy,
                          int i_width, int i_height, const x264_weight_t *weight )
{
    const int frac_x = mvx & 3;
    const int frac_y = mvy & 3;
    const int qpel_idx = (frac_y << 2) + frac_x;
    const int width_idx = i_width >> 2;
    const intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);

    pixel *ref_a = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( frac_y == 3 )             // explicit if() to force conditional add
        ref_a += i_src_stride;

    if( !(qpel_idx & 5) )
    {
        /* A single plane already holds the wanted samples:
         * weight it into dst, or just copy it. */
        if( weight->weightfn )
            weight->weightfn[width_idx]( dst, i_dst_stride, ref_a, i_src_stride, weight, i_height );
        else
            mc_copy_wtab_neon[width_idx]( dst, i_dst_stride, ref_a, i_src_stride, i_height );
        return;
    }

    /* qpel interpolation needed: average two planes, then weight in place */
    pixel *ref_b = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
    pixel_avg_wtab_neon[width_idx]( dst, i_dst_stride, ref_a, i_src_stride, ref_b, i_height );
    if( weight->weightfn )
        weight->weightfn[width_idx]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
}
/* Like mc_luma_neon, but avoids the copy when possible.
 * Returns a pointer to the prediction samples:
 *   - dst, when averaging and/or weighting had to be performed (the stride
 *     in *i_dst_stride is used for dst and left unchanged), or
 *   - a pointer straight into the source plane, in which case *i_dst_stride
 *     is overwritten with i_src_stride. */
static pixel *get_ref_neon( pixel *dst, intptr_t *i_dst_stride,
                            pixel *src[4], intptr_t i_src_stride,
                            int mvx, int mvy,
                            int i_width, int i_height, const x264_weight_t *weight )
{
    const int frac_x = mvx & 3;
    const int frac_y = mvy & 3;
    const int qpel_idx = (frac_y << 2) + frac_x;
    const int width_idx = i_width >> 2;
    const intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);

    pixel *ref_a = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( frac_y == 3 )             // explicit if() to force conditional add
        ref_a += i_src_stride;

    if( qpel_idx & 5 )
    {
        /* qpel interpolation needed: average two planes, then weight in place */
        pixel *ref_b = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
        pixel_avg_wtab_neon[width_idx]( dst, *i_dst_stride, ref_a, i_src_stride, ref_b, i_height );
        if( weight->weightfn )
            weight->weightfn[width_idx]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
        return dst;
    }

    if( weight->weightfn )
    {
        weight->weightfn[width_idx]( dst, *i_dst_stride, ref_a, i_src_stride, weight, i_height );
        return dst;
    }

    /* Nothing to compute: hand the source samples back directly. */
    *i_dst_stride = i_src_stride;
    return ref_a;
}
/* Half-pel filter kernels: write the three output planes dsth / dstv / dstc
 * from src; buf is a caller-provided int16_t scratch buffer. */
#define x264_hpel_filter_neon x264_template(hpel_filter_neon)
void x264_hpel_filter_neon( pixel *dsth, pixel *dstv, pixel *dstc,
pixel *src, intptr_t stride, int width,
int height, int16_t *buf );
/* I8MM variant: declared only for 8-bit builds with I8MM support */
#if !HIGH_BIT_DEPTH && HAVE_I8MM
#define x264_hpel_filter_neon_i8mm x264_template(hpel_filter_neon_i8mm)
void x264_hpel_filter_neon_i8mm( pixel *dsth, pixel *dstv, pixel *dstc,
pixel *src, intptr_t stride, int width,
int height, int16_t *buf );
#endif // !HIGH_BIT_DEPTH && HAVE_I8MM
/* Helper macros defined elsewhere. They presumably expand to the static
 * wrappers plane_copy_neon, plane_copy_swap_neon, plane_copy_interleave_neon
 * and mbtree_propagate_list_neon that x264_mc_init_aarch64 installs - confirm
 * against the macro definitions. */
PLANE_COPY(16, neon)
PLANE_COPY_SWAP(16, neon)
PLANE_INTERLEAVE(neon)
PROPAGATE_LIST(neon)
/* Install the AArch64 motion-compensation implementations into *pf.
 *   cpu : bit mask of X264_CPU_* capability flags
 *   pf  : function table to fill; entries are only overwritten for
 *         capabilities present in cpu.
 * Later sections override earlier ones, so the SVE and I8MM entries replace
 * the NEON ones when those features are available. */
void x264_mc_init_aarch64( uint32_t cpu, x264_mc_functions_t *pf )
{
    if( cpu & X264_CPU_ARMV8 )
    {
        pf->prefetch_ref      = x264_prefetch_ref_aarch64;
        pf->prefetch_fenc_420 = x264_prefetch_fenc_420_aarch64;
        pf->prefetch_fenc_422 = x264_prefetch_fenc_422_aarch64;
    }

    if( cpu & X264_CPU_NEON )
    {
        /* memory helpers */
        pf->memcpy_aligned  = x264_memcpy_aligned_neon;
        pf->memzero_aligned = x264_memzero_aligned_neon;

        /* block averaging */
        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
        pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
        pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
        pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
        pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
        pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
        pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
        pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
        pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;

        /* block copies */
        pf->copy[PIXEL_16x16]    = x264_mc_copy_w16_neon;
        pf->copy[PIXEL_8x8]      = x264_mc_copy_w8_neon;
        pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_neon;
        pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;

        /* weighted prediction */
        pf->weight_cache = weight_cache_neon;
        pf->weight       = mc_wtab_neon;
        pf->offsetadd    = mc_offsetadd_wtab_neon;
        pf->offsetsub    = mc_offsetsub_wtab_neon;

        /* motion compensation and interpolation */
        pf->mc_luma                = mc_luma_neon;
        pf->get_ref                = get_ref_neon;
        pf->mc_chroma              = x264_mc_chroma_neon;
        pf->hpel_filter            = x264_hpel_filter_neon;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;

        /* integral images */
        pf->integral_init4h = x264_integral_init4h_neon;
        pf->integral_init4v = x264_integral_init4v_neon;
        pf->integral_init8h = x264_integral_init8h_neon;
        pf->integral_init8v = x264_integral_init8v_neon;

        /* chroma (de)interleave for macroblock buffers */
        pf->store_interleave_chroma       = x264_store_interleave_chroma_neon;
        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;

        /* whole-plane copies */
        pf->plane_copy                  = plane_copy_neon;
        pf->plane_copy_swap             = plane_copy_swap_neon;
        pf->plane_copy_interleave       = plane_copy_interleave_neon;
        pf->plane_copy_deinterleave     = x264_plane_copy_deinterleave_neon;
        pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;

        /* macroblock tree */
        pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
        pf->mbtree_propagate_list = mbtree_propagate_list_neon;
        pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_neon;
        pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_neon;
    }
#if !HIGH_BIT_DEPTH
#if HAVE_SVE
    if( cpu & X264_CPU_SVE )
    {
        /* SVE versions exist for the 4-pixel-wide averages only */
        pf->avg[PIXEL_4x2]  = x264_pixel_avg_4x2_sve;
        pf->avg[PIXEL_4x4]  = x264_pixel_avg_4x4_sve;
        pf->avg[PIXEL_4x8]  = x264_pixel_avg_4x8_sve;
        pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_sve;
    }
#endif
#if HAVE_I8MM
    if( cpu & X264_CPU_I8MM )
        pf->hpel_filter = x264_hpel_filter_neon_i8mm;
#endif // HAVE_I8MM
#endif // !HIGH_BIT_DEPTH
}

32
common/aarch64/mc.h Normal file
View File

@@ -0,0 +1,32 @@
/*****************************************************************************
* mc.h: aarch64 motion compensation
*****************************************************************************
* Copyright (C) 2014-2025 x264 project
*
* Authors: Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_AARCH64_MC_H
#define X264_AARCH64_MC_H
/* Fills *pf with the AArch64 motion-compensation functions enabled by the
 * capability flags in cpu (implemented in mc-c.c). */
#define x264_mc_init_aarch64 x264_template(mc_init_aarch64)
void x264_mc_init_aarch64( uint32_t cpu, x264_mc_functions_t *pf );
#endif

View File

@@ -0,0 +1,44 @@
/****************************************************************************
* pixel-a-common.S: aarch64 pixel metrics
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
* David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
// This file contains the NEON macros and constants that are intended to be used by
// the SVE/SVE2 functions as well
// Two rows of eight 16-bit lane masks, applied with AND by the hadamard_ac
// code (0 clears a lane, -1 keeps it).
// Row 0 clears lanes 0 and 4; row 1 clears lane 0 only.
const mask_ac_4_8
.short 0, -1, -1, -1, 0, -1, -1, -1
.short 0, -1, -1, -1, -1, -1, -1, -1
endconst
// Two independent SUMSUB_AB butterflies in one macro:
//   (s1, d1) from the pair (a, b) and (s2, d2) from the pair (c, d).
// SUMSUB_AB itself is defined elsewhere (presumably sum = a + b and
// diff = a - b - confirm against asm.S).
.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
SUMSUB_AB \s1, \d1, \a, \b
SUMSUB_AB \s2, \d2, \c, \d
.endm
// Two butterfly stages over the four vectors r1..r4, done in place.
// t1..t4 are scratch registers that hold the first-stage results; the final
// results are written back to r1..r4.
.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
// stage 1: (t1, t2) from (r1, r2) and (t3, t4) from (r3, r4)
SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
// stage 2: (r1, r3) from (t1, t3) and (r2, r4) from (t2, t4)
SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
.endm

View File

@@ -0,0 +1,523 @@
/*****************************************************************************
* pixel-a-sve.S: aarch64 pixel metrics
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Chen <david.chen@myais.com.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
#include "pixel-a-common.S"
ENABLE_SVE
#if BIT_DEPTH == 8
// Prologue of the 4-pixel-wide, 8-bit sum of squared differences.
// In:  x0/x2 = the two pixel pointers, x1/x3 = their strides in bytes.
// Out: v0.4s = squared differences of row 0, z16/z17 = row 1 (preloaded for
//      the following SSD_SVE_4 or SSD_END_SVE_4), p0 = first four halfword lanes.
.macro SSD_START_SVE_4
ptrue p0.h, vl4
// each ld1b reads 4 bytes and zero-extends them to 16-bit lanes
ld1b {z16.h}, p0/z, [x0]
ld1b {z17.h}, p0/z, [x2]
add x0, x0, x1
add x2, x2, x3
// row 0 differences, computed before the registers are reloaded
sub v2.4h, v16.4h, v17.4h
ld1b {z16.h}, p0/z, [x0]
ld1b {z17.h}, p0/z, [x2]
add x0, x0, x1
add x2, x2, x3
// initialise the 32-bit accumulator with diff * diff
smull v0.4s, v2.4h, v2.4h
.endm
// Middle step of the 4-wide, 8-bit SSD: consumes the row already held in
// z16/z17, loads the next row into the same registers and accumulates the
// squared differences into v0.4s. Relies on p0 from SSD_START_SVE_4.
.macro SSD_SVE_4
// take the differences first, the loads below overwrite z16/z17
sub v2.4h, v16.4h, v17.4h
ld1b {z16.h}, p0/z, [x0]
ld1b {z17.h}, p0/z, [x2]
add x0, x0, x1
add x2, x2, x3
smlal v0.4s, v2.4h, v2.4h
.endm
// Epilogue of the 4-wide, 8-bit SSD: accumulates the last row (already in
// z16/z17) into v0.4s without loading anything further.
.macro SSD_END_SVE_4
sub v2.4h, v16.4h, v17.4h
smlal v0.4s, v2.4h, v2.4h
.endm
// Prologue of the 8-pixel-wide, 8-bit sum of squared differences.
// In:  x0/x2 = the two pixel pointers, x1/x3 = their strides in bytes.
// Out: v0.4s = squared differences of row 0 (lanes 0-3 and 4-7 folded
//      together), z16/z17 = row 1 preloaded, p0 = first eight halfword lanes.
.macro SSD_START_SVE_8
ptrue p0.h, vl8
// each ld1b reads 8 bytes and zero-extends them to 16-bit lanes
ld1b {z16.h}, p0/z, [x0]
ld1b {z17.h}, p0/z, [x2]
add x0, x0, x1
add x2, x2, x3
sub v2.8h, v16.8h, v17.8h
// the loads of row 1 are interleaved with the multiplies of row 0
ld1b {z16.h}, p0/z, [x0]
// low four differences start the accumulator ...
smull v0.4s, v2.4h, v2.4h
ld1b {z17.h}, p0/z, [x2]
// ... and the high four are added on top
smlal2 v0.4s, v2.8h, v2.8h
add x0, x0, x1
add x2, x2, x3
.endm
// Middle step of the 8-wide, 8-bit SSD: consumes the row held in z16/z17,
// loads the next row and accumulates the squared differences into v0.4s.
// Relies on p0 from SSD_START_SVE_8.
.macro SSD_SVE_8
// take the differences first, the loads below overwrite z16/z17
sub v2.8h, v16.8h, v17.8h
ld1b {z16.h}, p0/z, [x0]
smlal v0.4s, v2.4h, v2.4h
ld1b {z17.h}, p0/z, [x2]
smlal2 v0.4s, v2.8h, v2.8h
add x0, x0, x1
add x2, x2, x3
.endm
// Epilogue of the 8-wide, 8-bit SSD: accumulates the last row (already in
// z16/z17) into v0.4s without loading anything further.
.macro SSD_END_SVE_8
sub v2.8h, v16.8h, v17.8h
smlal v0.4s, v2.4h, v2.4h
smlal2 v0.4s, v2.8h, v2.8h
.endm
// Emits the exported function pixel_ssd_<w>x<h>_sve for 8-bit pixels.
// In:  x0/x1 = first block pointer/stride, x2/x3 = second block pointer/stride.
// Out: w0 = sum of squared differences over the w x h block.
// The START macro handles row 0 and preloads row 1, each SSD_SVE_<w> step
// handles one row and preloads the next, and the END macro handles the last
// row, so exactly h rows are loaded and accumulated.
.macro SSD_FUNC_SVE w h
function pixel_ssd_\w\()x\h\()_sve, export=1
SSD_START_SVE_\w
.rept \h-2
SSD_SVE_\w
.endr
SSD_END_SVE_\w
// horizontal add of the four 32-bit partial sums
addv s0, v0.4s
mov w0, v0.s[0]
ret
endfunc
.endm
// Loads eight rows of eight 8-bit pixels from x0 (stride x1) and from x2
// (stride x3), widened to 16 bits, and forms the row differences (x0 row
// minus x2 row).
// In:  p0 must already enable eight halfword lanes (set by the caller).
// Out: v16-v23 = differences of rows 0-7,
//      v0/v1   = SUMSUB_AB of rows 0 and 1, v2/v3 = SUMSUB_AB of rows 2 and 3,
//      x0 and x2 advanced by eight rows.
// The loads of later rows are interleaved with the arithmetic on earlier rows.
.macro load_diff_fly_sve_8x8
// rows 0 and 1
ld1b {z1.h}, p0/z, [x2]
ld1b {z0.h}, p0/z, [x0]
add x2, x2, x3
add x0, x0, x1
ld1b {z3.h}, p0/z, [x2]
ld1b {z2.h}, p0/z, [x0]
add x2, x2, x3
add x0, x0, x1
sub v16.8h, v0.8h, v1.8h
sub v17.8h, v2.8h, v3.8h
// rows 2 and 3
ld1b {z5.h}, p0/z, [x2]
ld1b {z4.h}, p0/z, [x0]
add x2, x2, x3
add x0, x0, x1
ld1b {z7.h}, p0/z, [x2]
ld1b {z6.h}, p0/z, [x0]
add x2, x2, x3
add x0, x0, x1
sub v18.8h, v4.8h, v5.8h
sub v19.8h, v6.8h, v7.8h
// rows 4 and 5 (z0-z3 are reused)
ld1b {z1.h}, p0/z, [x2]
ld1b {z0.h}, p0/z, [x0]
add x2, x2, x3
add x0, x0, x1
ld1b {z3.h}, p0/z, [x2]
ld1b {z2.h}, p0/z, [x0]
add x2, x2, x3
add x0, x0, x1
sub v20.8h, v0.8h, v1.8h
sub v21.8h, v2.8h, v3.8h
// rows 6 and 7 (z4-z7 are reused)
ld1b {z5.h}, p0/z, [x2]
ld1b {z4.h}, p0/z, [x0]
add x2, x2, x3
add x0, x0, x1
ld1b {z7.h}, p0/z, [x2]
ld1b {z6.h}, p0/z, [x0]
add x2, x2, x3
add x0, x0, x1
// first butterflies on rows 0-3, placed here, ahead of the last differences
SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h
SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h
sub v22.8h, v4.8h, v5.8h
sub v23.8h, v6.8h, v7.8h
.endm
// Emits the exported function pixel_var_8x<h>_sve for 8-bit pixels.
// In:  x0 = pixel pointer, x1 = stride in bytes.
// Out: produced by the shared tail var_end (x0 = sum | sum of squares << 32).
// Accumulators: v0.8h  = running sum of the pixels (16-bit lanes),
//               v1.4s / v2.4s = running sums of the squared pixels, split over
//               two registers and merged in var_end.
// Rows are loaded two ahead of their use; the loop handles four rows per
// iteration and the code after the loop drains the last two loaded rows.
.macro pixel_var_sve_8 h
function pixel_var_8x\h\()_sve, export=1
ptrue p0.h, vl8
ld1b {z16.h}, p0/z, [x0]
add x0, x0, x1
ld1b {z17.h}, p0/z, [x0]
add x0, x0, x1
// loop counter: four rows are handled outside the loop
mov x2, \h - 4
mul v1.8h, v16.8h, v16.8h
mul v2.8h, v17.8h, v17.8h
add v0.8h, v16.8h, v17.8h
ld1b {z18.h}, p0/z, [x0]
add x0, x0, x1
// widen the first squares to 32 bits by pairwise addition
uaddlp v1.4s, v1.8h
uaddlp v2.4s, v2.8h
ld1b {z19.h}, p0/z, [x0]
add x0, x0, x1
// the flags set by subs are only consumed by the b.gt at the loop end
1: subs x2, x2, #4
add v0.8h, v0.8h, v18.8h
mul v24.8h, v18.8h, v18.8h
ld1b {z20.h}, p0/z, [x0]
add x0, x0, x1
add v0.8h, v0.8h, v19.8h
mul v25.8h, v19.8h, v19.8h
uadalp v1.4s, v24.8h
ld1b {z21.h}, p0/z, [x0]
add x0, x0, x1
add v0.8h, v0.8h, v20.8h
mul v26.8h, v20.8h, v20.8h
uadalp v2.4s, v25.8h
ld1b {z18.h}, p0/z, [x0]
add x0, x0, x1
add v0.8h, v0.8h, v21.8h
mul v27.8h, v21.8h, v21.8h
uadalp v1.4s, v26.8h
ld1b {z19.h}, p0/z, [x0]
add x0, x0, x1
uadalp v2.4s, v27.8h
b.gt 1b
// drain: the last two rows are already in z18/z19
add v0.8h, v0.8h, v18.8h
mul v28.8h, v18.8h, v18.8h
add v0.8h, v0.8h, v19.8h
mul v29.8h, v19.8h, v19.8h
uadalp v1.4s, v28.8h
uadalp v2.4s, v29.8h
// tail call: var_end reduces v0/v1/v2 and returns to our caller
b var_end
endfunc
.endm
// Shared tail of the pixel_var_8x*_sve functions (reached with a plain branch,
// so the ret below returns to the original caller).
// In:  v0.8h = per-lane pixel sums, v1.4s and v2.4s = partial sums of squares.
// Out: x0 = pixel sum in the low 32 bits, sum of squares shifted into the
//      high 32 bits.
function var_end
add v1.4s, v1.4s, v2.4s
uaddlv s0, v0.8h
uaddlv d1, v1.4s
mov w0, v0.s[0]
mov x1, v1.d[0]
orr x0, x0, x1, lsl #32
ret
endfunc
// Butterfly on two vectors: sum = a + b, sub = a - b.
// The operands are used as given; no widening is performed by this macro
// (its users load pixels already widened to 16-bit lanes).
.macro SUMSUBL_AB_SVE sum, sub, a, b
add \sum, \a, \b
sub \sub, \a, \b
.endm
// Exported entry point; pixel.h declares it as
//   int x264_pixel_sa8d_8x8_sve( pixel *, intptr_t, pixel *, intptr_t )
// so x0/x1 and x2/x3 are the two pointer/stride pairs.
// Returns (s + 1) >> 1 in w0, where s is the horizontal sum of the partial
// sums the helper leaves in v0 and v1.
// NOTE(review): the bl target carries the same name as this function. The
// sa8d_satd_sve_8x8 macro instantiates a non-exported function with that name,
// and the export=1 entry point is presumably emitted under a prefixed symbol
// by the function macro in asm.S, so this should be a call to the helper and
// not recursion - confirm against asm.S.
function pixel_sa8d_8x8_sve, export=1
// predicate for eight halfword lanes, consumed by the loads in the helper
ptrue p0.h, vl8
// bl overwrites x30, so the return address is kept in x4
mov x4, x30
bl pixel_sa8d_8x8_sve
add v0.8h, v0.8h, v1.8h
uaddlv s0, v0.8h
mov w0, v0.s[0]
// halve with rounding
add w0, w0, #1
lsr w0, w0, #1
ret x4
endfunc
// Emits the non-exported helper pixel_sa8d_<satd>8x8_sve.
// In:  x0/x1 and x2/x3 = pointer/stride pairs, p0 = eight halfword lanes
//      (both consumed by load_diff_fly_sve_8x8).
// Out: v0.8h and v1.8h = partial sums for the caller to add and reduce.
//      When the macro argument is satd_, v26.8h and v27.8h additionally
//      receive a second set of partial sums computed from the intermediate
//      transform state.
// This file only instantiates the variant with an empty argument.
// The transpose and SUMSUB_AB macros are defined elsewhere.
.macro sa8d_satd_sve_8x8 satd=
function pixel_sa8d_\satd\()8x8_sve
// differences of all eight rows in v16-v23, first butterflies in v0-v3
load_diff_fly_sve_8x8
// finish the butterflies across rows, separately for rows 0-3 and rows 4-7
SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
.ifc \satd, satd_
// optional branch: transform within rows from the current state and
// collect absolute values into v26/v27 without disturbing v16-v23
transpose v0.8h, v1.8h, v16.8h, v17.8h
transpose v2.8h, v3.8h, v18.8h, v19.8h
transpose v4.8h, v5.8h, v20.8h, v21.8h
transpose v6.8h, v7.8h, v22.8h, v23.8h
SUMSUB_AB v24.8h, v25.8h, v0.8h, v1.8h
SUMSUB_AB v26.8h, v27.8h, v2.8h, v3.8h
SUMSUB_AB v0.8h, v1.8h, v4.8h, v5.8h
SUMSUB_AB v2.8h, v3.8h, v6.8h, v7.8h
transpose v4.4s, v6.4s, v24.4s, v26.4s
transpose v5.4s, v7.4s, v25.4s, v27.4s
transpose v24.4s, v26.4s, v0.4s, v2.4s
transpose v25.4s, v27.4s, v1.4s, v3.4s
abs v0.8h, v4.8h
abs v1.8h, v5.8h
abs v2.8h, v6.8h
abs v3.8h, v7.8h
abs v4.8h, v24.8h
abs v5.8h, v25.8h
abs v6.8h, v26.8h
abs v7.8h, v27.8h
// the maximum of the two absolute values replaces the last butterfly stage
// (presumably via max(|a|,|b|) = (|a+b| + |a-b|) / 2 - TODO confirm)
umax v0.8h, v0.8h, v2.8h
umax v1.8h, v1.8h, v3.8h
umax v2.8h, v4.8h, v6.8h
umax v3.8h, v5.8h, v7.8h
add v26.8h, v0.8h, v1.8h
add v27.8h, v2.8h, v3.8h
.endif
// last butterfly stage across rows: combine rows 0-3 with rows 4-7
SUMSUB_AB v0.8h, v16.8h, v16.8h, v20.8h
SUMSUB_AB v1.8h, v17.8h, v17.8h, v21.8h
SUMSUB_AB v2.8h, v18.8h, v18.8h, v22.8h
SUMSUB_AB v3.8h, v19.8h, v19.8h, v23.8h
// transform within rows: transposes at 16-, 32- and 64-bit granularity,
// each followed by a round of butterflies
transpose v20.8h, v21.8h, v16.8h, v17.8h
transpose v4.8h, v5.8h, v0.8h, v1.8h
transpose v22.8h, v23.8h, v18.8h, v19.8h
transpose v6.8h, v7.8h, v2.8h, v3.8h
SUMSUB_AB v2.8h, v3.8h, v20.8h, v21.8h
SUMSUB_AB v24.8h, v25.8h, v4.8h, v5.8h
SUMSUB_AB v0.8h, v1.8h, v22.8h, v23.8h
SUMSUB_AB v4.8h, v5.8h, v6.8h, v7.8h
transpose v20.4s, v22.4s, v2.4s, v0.4s
transpose v21.4s, v23.4s, v3.4s, v1.4s
transpose v16.4s, v18.4s, v24.4s, v4.4s
transpose v17.4s, v19.4s, v25.4s, v5.4s
SUMSUB_AB v0.8h, v2.8h, v20.8h, v22.8h
SUMSUB_AB v1.8h, v3.8h, v21.8h, v23.8h
SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h
SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h
transpose v16.2d, v20.2d, v0.2d, v4.2d
transpose v17.2d, v21.2d, v1.2d, v5.2d
transpose v18.2d, v22.2d, v2.2d, v6.2d
transpose v19.2d, v23.2d, v3.2d, v7.2d
abs v16.8h, v16.8h
abs v20.8h, v20.8h
abs v17.8h, v17.8h
abs v21.8h, v21.8h
abs v18.8h, v18.8h
abs v22.8h, v22.8h
abs v19.8h, v19.8h
abs v23.8h, v23.8h
// as above, umax on absolute values stands in for the final butterfly stage
umax v16.8h, v16.8h, v20.8h
umax v17.8h, v17.8h, v21.8h
umax v18.8h, v18.8h, v22.8h
umax v19.8h, v19.8h, v23.8h
// two vectors of partial sums are returned to the caller
add v0.8h, v16.8h, v17.8h
add v1.8h, v18.8h, v19.8h
ret
endfunc
.endm
// Emits the exported function pixel_hadamard_ac_<w>x<h>_sve for 8-bit pixels.
// In:  x0 = pixel pointer, x1 = stride in bytes.
// Out: x0 = (sum of v28) >> 1 in the low 32 bits and (sum of v29) >> 2 in
//      the high 32 bits.
// The block is covered by one to four calls of hadamard_ac_8x8_sve, each of
// which advances x0 by eight rows and accumulates into v28/v29.
.macro HADAMARD_AC_SVE w h
function pixel_hadamard_ac_\w\()x\h\()_sve, export=1
ptrue p0.h, vl8
movrel x5, mask_ac_4_8
// bl overwrites x30, so the return address is kept in x4
mov x4, x30
// v30 = first mask row, v31 = second mask row
ld1 {v30.8h,v31.8h}, [x5]
// clear both accumulators
movi v28.16b, #0
movi v29.16b, #0
bl hadamard_ac_8x8_sve
.if \h > 8
// second 8x8 directly below the first; x0 already points there
bl hadamard_ac_8x8_sve
.endif
.if \w > 8
// step back eight rows and eight pixels to the right
sub x0, x0, x1, lsl #3
add x0, x0, #8
bl hadamard_ac_8x8_sve
.endif
.if \w * \h == 256
// 16x16 only: step back sixteen rows to reach the remaining (top right) 8x8
sub x0, x0, x1, lsl #4
bl hadamard_ac_8x8_sve
.endif
addv s1, v29.4s
addv s0, v28.4s
mov w1, v1.s[0]
mov w0, v0.s[0]
lsr w1, w1, #2
lsr w0, w0, #1
orr x0, x0, x1, lsl #32
ret x4
endfunc
.endm
// Non-exported helper: processes one 8x8 block of 8-bit pixels at x0
// (stride x1) and accumulates two sums into v28.4s and v29.4s.
// In:  p0 = eight halfword lanes, v30/v31 = the two rows of mask_ac_4_8.
// Out: v28/v29 updated, x0 advanced by eight rows. v0-v7 and v16-v23 are
//      used as scratch.
// v28: satd v29: sa8d v30: mask_ac4 v31: mask_ac8
function hadamard_ac_8x8_sve
// load eight rows widened to 16-bit lanes, interleaved with the first
// butterflies between neighbouring rows
ld1b {z16.h}, p0/z, [x0]
add x0, x0, x1
ld1b {z17.h}, p0/z, [x0]
add x0, x0, x1
ld1b {z18.h}, p0/z, [x0]
add x0, x0, x1
ld1b {z19.h}, p0/z, [x0]
add x0, x0, x1
SUMSUBL_AB_SVE v0.8h, v1.8h, v16.8h, v17.8h
ld1b {z20.h}, p0/z, [x0]
add x0, x0, x1
ld1b {z21.h}, p0/z, [x0]
add x0, x0, x1
SUMSUBL_AB_SVE v2.8h, v3.8h, v18.8h, v19.8h
ld1b {z22.h}, p0/z, [x0]
add x0, x0, x1
ld1b {z23.h}, p0/z, [x0]
add x0, x0, x1
SUMSUBL_AB_SVE v4.8h, v5.8h, v20.8h, v21.8h
SUMSUBL_AB_SVE v6.8h, v7.8h, v22.8h, v23.8h
// second butterfly stage across rows, separately for rows 0-3 and rows 4-7
SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
// transform within rows: transpose at 16- and 32-bit granularity, each
// followed by a round of butterflies
transpose v0.8h, v1.8h, v16.8h, v17.8h
transpose v2.8h, v3.8h, v18.8h, v19.8h
transpose v4.8h, v5.8h, v20.8h, v21.8h
transpose v6.8h, v7.8h, v22.8h, v23.8h
SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h
SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h
transpose v0.4s, v2.4s, v16.4s, v18.4s
transpose v1.4s, v3.4s, v17.4s, v19.4s
transpose v4.4s, v6.4s, v20.4s, v22.4s
transpose v5.4s, v7.4s, v21.4s, v23.4s
SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
// first result: sum of absolute values, accumulated into v28.
// v16-v23 stay intact for the second result below.
abs v0.8h, v16.8h
abs v4.8h, v20.8h
abs v1.8h, v17.8h
abs v5.8h, v21.8h
abs v2.8h, v18.8h
abs v6.8h, v22.8h
abs v3.8h, v19.8h
abs v7.8h, v23.8h
add v0.8h, v0.8h, v4.8h
add v1.8h, v1.8h, v5.8h
// clear lanes 0 and 4 with the first mask row (presumably the DC terms of
// the 4x4 sub-blocks - TODO confirm)
and v0.16b, v0.16b, v30.16b
add v2.8h, v2.8h, v6.8h
add v3.8h, v3.8h, v7.8h
add v0.8h, v0.8h, v2.8h
add v1.8h, v1.8h, v3.8h
uadalp v28.4s, v0.8h
uadalp v28.4s, v1.8h
// second result: one more butterfly stage between rows 0-3 and rows 4-7,
// then a 64-bit transpose
SUMSUB_AB v6.8h, v7.8h, v23.8h, v19.8h
SUMSUB_AB v4.8h, v5.8h, v22.8h, v18.8h
SUMSUB_AB v2.8h, v3.8h, v21.8h, v17.8h
SUMSUB_AB v1.8h, v0.8h, v16.8h, v20.8h
transpose v16.2d, v17.2d, v6.2d, v7.2d
transpose v18.2d, v19.2d, v4.2d, v5.2d
transpose v20.2d, v21.2d, v2.2d, v3.2d
abs v16.8h, v16.8h
abs v17.8h, v17.8h
abs v18.8h, v18.8h
abs v19.8h, v19.8h
abs v20.8h, v20.8h
abs v21.8h, v21.8h
transpose v7.2d, v6.2d, v1.2d, v0.2d
// for these three register pairs the maximum of the absolute values is used
// in place of an explicit butterfly; the doubling below compensates
// (presumably via |a+b| + |a-b| = 2 * max(|a|,|b|) - TODO confirm)
umax v3.8h, v16.8h, v17.8h
umax v2.8h, v18.8h, v19.8h
umax v1.8h, v20.8h, v21.8h
// the remaining pair gets the explicit butterfly so that lane 0 can be
// cleared with the second mask row before taking absolute values
SUMSUB_AB v4.8h, v5.8h, v7.8h, v6.8h
add v2.8h, v2.8h, v3.8h
add v2.8h, v2.8h, v1.8h
and v4.16b, v4.16b, v31.16b
add v2.8h, v2.8h, v2.8h
abs v5.8h, v5.8h
abs v4.8h, v4.8h
add v2.8h, v2.8h, v5.8h
add v2.8h, v2.8h, v4.8h
uadalp v29.4s, v2.8h
ret
endfunc
// Instantiate the 8-bit SVE functions from the macros above.
// SSD: the 4-wide and 8-wide block sizes listed here
SSD_FUNC_SVE 4, 4
SSD_FUNC_SVE 4, 8
SSD_FUNC_SVE 4, 16
SSD_FUNC_SVE 8, 4
SSD_FUNC_SVE 8, 8
// variance: 8x8 and 8x16
pixel_var_sve_8 8
pixel_var_sve_8 16
// empty argument: emits the non-exported helper called by the exported
// pixel_sa8d_8x8_sve entry point
sa8d_satd_sve_8x8
// hadamard_ac: width, height
HADAMARD_AC_SVE 8, 8
HADAMARD_AC_SVE 8, 16
HADAMARD_AC_SVE 16, 8
HADAMARD_AC_SVE 16, 16
#else /* BIT_DEPTH == 10 */
// High bit depth variant: prologue of the 4-pixel-wide SSD on 16-bit pixels.
// In:  x0/x2 = the two pixel pointers, x1/x3 = their strides, scaled by two
//      when added to the pointers (i.e. counted in 16-bit pixels).
// Out: v0.4s = squared differences of row 0, z16/z17 = row 1 preloaded,
//      p0 = first four word lanes.
.macro SSD_START_SVE_4
ptrue p0.s, vl4
// each ld1h reads 4 halfwords and zero-extends them to 32-bit lanes
ld1h {z16.s}, p0/z, [x0]
ld1h {z17.s}, p0/z, [x2]
add x0, x0, x1, lsl #1
add x2, x2, x3, lsl #1
// row 0 differences, computed before the registers are reloaded
sub v2.4s, v16.4s, v17.4s
ld1h {z16.s}, p0/z, [x0]
ld1h {z17.s}, p0/z, [x2]
add x0, x0, x1, lsl #1
add x2, x2, x3, lsl #1
// initialise the accumulator; lanes are already 32 bits wide, so a plain
// multiply is used
mul v0.4s, v2.4s, v2.4s
.endm
// High bit depth variant: middle step of the 4-wide SSD. Consumes the row
// held in z16/z17, loads the next row and accumulates into v0.4s.
// Relies on p0 from SSD_START_SVE_4.
.macro SSD_SVE_4
// take the differences first, the loads below overwrite z16/z17
sub v2.4s, v16.4s, v17.4s
ld1h {z16.s}, p0/z, [x0]
ld1h {z17.s}, p0/z, [x2]
add x0, x0, x1, lsl #1
add x2, x2, x3, lsl #1
mla v0.4s, v2.4s, v2.4s
.endm
// High bit depth variant: epilogue of the 4-wide SSD. Accumulates the last
// row (already in z16/z17) into v0.4s without loading anything further.
.macro SSD_END_SVE_4
sub v2.4s, v16.4s, v17.4s
mla v0.4s, v2.4s, v2.4s
.endm
// High bit depth variant: emits the exported function pixel_ssd_<w>x<h>_sve.
// In:  x0/x1 = first block pointer/stride, x2/x3 = second block pointer/stride.
// Out: w0 = sum of squared differences over the w x h block.
// Only the 4-wide helper macros exist in this branch, so w must be 4.
.macro SSD_FUNC_SVE w h
function pixel_ssd_\w\()x\h\()_sve, export=1
SSD_START_SVE_\w
.rept \h-2
SSD_SVE_\w
.endr
SSD_END_SVE_\w
// horizontal add of the four 32-bit partial sums
addv s0, v0.4s
fmov w0, s0
ret
endfunc
.endm
// Instantiate the high bit depth SVE functions: only the 4-wide SSD sizes.
SSD_FUNC_SVE 4, 4
SSD_FUNC_SVE 4, 8
SSD_FUNC_SVE 4, 16
#endif /* BIT_DEPTH == 8 */

3040
common/aarch64/pixel-a.S Normal file

File diff suppressed because it is too large Load Diff

191
common/aarch64/pixel.h Normal file
View File

@@ -0,0 +1,191 @@
/*****************************************************************************
* pixel.h: aarch64 pixel metrics
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_AARCH64_PIXEL_H
#define X264_AARCH64_PIXEL_H
// Every public name below is routed through x264_template(), which is defined
// elsewhere (presumably it adds a per-bit-depth symbol prefix - TODO confirm).

// NEON: sum of absolute differences, one reference block
#define x264_pixel_sad_16x16_neon x264_template(pixel_sad_16x16_neon)
#define x264_pixel_sad_16x8_neon x264_template(pixel_sad_16x8_neon)
#define x264_pixel_sad_4x16_neon x264_template(pixel_sad_4x16_neon)
#define x264_pixel_sad_4x4_neon x264_template(pixel_sad_4x4_neon)
#define x264_pixel_sad_4x8_neon x264_template(pixel_sad_4x8_neon)
#define x264_pixel_sad_8x16_neon x264_template(pixel_sad_8x16_neon)
#define x264_pixel_sad_8x4_neon x264_template(pixel_sad_8x4_neon)
#define x264_pixel_sad_8x8_neon x264_template(pixel_sad_8x8_neon)
// NEON: sad_x3 variants
#define x264_pixel_sad_x3_16x16_neon x264_template(pixel_sad_x3_16x16_neon)
#define x264_pixel_sad_x3_16x8_neon x264_template(pixel_sad_x3_16x8_neon)
#define x264_pixel_sad_x3_4x4_neon x264_template(pixel_sad_x3_4x4_neon)
#define x264_pixel_sad_x3_4x8_neon x264_template(pixel_sad_x3_4x8_neon)
#define x264_pixel_sad_x3_8x16_neon x264_template(pixel_sad_x3_8x16_neon)
#define x264_pixel_sad_x3_8x4_neon x264_template(pixel_sad_x3_8x4_neon)
#define x264_pixel_sad_x3_8x8_neon x264_template(pixel_sad_x3_8x8_neon)
// NEON: sad_x4 variants
#define x264_pixel_sad_x4_16x16_neon x264_template(pixel_sad_x4_16x16_neon)
#define x264_pixel_sad_x4_16x8_neon x264_template(pixel_sad_x4_16x8_neon)
#define x264_pixel_sad_x4_4x4_neon x264_template(pixel_sad_x4_4x4_neon)
#define x264_pixel_sad_x4_4x8_neon x264_template(pixel_sad_x4_4x8_neon)
#define x264_pixel_sad_x4_8x16_neon x264_template(pixel_sad_x4_8x16_neon)
#define x264_pixel_sad_x4_8x4_neon x264_template(pixel_sad_x4_8x4_neon)
#define x264_pixel_sad_x4_8x8_neon x264_template(pixel_sad_x4_8x8_neon)
// NEON: satd
#define x264_pixel_satd_16x16_neon x264_template(pixel_satd_16x16_neon)
#define x264_pixel_satd_16x8_neon x264_template(pixel_satd_16x8_neon)
#define x264_pixel_satd_4x16_neon x264_template(pixel_satd_4x16_neon)
#define x264_pixel_satd_4x4_neon x264_template(pixel_satd_4x4_neon)
#define x264_pixel_satd_4x8_neon x264_template(pixel_satd_4x8_neon)
#define x264_pixel_satd_8x16_neon x264_template(pixel_satd_8x16_neon)
#define x264_pixel_satd_8x4_neon x264_template(pixel_satd_8x4_neon)
#define x264_pixel_satd_8x8_neon x264_template(pixel_satd_8x8_neon)
// NEON: sum of squared differences
#define x264_pixel_ssd_16x16_neon x264_template(pixel_ssd_16x16_neon)
#define x264_pixel_ssd_16x8_neon x264_template(pixel_ssd_16x8_neon)
#define x264_pixel_ssd_4x16_neon x264_template(pixel_ssd_4x16_neon)
#define x264_pixel_ssd_4x4_neon x264_template(pixel_ssd_4x4_neon)
#define x264_pixel_ssd_4x8_neon x264_template(pixel_ssd_4x8_neon)
#define x264_pixel_ssd_8x16_neon x264_template(pixel_ssd_8x16_neon)
#define x264_pixel_ssd_8x4_neon x264_template(pixel_ssd_8x4_neon)
#define x264_pixel_ssd_8x8_neon x264_template(pixel_ssd_8x8_neon)
// Dot-product variants, only named when HAVE_DOTPROD is set
#if HAVE_DOTPROD
#define x264_pixel_sad_16x8_neon_dotprod x264_template(pixel_sad_16x8_neon_dotprod)
#define x264_pixel_sad_16x16_neon_dotprod x264_template(pixel_sad_16x16_neon_dotprod)
#define x264_pixel_sad_x3_16x16_neon_dotprod x264_template(pixel_sad_x3_16x16_neon_dotprod)
#define x264_pixel_sad_x3_16x8_neon_dotprod x264_template(pixel_sad_x3_16x8_neon_dotprod)
#define x264_pixel_sad_x4_16x16_neon_dotprod x264_template(pixel_sad_x4_16x16_neon_dotprod)
#define x264_pixel_sad_x4_16x8_neon_dotprod x264_template(pixel_sad_x4_16x8_neon_dotprod)
#define x264_pixel_ssd_16x16_neon_dotprod x264_template(pixel_ssd_16x16_neon_dotprod)
#define x264_pixel_ssd_16x8_neon_dotprod x264_template(pixel_ssd_16x8_neon_dotprod)
#define x264_pixel_ssd_8x16_neon_dotprod x264_template(pixel_ssd_8x16_neon_dotprod)
#define x264_pixel_ssd_8x4_neon_dotprod x264_template(pixel_ssd_8x4_neon_dotprod)
#define x264_pixel_ssd_8x8_neon_dotprod x264_template(pixel_ssd_8x8_neon_dotprod)
#endif // HAVE_DOTPROD
// SVE: sum of squared differences (4-wide and 8-wide sizes only).
// These names are not wrapped in a feature guard in this header.
#define x264_pixel_ssd_4x16_sve x264_template(pixel_ssd_4x16_sve)
#define x264_pixel_ssd_4x4_sve x264_template(pixel_ssd_4x4_sve)
#define x264_pixel_ssd_4x8_sve x264_template(pixel_ssd_4x8_sve)
#define x264_pixel_ssd_8x4_sve x264_template(pixel_ssd_8x4_sve)
#define x264_pixel_ssd_8x8_sve x264_template(pixel_ssd_8x8_sve)
// Declares x264_pixel_<name>_<size>_<suffix> for all eight block sizes
// (16x16, 16x8, 8x16, 8x8, 8x4, 4x16, 4x8, 4x4) with return type ret and
// parameter list args.
#define DECL_PIXELS( ret, name, suffix, args ) \
ret x264_pixel_##name##_16x16_##suffix args;\
ret x264_pixel_##name##_16x8_##suffix args;\
ret x264_pixel_##name##_8x16_##suffix args;\
ret x264_pixel_##name##_8x8_##suffix args;\
ret x264_pixel_##name##_8x4_##suffix args;\
ret x264_pixel_##name##_4x16_##suffix args;\
ret x264_pixel_##name##_4x8_##suffix args;\
ret x264_pixel_##name##_4x4_##suffix args;
// Same idea for the SVE ssd functions, which only exist for the five sizes
// 8x8, 8x4, 4x16, 4x8 and 4x4.
#define DECL_PIXELS_SSD_SVE( ret, args ) \
ret x264_pixel_ssd_8x8_sve args;\
ret x264_pixel_ssd_8x4_sve args;\
ret x264_pixel_ssd_4x16_sve args;\
ret x264_pixel_ssd_4x8_sve args;\
ret x264_pixel_ssd_4x4_sve args;
// Single-reference metric: int f( pixel *, intptr_t, pixel *, intptr_t ),
// i.e. two pointer/stride pairs.
#define DECL_X1( name, suffix ) \
DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) )
// Single-reference signature applied to the SVE ssd subset.
#define DECL_X1_SSD_SVE( ) \
DECL_PIXELS_SSD_SVE( int, ( pixel *, intptr_t, pixel *, intptr_t ) )
// Declares both the <name>_x3 and <name>_x4 families. They return void and
// take four / five pixel pointers, one intptr_t and an int pointer
// (presumably one source block, three / four references, a shared stride and
// an output array - TODO confirm against the C reference).
#define DECL_X4( name, suffix ) \
DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\
DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )
// Prototypes generated from the macros above
DECL_X1( sad, neon )
DECL_X4( sad, neon )
DECL_X1( satd, neon )
DECL_X1( ssd, neon )
DECL_X1_SSD_SVE( )
#if HAVE_DOTPROD
// Note: DECL_PIXELS declares all eight sizes for each dotprod family, while
// only some of those names are routed through x264_template() in this header.
DECL_X1( sad, neon_dotprod )
DECL_X4( sad, neon_dotprod )
DECL_X1( ssd, neon_dotprod )
#endif // HAVE_DOTPROD
// Hand-written prototypes for functions that do not fit the DECL_* families.
// Each name is routed through x264_template() right before its prototype.

// ssd on NV12 data; the two uint64_t pointers are outputs (presumably one
// per chroma plane - TODO confirm)
#define x264_pixel_ssd_nv12_core_neon x264_template(pixel_ssd_nv12_core_neon)
void x264_pixel_ssd_nv12_core_neon( pixel *, intptr_t, pixel *, intptr_t, int, int, uint64_t *, uint64_t * );
// vsad: takes one pixel pointer, an intptr_t and an int
#define x264_pixel_vsad_neon x264_template(pixel_vsad_neon)
int x264_pixel_vsad_neon( pixel *, intptr_t, int );
#if HAVE_DOTPROD
#define x264_pixel_vsad_neon_dotprod x264_template(pixel_vsad_neon_dotprod)
int x264_pixel_vsad_neon_dotprod( pixel *, intptr_t, int );
#endif // HAVE_DOTPROD
// sa8d: two pointer/stride pairs
#define x264_pixel_sa8d_8x8_neon x264_template(pixel_sa8d_8x8_neon)
int x264_pixel_sa8d_8x8_neon ( pixel *, intptr_t, pixel *, intptr_t );
#define x264_pixel_sa8d_16x16_neon x264_template(pixel_sa8d_16x16_neon)
int x264_pixel_sa8d_16x16_neon( pixel *, intptr_t, pixel *, intptr_t );
// combined sa8d and satd, returned in one 64-bit value
#define x264_pixel_sa8d_satd_16x16_neon x264_template(pixel_sa8d_satd_16x16_neon)
uint64_t x264_pixel_sa8d_satd_16x16_neon( pixel *, intptr_t, pixel *, intptr_t );
#define x264_pixel_sa8d_8x8_sve x264_template(pixel_sa8d_8x8_sve)
int x264_pixel_sa8d_8x8_sve ( pixel *, intptr_t, pixel *, intptr_t );
// var: one pointer/stride pair, 64-bit result
#define x264_pixel_var_8x8_neon x264_template(pixel_var_8x8_neon)
uint64_t x264_pixel_var_8x8_neon ( pixel *, intptr_t );
#define x264_pixel_var_8x16_neon x264_template(pixel_var_8x16_neon)
uint64_t x264_pixel_var_8x16_neon ( pixel *, intptr_t );
#define x264_pixel_var_16x16_neon x264_template(pixel_var_16x16_neon)
uint64_t x264_pixel_var_16x16_neon( pixel *, intptr_t );
// var2: two pixel pointers without strides, plus an int output pointer
#define x264_pixel_var2_8x8_neon x264_template(pixel_var2_8x8_neon)
int x264_pixel_var2_8x8_neon ( pixel *, pixel *, int * );
#define x264_pixel_var2_8x16_neon x264_template(pixel_var2_8x16_neon)
int x264_pixel_var2_8x16_neon( pixel *, pixel *, int * );
// SVE var: the result packs the pixel sum in the low 32 bits and the sum of
// squares in the high 32 bits (see var_end in pixel-a-sve.S)
#define x264_pixel_var_8x8_sve x264_template(pixel_var_8x8_sve)
uint64_t x264_pixel_var_8x8_sve ( pixel *, intptr_t );
#define x264_pixel_var_8x16_sve x264_template(pixel_var_8x16_sve)
uint64_t x264_pixel_var_8x16_sve ( pixel *, intptr_t );
// hadamard_ac: one pointer/stride pair, two 32-bit results packed into 64 bits
#define x264_pixel_hadamard_ac_8x8_neon x264_template(pixel_hadamard_ac_8x8_neon)
uint64_t x264_pixel_hadamard_ac_8x8_neon ( pixel *, intptr_t );
#define x264_pixel_hadamard_ac_8x16_neon x264_template(pixel_hadamard_ac_8x16_neon)
uint64_t x264_pixel_hadamard_ac_8x16_neon ( pixel *, intptr_t );
#define x264_pixel_hadamard_ac_16x8_neon x264_template(pixel_hadamard_ac_16x8_neon)
uint64_t x264_pixel_hadamard_ac_16x8_neon ( pixel *, intptr_t );
#define x264_pixel_hadamard_ac_16x16_neon x264_template(pixel_hadamard_ac_16x16_neon)
uint64_t x264_pixel_hadamard_ac_16x16_neon( pixel *, intptr_t );
#define x264_pixel_hadamard_ac_8x8_sve x264_template(pixel_hadamard_ac_8x8_sve)
uint64_t x264_pixel_hadamard_ac_8x8_sve ( pixel *, intptr_t );
#define x264_pixel_hadamard_ac_8x16_sve x264_template(pixel_hadamard_ac_8x16_sve)
uint64_t x264_pixel_hadamard_ac_8x16_sve ( pixel *, intptr_t );
#define x264_pixel_hadamard_ac_16x8_sve x264_template(pixel_hadamard_ac_16x8_sve)
uint64_t x264_pixel_hadamard_ac_16x8_sve ( pixel *, intptr_t );
#define x264_pixel_hadamard_ac_16x16_sve x264_template(pixel_hadamard_ac_16x16_sve)
uint64_t x264_pixel_hadamard_ac_16x16_sve( pixel *, intptr_t );
// ssim: the core routine writes its partial sums into sums[2][4]; end4
// produces the float score from two arrays of such sums
#define x264_pixel_ssim_4x4x2_core_neon x264_template(pixel_ssim_4x4x2_core_neon)
void x264_pixel_ssim_4x4x2_core_neon( const pixel *, intptr_t,
const pixel *, intptr_t,
int sums[2][4] );
#define x264_pixel_ssim_end4_neon x264_template(pixel_ssim_end4_neon)
float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
// asd8: two pointer/stride pairs plus an int
#define x264_pixel_asd8_neon x264_template(pixel_asd8_neon)
int x264_pixel_asd8_neon( pixel *, intptr_t, pixel *, intptr_t, int );
#endif

908
common/aarch64/predict-a.S Normal file
View File

@@ -0,0 +1,908 @@
/*****************************************************************************
* predict-a.S: aarch64 intra prediction
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Mans Rullgard <mans@mansr.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
// 16-bit weight tables: 1..4 repeated twice, and 1..8.
// NOTE(review): their users are not in this part of the file (presumably the
// plane prediction functions - confirm).
const p8weight, align=4
.short 1, 2, 3, 4, 1, 2, 3, 4
endconst
const p16weight, align=4
.short 1, 2, 3, 4, 5, 6, 7, 8
endconst
// Gathers single bytes that are xm apart in memory into consecutive byte
// lanes of vd, e.g. a column of pixels. xn is post-incremented by xm after
// every load.
//   n == 8          : lanes 0-7 are loaded
//   otherwise hi=0  : only lanes 0-3 are loaded
//   otherwise hi=1  : only lanes 4-7 are loaded
.macro ldcol.8 vd, xn, xm, n=8, hi=0
.if \n == 8 || \hi == 0
ld1 {\vd\().b}[0], [\xn], \xm
ld1 {\vd\().b}[1], [\xn], \xm
ld1 {\vd\().b}[2], [\xn], \xm
ld1 {\vd\().b}[3], [\xn], \xm
.endif
.if \n == 8 || \hi == 1
ld1 {\vd\().b}[4], [\xn], \xm
ld1 {\vd\().b}[5], [\xn], \xm
ld1 {\vd\().b}[6], [\xn], \xm
ld1 {\vd\().b}[7], [\xn], \xm
.endif
.endm
// Gathers sixteen bytes that are xm apart in memory into byte lanes 0-15 of
// vd. Lanes 0-7 come from ldcol.8, lanes 8-15 are loaded here; xn is
// post-incremented by xm after every load.
.macro ldcol.16 vd, xn, xm
ldcol.8 \vd, \xn, \xm
ld1 {\vd\().b}[ 8], [\xn], \xm
ld1 {\vd\().b}[ 9], [\xn], \xm
ld1 {\vd\().b}[10], [\xn], \xm
ld1 {\vd\().b}[11], [\xn], \xm
ld1 {\vd\().b}[12], [\xn], \xm
ld1 {\vd\().b}[13], [\xn], \xm
ld1 {\vd\().b}[14], [\xn], \xm
ld1 {\vd\().b}[15], [\xn], \xm
.endm
// Horizontal 4x4 prediction without NEON: each of the four rows at x0 is
// filled with the byte immediately to its left.
// In: x0 = destination block; rows are FDEC_STRIDE bytes apart.
function predict_4x4_h_aarch64, export=1
// ldurb is used here because of the negative offset (-1)
ldurb w1, [x0, #0*FDEC_STRIDE-1]
// multiplying a byte by 0x01010101 replicates it into all four bytes
mov w5, #0x01010101
ldrb w2, [x0, #1*FDEC_STRIDE-1]
ldrb w3, [x0, #2*FDEC_STRIDE-1]
mul w1, w1, w5
ldrb w4, [x0, #3*FDEC_STRIDE-1]
mul w2, w2, w5
str w1, [x0, #0*FDEC_STRIDE]
mul w3, w3, w5
str w2, [x0, #1*FDEC_STRIDE]
mul w4, w4, w5
str w3, [x0, #2*FDEC_STRIDE]
str w4, [x0, #3*FDEC_STRIDE]
ret
endfunc
// Vertical 4x4 prediction without NEON: the four bytes of the row above the
// block are copied into each of the four rows.
// In: x0 = destination block; rows are FDEC_STRIDE bytes apart.
function predict_4x4_v_aarch64, export=1
// the row above, loaded as one 32-bit word
ldur w1, [x0, #0 - 1 * FDEC_STRIDE]
str w1, [x0, #0 + 0 * FDEC_STRIDE]
str w1, [x0, #0 + 1 * FDEC_STRIDE]
str w1, [x0, #0 + 2 * FDEC_STRIDE]
str w1, [x0, #0 + 3 * FDEC_STRIDE]
ret
endfunc
// DC 4x4 prediction: fills the block with (sum of the four bytes above +
// sum of the four bytes to the left + 4) >> 3.
// In: x0 = destination block; rows are FDEC_STRIDE bytes apart.
function predict_4x4_dc_neon, export=1
sub x1, x0, #FDEC_STRIDE
// left neighbours of rows 0-3, summed in general purpose registers
ldurb w4, [x0, #-1 + 0 * FDEC_STRIDE]
ldrb w5, [x0, #-1 + 1 * FDEC_STRIDE]
ldrb w6, [x0, #-1 + 2 * FDEC_STRIDE]
ldrb w7, [x0, #-1 + 3 * FDEC_STRIDE]
add w4, w4, w5
// ldr s0 loads the four top bytes and zeroes the rest of v0, so the
// eight-lane uaddlv below only sums those four bytes
ldr s0, [x1]
add w6, w6, w7
uaddlv h0, v0.8b
add w4, w4, w6
dup v0.4h, v0.h[0]
dup v1.4h, w4
add v0.4h, v0.4h, v1.4h
// rounding shift by 3 = divide the eight-sample sum by 8
rshrn v0.8b, v0.8h, #3
str s0, [x0]
str s0, [x0, #1 * FDEC_STRIDE]
str s0, [x0, #2 * FDEC_STRIDE]
str s0, [x0, #3 * FDEC_STRIDE]
ret
endfunc
// DC 4x4 prediction from the top neighbours only: fills the block with
// (sum of the four bytes above + 2) >> 2.
// In: x0 = destination block; rows are FDEC_STRIDE bytes apart.
function predict_4x4_dc_top_neon, export=1
sub x1, x0, #FDEC_STRIDE
// ldr s0 loads the four top bytes and zeroes the rest of v0, so the
// eight-lane uaddlv below only sums those four bytes
ldr s0, [x1]
uaddlv h0, v0.8b
dup v0.4h, v0.h[0]
// rounding shift by 2 = divide the four-sample sum by 4
rshrn v0.8b, v0.8h, #2
str s0, [x0]
str s0, [x0, #1 * FDEC_STRIDE]
str s0, [x0, #2 * FDEC_STRIDE]
str s0, [x0, #3 * FDEC_STRIDE]
ret
endfunc
// Diagonal down-right 4x4 prediction.
// In: x0 = destination block; rows are FDEC_STRIDE bytes apart.
// The top-left, top and left neighbours are assembled into one vector, run
// through a three-tap (1,2,1)/4 filter, and each row is then a four-byte
// window of the filtered vector, shifted by one byte per row.
function predict_4x4_ddr_neon, export=1
sub x1, x0, #FDEC_STRIDE+1
mov x7, #FDEC_STRIDE
ld1 {v0.8b}, [x1], x7 // # -FDEC_STRIDE-1
ld1r {v1.8b}, [x1], x7 // #0*FDEC_STRIDE-1
ld1r {v2.8b}, [x1], x7 // #1*FDEC_STRIDE-1
// each ext #7 shifts the vector up by one byte and inserts one left
// neighbour at lane 0
ext v0.8b, v1.8b, v0.8b, #7
ld1r {v3.8b}, [x1], x7 // #2*FDEC_STRIDE-1
ext v0.8b, v2.8b, v0.8b, #7 // a
ld1r {v4.8b}, [x1], x7 // #3*FDEC_STRIDE-1
ext v1.8b, v3.8b, v0.8b, #7 // b
ext v2.8b, v4.8b, v1.8b, #7 // c
// (a + 2*b + c + 2) >> 2
uaddl v0.8h, v0.8b, v1.8b
uaddl v1.8h, v1.8b, v2.8b
add v0.8h, v0.8h, v1.8h
rshrn v0.8b, v0.8h, #2
ext v3.8b, v0.8b, v0.8b, #3
ext v2.8b, v0.8b, v0.8b, #2
ext v1.8b, v0.8b, v0.8b, #1
// rows 0-3 take the windows starting at bytes 3, 2, 1 and 0
str s3, [x0], #FDEC_STRIDE
str s2, [x0], #FDEC_STRIDE
str s1, [x0], #FDEC_STRIDE
str s0, [x0]
ret
endfunc
// Diagonal down-left 4x4 prediction.
// In: x0 = destination block; rows are FDEC_STRIDE bytes apart.
// Reads the eight bytes of the row above, applies a three-tap filter built
// from uhadd and urhadd, and stores four-byte windows of the result, shifted
// by one byte per row.
function predict_4x4_ddl_neon, export=1
sub x0, x0, #FDEC_STRIDE
mov x7, #FDEC_STRIDE
// the post-increment brings x0 back to row 0 of the block
ld1 {v0.8b}, [x0], x7
// the last top byte is replicated to pad the shifted vector
dup v3.8b, v0.b[7]
ext v1.8b, v0.8b, v0.8b, #1
ext v2.8b, v0.8b, v3.8b, #2
// urhadd( uhadd( t[i], t[i+2] ), t[i+1] )
uhadd v0.8b, v0.8b, v2.8b
urhadd v0.8b, v0.8b, v1.8b
str s0, [x0], #FDEC_STRIDE
ext v1.8b, v0.8b, v0.8b, #1
ext v2.8b, v0.8b, v0.8b, #2
str s1, [x0], #FDEC_STRIDE
ext v3.8b, v0.8b, v0.8b, #3
str s2, [x0], #FDEC_STRIDE
str s3, [x0]
ret
endfunc
// DC 8x8 prediction: fills the block with (sum of sixteen edge bytes + 8) >> 4.
// In: x0 = destination block (rows FDEC_STRIDE bytes apart), x1 = edge buffer.
// The sum covers bytes 7-14 and bytes 16-23 of the edge buffer
// (presumably the left and the top neighbours - layout not visible here,
// confirm against the C caller).
function predict_8x8_dc_neon, export=1
mov x7, #FDEC_STRIDE
ld1 {v0.16b}, [x1], #16
ld1 {v1.8b}, [x1]
// rotate so that bytes 7-14 end up in the low half
ext v0.16b, v0.16b, v0.16b, #7
uaddlv h1, v1.8b
uaddlv h0, v0.8b
add v0.8h, v0.8h, v1.8h
dup v0.8h, v0.h[0]
rshrn v0.8b, v0.8h, #4
.rept 8
st1 {v0.8b}, [x0], x7
.endr
ret
endfunc
// Horizontal 8x8 prediction: row r of the block is filled with byte 14 - r
// of the edge buffer, so bytes 14 down to 7 feed rows 0 to 7.
// In: x0 = destination block (rows FDEC_STRIDE bytes apart), x1 = edge buffer
// (presumably holding the left neighbours bottom-to-top in bytes 7-14 -
// confirm against the C caller).
function predict_8x8_h_neon, export=1
mov x7, #FDEC_STRIDE
ld1 {v16.16b}, [x1]
// the dup for the next row is interleaved with the store of the current one
dup v0.8b, v16.b[14]
dup v1.8b, v16.b[13]
st1 {v0.8b}, [x0], x7
dup v2.8b, v16.b[12]
st1 {v1.8b}, [x0], x7
dup v3.8b, v16.b[11]
st1 {v2.8b}, [x0], x7
dup v4.8b, v16.b[10]
st1 {v3.8b}, [x0], x7
dup v5.8b, v16.b[9]
st1 {v4.8b}, [x0], x7
dup v6.8b, v16.b[8]
st1 {v5.8b}, [x0], x7
dup v7.8b, v16.b[7]
st1 {v6.8b}, [x0], x7
st1 {v7.8b}, [x0], x7
ret
endfunc
// Vertical 8x8 prediction: all eight rows are filled with bytes 16-23 of the
// edge buffer.
// In: x0 = destination block (rows FDEC_STRIDE bytes apart), x1 = edge buffer
// (bytes 16-23 are presumably the top neighbours - confirm against the caller).
function predict_8x8_v_neon, export=1
add x1, x1, #16
mov x7, #FDEC_STRIDE
ld1 {v0.8b}, [x1]
.rept 8
st1 {v0.8b}, [x0], x7
.endr
ret
endfunc
// Diagonal down-left 8x8 prediction.
// In: x0 = destination block (rows FDEC_STRIDE bytes apart), x1 = edge buffer.
// Reads sixteen bytes starting at edge byte 16, applies a three-tap filter
// (urhadd of the centre with the uhadd of both neighbours) and stores
// eight-byte windows of the result, starting at byte 1 for row 0 and moving
// one byte further per row.
function predict_8x8_ddl_neon, export=1
add x1, x1, #16
mov x7, #FDEC_STRIDE
ld1 {v0.16b}, [x1]
movi v3.16b, #0
// the last byte is replicated to pad the vector shifted down by one
dup v2.16b, v0.b[15]
// v4 = input shifted up by one byte; its lane 0 is zero, which is harmless
// because filtered lane 0 is never stored
ext v4.16b, v3.16b, v0.16b, #15
ext v2.16b, v0.16b, v2.16b, #1
uhadd v4.16b, v4.16b, v2.16b
urhadd v0.16b, v0.16b, v4.16b
ext v1.16b, v0.16b, v0.16b, #1
ext v2.16b, v0.16b, v0.16b, #2
st1 {v1.8b}, [x0], x7
ext v3.16b, v0.16b, v0.16b, #3
st1 {v2.8b}, [x0], x7
ext v4.16b, v0.16b, v0.16b, #4
st1 {v3.8b}, [x0], x7
ext v5.16b, v0.16b, v0.16b, #5
st1 {v4.8b}, [x0], x7
ext v6.16b, v0.16b, v0.16b, #6
st1 {v5.8b}, [x0], x7
ext v7.16b, v0.16b, v0.16b, #7
st1 {v6.8b}, [x0], x7
ext v0.16b, v0.16b, v0.16b, #8
st1 {v7.8b}, [x0], x7
st1 {v0.8b}, [x0], x7
ret
endfunc
// Diagonal down-right 8x8 prediction.
// In: x0 = destination block (rows FDEC_STRIDE bytes apart), x1 = edge buffer.
// Reads 32 bytes of the edge buffer, filters the sixteen bytes starting at
// byte 8 with their two neighbours (bytes at offsets 7 and 9), and stores
// eight-byte windows of the result from the bottom row upwards.
function predict_8x8_ddr_neon, export=1
ld1 {v0.16b,v1.16b}, [x1]
// v2 = left neighbours, v4 = right neighbours, v3 = centre
ext v2.16b, v0.16b, v1.16b, #7
ext v4.16b, v0.16b, v1.16b, #9
ext v3.16b, v0.16b, v1.16b, #8
uhadd v2.16b, v2.16b, v4.16b
urhadd v7.16b, v3.16b, v2.16b
// start at row 7 and walk upwards with a negative stride
add x0, x0, #7*FDEC_STRIDE
mov x7, #-1*FDEC_STRIDE
ext v6.16b, v7.16b, v7.16b, #1
st1 {v7.8b}, [x0], x7
ext v5.16b, v7.16b, v7.16b, #2
st1 {v6.8b}, [x0], x7
ext v4.16b, v7.16b, v7.16b, #3
st1 {v5.8b}, [x0], x7
ext v3.16b, v7.16b, v7.16b, #4
st1 {v4.8b}, [x0], x7
ext v2.16b, v7.16b, v7.16b, #5
st1 {v3.8b}, [x0], x7
ext v1.16b, v7.16b, v7.16b, #6
st1 {v2.8b}, [x0], x7
ext v0.16b, v7.16b, v7.16b, #7
st1 {v1.8b}, [x0], x7
st1 {v0.8b}, [x0], x7
ret
endfunc
// Vertical-left 8x8 prediction.
// In: x0 = destination block (rows FDEC_STRIDE bytes apart), x1 = edge buffer.
// Reads sixteen bytes starting at edge byte 16. Even rows are windows of the
// two-tap average (v3), odd rows are windows of the three-tap filter (v0).
// NOTE: v1 and v2 are read by the two ext instructions before anything in
// this function has written them. Their stale contents only reach lane 0 of
// v1 and lane 15 of v2; after filtering that is lane 0 and lane 15 of v0 and
// lane 15 of v3, and none of the stored windows include those lanes.
function predict_8x8_vl_neon, export=1
add x1, x1, #16
mov x7, #FDEC_STRIDE
ld1 {v0.16b}, [x1]
// v1 = input shifted up by one byte, v2 = input shifted down by one byte
ext v1.16b, v1.16b, v0.16b, #15
ext v2.16b, v0.16b, v2.16b, #1
uhadd v1.16b, v1.16b, v2.16b
// two-tap average of each byte and its right neighbour
urhadd v3.16b, v0.16b, v2.16b
// three-tap filter
urhadd v0.16b, v0.16b, v1.16b
ext v4.16b, v0.16b, v0.16b, #1
st1 {v3.8b}, [x0], x7
ext v5.16b, v3.16b, v3.16b, #1
st1 {v4.8b}, [x0], x7
ext v6.16b, v0.16b, v0.16b, #2
st1 {v5.8b}, [x0], x7
ext v7.16b, v3.16b, v3.16b, #2
st1 {v6.8b}, [x0], x7
ext v4.16b, v0.16b, v0.16b, #3
st1 {v7.8b}, [x0], x7
ext v5.16b, v3.16b, v3.16b, #3
st1 {v4.8b}, [x0], x7
ext v6.16b, v0.16b, v0.16b, #4
st1 {v5.8b}, [x0], x7
st1 {v6.8b}, [x0], x7
ret
endfunc
// Vertical-right 8x8 prediction.
// In: x0 = destination block (rows FDEC_STRIDE bytes apart), x1 = edge buffer.
// Reads sixteen bytes starting at edge byte 8 and builds a two-tap average
// and a three-tap filter of them. Rows 0 and 1 are the upper halves of those
// two vectors; each following pair of rows shifts one more byte in from the
// de-interleaved lower half of the three-tap vector.
function predict_8x8_vr_neon, export=1
add x1, x1, #8
mov x7, #FDEC_STRIDE
ld1 {v2.16b}, [x1]
// v1 and v0 = input rotated by two and by one byte
ext v1.16b, v2.16b, v2.16b, #14
ext v0.16b, v2.16b, v2.16b, #15
uhadd v3.16b, v2.16b, v1.16b
// two-tap average
urhadd v2.16b, v2.16b, v0.16b
// three-tap filter
urhadd v0.16b, v0.16b, v3.16b
// row 0 source: upper half of the two-tap vector
ext v1.16b, v2.16b, v2.16b, #8
// even / odd bytes of the lower half of the three-tap vector
uzp1 v2.8b, v0.8b, v0.8b
uzp2 v3.8b, v0.8b, v0.8b
// row 1 source: upper half of the three-tap vector
ext v0.16b, v0.16b, v0.16b, #8
st1 {v1.8b}, [x0], x7
st1 {v0.8b}, [x0], x7
ext v4.8b, v3.8b, v1.8b, #7
ext v5.8b, v2.8b, v0.8b, #7
st1 {v4.8b}, [x0], x7
st1 {v5.8b}, [x0], x7
ext v6.8b, v3.8b, v1.8b, #6
ext v7.8b, v2.8b, v0.8b, #6
st1 {v6.8b}, [x0], x7
st1 {v7.8b}, [x0], x7
ext v1.8b, v3.8b, v1.8b, #5
ext v0.8b, v2.8b, v0.8b, #5
st1 {v1.8b}, [x0], x7
st1 {v0.8b}, [x0], x7
ret
endfunc
// Horizontal-down 8x8 prediction.
// In: x0 = destination block (rows FDEC_STRIDE bytes apart), x1 = edge buffer.
// Reads sixteen bytes starting at edge byte 7, builds a two-tap average (v4)
// and a three-tap filter (v0), interleaves their low halves into v16/v17 and
// stores eight-byte windows that move two bytes per row.
function predict_8x8_hd_neon, export=1
add x1, x1, #7
mov x7, #FDEC_STRIDE
ld1 {v1.16b}, [x1]
// rotated copies supply the neighbours one and two bytes further on
ext v3.16b, v1.16b, v1.16b, #1
ext v2.16b, v1.16b, v1.16b, #2
urhadd v4.16b, v1.16b, v3.16b
uhadd v1.16b, v1.16b, v2.16b
urhadd v0.16b, v1.16b, v3.16b
// interleave two-tap and three-tap results byte by byte
zip1 v16.8b, v4.8b, v0.8b
zip2 v17.8b, v4.8b, v0.8b
// upper half of the three-tap vector, used for the first three rows
ext v7.16b, v0.16b, v0.16b, #8
ext v0.8b, v17.8b, v7.8b, #6
ext v1.8b, v17.8b, v7.8b, #4
st1 {v0.8b}, [x0], x7
ext v2.8b, v17.8b, v7.8b, #2
st1 {v1.8b}, [x0], x7
st1 {v2.8b}, [x0], x7
ext v3.8b, v16.8b, v17.8b, #6
st1 {v17.8b}, [x0], x7
ext v4.8b, v16.8b, v17.8b, #4
st1 {v3.8b}, [x0], x7
ext v5.8b, v16.8b, v17.8b, #2
st1 {v4.8b}, [x0], x7
st1 {v5.8b}, [x0], x7
st1 {v16.8b}, [x0], x7
ret
endfunc
// Horizontal-up 8x8 prediction.
// In: x0 = destination block (rows FDEC_STRIDE bytes apart), x1 = edge buffer.
// Reads eight bytes starting at edge byte 7, reverses them, pads with the
// byte that was first in memory, builds a two-tap average (v0) and a
// three-tap filter (v1), interleaves them into v16/v17 and stores eight-byte
// windows that move two bytes per row. The tail is padded with the last
// 16-bit pair of v17.
function predict_8x8_hu_neon, export=1
add x1, x1, #7
mov x7, #FDEC_STRIDE
ld1 {v7.8b}, [x1]
// padding value: lane 0 before the reversal
dup v6.8b, v7.b[0]
rev64 v7.8b, v7.8b
ext v4.8b, v7.8b, v6.8b, #2
ext v2.8b, v7.8b, v6.8b, #1
uhadd v5.8b, v7.8b, v4.8b
urhadd v0.8b, v2.8b, v7.8b
urhadd v1.8b, v5.8b, v2.8b
zip1 v16.8b, v0.8b, v1.8b
zip2 v17.8b, v0.8b, v1.8b
// replicate the final pair to fill the rows that run off the end
dup v18.4h, v17.h[3]
ext v0.8b, v16.8b, v17.8b, #2
ext v1.8b, v16.8b, v17.8b, #4
ext v2.8b, v16.8b, v17.8b, #6
st1 {v16.8b}, [x0], x7
st1 {v0.8b}, [x0], x7
st1 {v1.8b}, [x0], x7
st1 {v2.8b}, [x0], x7
ext v4.8b, v17.8b, v18.8b, #2
ext v5.8b, v17.8b, v18.8b, #4
ext v6.8b, v17.8b, v18.8b, #6
st1 {v17.8b}, [x0], x7
st1 {v4.8b}, [x0], x7
st1 {v5.8b}, [x0], x7
st1 {v6.8b}, [x0]
ret
endfunc
// Chroma 8x8 DC prediction from the top neighbours only.
// In: x0 = destination block; rows are FDEC_STRIDE bytes apart.
// The eight bytes above the block are averaged as two groups of four; the
// results are arranged into v0/v1 by the transpose macro (defined elsewhere)
// and the block is written by the shared tail pred8x8c_dc_end, which expects
// x1 = FDEC_STRIDE, v0 = rows 0-3 and v1 = rows 4-7.
function predict_8x8c_dc_top_neon, export=1
sub x2, x0, #FDEC_STRIDE
mov x1, #FDEC_STRIDE
ld1 {v0.8b}, [x2]
// pairwise add twice: sums of top bytes 0-3 and 4-7
uaddlp v0.4h, v0.8b
addp v0.4h, v0.4h, v0.4h
// (sum + 2) >> 2
rshrn v0.8b, v0.8h, #2
dup v3.8b, v0.b[1]
dup v2.8b, v0.b[0]
transpose v0.2s, v1.2s, v2.2s, v3.2s
// tail lives inside predict_8x8c_dc_neon and returns to our caller
b pred8x8c_dc_end
endfunc
// void predict_8x8c_dc_left_neon( uint8_t *src )
// x0 = src. DC prediction of an 8x8 chroma block from the left column only.
// Left neighbours are read at src[row * FDEC_STRIDE - 1]. Rows 0-3 are filled with
// the rounded average of left[0..3] (v0), rows 4-7 with that of left[4..7] (v1).
// Finishes in the shared store tail pred8x8c_dc_end (x1 = FDEC_STRIDE).
function predict_8x8c_dc_left_neon, export=1
// row 0 has offset -1, hence the unscaled-offset ldurb form
ldurb w2, [x0, #0 * FDEC_STRIDE - 1]
ldrb w3, [x0, #1 * FDEC_STRIDE - 1]
ldrb w4, [x0, #2 * FDEC_STRIDE - 1]
ldrb w5, [x0, #3 * FDEC_STRIDE - 1]
mov x1, #FDEC_STRIDE
add w2, w2, w3
add w3, w4, w5
ldrb w6, [x0, #4 * FDEC_STRIDE - 1]
ldrb w7, [x0, #5 * FDEC_STRIDE - 1]
ldrb w8, [x0, #6 * FDEC_STRIDE - 1]
ldrb w9, [x0, #7 * FDEC_STRIDE - 1]
add w6, w6, w7
add w7, w8, w9
// w2 = left[0..3] summed, w6 = left[4..7] summed
add w2, w2, w3
add w6, w6, w7
dup v0.8h, w2
dup v1.8h, w6
// (sum + 2) >> 2 in every byte lane
rshrn v0.8b, v0.8h, #2
rshrn v1.8b, v1.8h, #2
b pred8x8c_dc_end
endfunc
// void predict_8x8c_dc_neon( uint8_t *src )
// x0 = src. DC prediction of an 8x8 chroma block from both the row above and the
// left column. Using the names of the inline comments: s0/s1 are the sums of
// top[0..3] / top[4..7], s2/s3 the sums of left[0..3] / left[4..7].
// The four left-column pair sums are packed as 16-bit fields into x10 so they can
// be moved into a vector register with a single instruction.
// The label pred8x8c_dc_end is a store tail shared with the dc_top and dc_left
// variants: it writes v0 to rows 0-3 and v1 to rows 4-7 and needs x1 = FDEC_STRIDE.
function predict_8x8c_dc_neon, export=1
mov x1, #FDEC_STRIDE
sub x2, x0, #FDEC_STRIDE
ldurb w10, [x0, #0 * FDEC_STRIDE - 1]
ldrb w11, [x0, #1 * FDEC_STRIDE - 1]
ldrb w12, [x0, #2 * FDEC_STRIDE - 1]
ldrb w13, [x0, #3 * FDEC_STRIDE - 1]
add w10, w10, w11
ldrb w4, [x0, #4 * FDEC_STRIDE - 1]
ldrb w5, [x0, #5 * FDEC_STRIDE - 1]
add w12, w12, w13
ldrb w6, [x0, #6 * FDEC_STRIDE - 1]
ldrb w7, [x0, #7 * FDEC_STRIDE - 1]
add w4, w4, w5
add w6, w6, w7
// pack: x10 = (l0+l1) | (l2+l3) << 16 | (l4+l5) << 32 | (l6+l7) << 48
add w10, w10, w12, lsl #16
add w4, w4, w6, lsl #16
ld1 {v0.8b}, [x2]
add x10, x10, x4, lsl #32
uaddlp v0.4h, v0.8b // s0, s1
mov v1.d[0], x10 // s2, s3
add v3.4h, v0.4h, v1.4h
addp v0.4h, v0.4h, v1.4h // s0, s1, s2, s3
addp v1.4h, v3.4h, v3.4h // s0+s2, s1+s3, s0+s2, s1+s3
uzp2 v0.4h, v0.4h, v0.4h // s1, s3, s1, s3
uzp1 v1.2d, v1.2d, v1.2d
uzp1 v0.2d, v0.2d, v0.2d
// two-source sums are divided by 8, single-source sums by 4 (both rounded)
rshrn v3.8b, v1.8h, #3
rshrn v2.8b, v0.8h, #2
uzp1 v0.8b, v3.8b, v2.8b
uzp2 v1.8b, v2.8b, v3.8b
pred8x8c_dc_end:
// four row pointers, each used for two consecutive rows
add x2, x0, #2 * FDEC_STRIDE
add x4, x0, #4 * FDEC_STRIDE
add x5, x0, #6 * FDEC_STRIDE
st1 {v0.8b}, [x0], x1
st1 {v0.8b}, [x2], x1
st1 {v0.8b}, [x0]
st1 {v0.8b}, [x2]
st1 {v1.8b}, [x4], x1
st1 {v1.8b}, [x5], x1
st1 {v1.8b}, [x4]
st1 {v1.8b}, [x5]
ret
endfunc
// void predict_8x8c_h_neon( uint8_t *src )
// x0 = src. Horizontal prediction of an 8x8 chroma block: every row is filled
// with the byte immediately to its left (src[row * FDEC_STRIDE - 1]).
// x1 walks the left column, x0 the destination; two rows per iteration, 4 iterations.
function predict_8x8c_h_neon, export=1
sub x1, x0, #1
mov x7, #FDEC_STRIDE
.rept 4
// ld1r loads one byte and replicates it to all lanes
ld1r {v0.8b}, [x1], x7
ld1r {v1.8b}, [x1], x7
st1 {v0.8b}, [x0], x7
st1 {v1.8b}, [x0], x7
.endr
ret
endfunc
// void predict_8x8c_v_aarch64( uint8_t *src )
// x0 = src. Vertical prediction of an 8x8 chroma block: the 8 bytes of the row
// above (src - FDEC_STRIDE) are copied into each of the 8 rows.
// Uses only a general-purpose register (one 64-bit load, eight 64-bit stores),
// no vector registers.
function predict_8x8c_v_aarch64, export=1
ldur x1, [x0, #-FDEC_STRIDE]
.irp c, 0,1,2,3,4,5,6,7
str x1, [x0, #\c * FDEC_STRIDE]
.endr
ret
endfunc
// void predict_8x8c_p_neon( uint8_t *src )
// x0 = src. Plane prediction of an 8x8 chroma block.
// Gradients b (horizontal) and c (vertical) are derived from weighted differences
// of the top row and the left column; the block is then produced row by row as
// clip_to_u8( (pix + x*b + y*c) >> 5 ) using a saturating narrowing shift.
// ldcol.8 and movrel are macros, p8weight and p16weight are constant tables; all
// four are defined outside this chunk.
function predict_8x8c_p_neon, export=1
// x3 -> top-left neighbour (row above, column -1); x2 -> row above, column 4
sub x3, x0, #FDEC_STRIDE
mov x1, #FDEC_STRIDE
add x2, x3, #4
sub x3, x3, #1
ld1 {v0.s}[0], [x3]
ld1 {v2.s}[0], [x2], x1
// presumably loads four left-column bytes into the upper half of v0, then four
// more into v3 -- confirm against the ldcol.8 macro definition
ldcol.8 v0, x3, x1, 4, hi=1
add x3, x3, x1
ldcol.8 v3, x3, x1, 4
movrel x4, p8weight
movrel x5, p16weight
uaddl v4.8h, v2.8b, v3.8b
rev32 v0.8b, v0.8b
trn1 v2.2s, v2.2s, v3.2s
ld1 {v7.8h}, [x4]
usubl v2.8h, v2.8b, v0.8b
mul v2.8h, v2.8h, v7.8h
ld1 {v0.8h}, [x5]
saddlp v2.4s, v2.8h
addp v2.4s, v2.4s, v2.4s
// multiply by 17 (x + 16*x), then rounding shift by 5
shl v3.2s, v2.2s, #4
add v2.2s, v2.2s, v3.2s
rshrn v5.4h, v2.4s, #5 // b, c, x, x
addp v2.4h, v5.4h, v5.4h
shl v3.4h, v2.4h, #2
sub v3.4h, v3.4h, v2.4h // 3 * (b + c)
rev64 v4.4h, v4.4h
add v4.4h, v4.4h, v0.4h
shl v2.4h, v4.4h, #4 // a
sub v2.4h, v2.4h, v3.4h // a - 3 * (b + c) + 16
// rotate the weight table by one halfword and zero lane 0 (see the 0..7 comment below)
ext v0.16b, v0.16b, v0.16b, #14
// NOTE(review): v6 is not read again anywhere in this function
sub v6.4h, v5.4h, v3.4h
mov v0.h[0], wzr
mul v0.8h, v0.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
dup v1.8h, v2.h[0] // pix
dup v2.8h, v5.h[1] // c
add v1.8h, v1.8h, v0.8h // pix + x*b
mov x3, #8
// one row per iteration: narrow with saturation, then step the accumulator by c
1:
subs x3, x3, #1
sqshrun v0.8b, v1.8h, #5
add v1.8h, v1.8h, v2.8h
st1 {v0.8b}, [x0], x1
b.ne 1b
ret
endfunc
// loadsum4 wd, t1, t2, t3, x, idx
// Sums the four left-neighbour bytes of rows idx .. idx+3, i.e. the bytes at
// [x + (idx + k) * FDEC_STRIDE - 1] for k = 0..3, into the 32-bit register wd.
// t1, t2 and t3 are scratch registers and are clobbered.
// For idx == 0 the first offset is -1, so the unscaled-offset ldurb form is used.
.macro loadsum4 wd, t1, t2, t3, x, idx
.if \idx == 0
ldurb \wd, [\x, #(\idx + 0) * FDEC_STRIDE - 1]
.else
ldrb \wd, [\x, #(\idx + 0) * FDEC_STRIDE - 1]
.endif
ldrb \t1, [\x, #(\idx + 1) * FDEC_STRIDE - 1]
ldrb \t2, [\x, #(\idx + 2) * FDEC_STRIDE - 1]
ldrb \t3, [\x, #(\idx + 3) * FDEC_STRIDE - 1]
add \wd, \wd, \t1
add \t1, \t2, \t3
add \wd, \wd, \t1
.endm
// void predict_8x16c_h_neon( uint8_t *src )
// x0 = src. Horizontal prediction of an 8x16 chroma block: every row is filled
// with the byte immediately to its left.
// Even and odd rows are handled through separate pointer pairs that both advance
// by 2 * FDEC_STRIDE: x2 (source) / x0 (destination) for even rows,
// x3 (source) / x1 (destination) for odd rows. 4 iterations x 4 rows = 16 rows.
function predict_8x16c_h_neon, export=1
sub x2, x0, #1
add x3, x0, #FDEC_STRIDE - 1
mov x7, #2 * FDEC_STRIDE
add x1, x0, #FDEC_STRIDE
.rept 4
ld1r {v0.8b}, [x2], x7
ld1r {v1.8b}, [x3], x7
ld1r {v2.8b}, [x2], x7
ld1r {v3.8b}, [x3], x7
st1 {v0.8b}, [x0], x7
st1 {v1.8b}, [x1], x7
st1 {v2.8b}, [x0], x7
st1 {v3.8b}, [x1], x7
.endr
ret
endfunc
// void predict_8x16c_v_neon( uint8_t *src )
// x0 = src. Vertical prediction of an 8x16 chroma block: the 8 bytes of the row
// above are copied into all 16 rows.
// x1 starts at the row above; the post-increment of 2 * FDEC_STRIDE on the load
// leaves it at row 1, so x0 then writes the even rows and x1 the odd rows.
function predict_8x16c_v_neon, export=1
sub x1, x0, #FDEC_STRIDE
mov x2, #2 * FDEC_STRIDE
ld1 {v0.8b}, [x1], x2
.rept 8
st1 {v0.8b}, [x0], x2
st1 {v0.8b}, [x1], x2
.endr
ret
endfunc
// void predict_8x16c_p_neon( uint8_t *src )
// x0 = src. Plane prediction of an 8x16 chroma block.
// H and V are weighted differences along the top row and the left column (weights
// from the p16weight table); b = (17 * H + 16) >> 5 and c = (5 * V + 32) >> 6.
// The block is produced two rows per loop iteration with a rounding, saturating
// narrowing shift by 5, stepping the row accumulator by c after each row.
// ldcol.8 and movrel are macros and p16weight is a constant table, all defined
// outside this chunk.
function predict_8x16c_p_neon, export=1
movrel x4, p16weight
ld1 {v17.8h}, [x4]
// x3 -> top-left neighbour (row above, column -1); x2 -> row above, column 4
sub x3, x0, #FDEC_STRIDE
mov x1, #FDEC_STRIDE
add x2, x3, #4
sub x3, x3, #1
ld1 {v0.8b}, [x3]
ld1 {v2.8b}, [x2], x1
// presumably v1 / v3 receive the upper / lower eight left-column bytes --
// confirm against the ldcol.8 macro definition
ldcol.8 v1, x3, x1
add x3, x3, x1
ldcol.8 v3, x3, x1
ext v4.8b, v2.8b, v2.8b, #3
ext v5.8b, v3.8b, v3.8b, #7
rev32 v0.8b, v0.8b
rev64 v1.8b, v1.8b
uaddl v4.8h, v5.8b, v4.8b // a * 1/16
usubl v2.8h, v2.8b, v0.8b
mul v2.8h, v2.8h, v17.8h
saddlp v2.4s, v2.8h
addp v2.4s, v2.4s, v2.4s // H
usubl v3.8h, v3.8b, v1.8b
mul v3.8h, v3.8h, v17.8h
saddlp v3.4s, v3.8h
addp v3.4s, v3.4s, v3.4s
addp v3.4s, v3.4s, v3.4s // V
// rotate the weight table by one halfword; lane 0 is zeroed further down
ext v17.16b, v17.16b, v17.16b, #14
shl v4.4h, v4.4h, #4 // a
shl v6.2s, v2.2s, #4 // 16 * H
shl v7.2s, v3.2s, #2 // 4 * V
add v2.2s, v2.2s, v6.2s // 17 * H
add v3.2s, v3.2s, v7.2s // 5 * V
rshrn v2.4h, v2.4s, #5 // b
rshrn v3.4h, v3.4s, #6 // c
mov v17.h[0], wzr
sub v4.4h, v4.4h, v2.4h // a - b
shl v6.4h, v2.4h, #1 // 2 * b
add v4.4h, v4.4h, v3.4h // a - b + c
shl v7.4h, v3.4h, #3 // 8 * c
sub v4.4h, v4.4h, v6.4h // a - 3b + c
sub v4.4h, v4.4h, v7.4h // a - 3b - 7c
mul v0.8h, v17.8h, v2.h[0] // 0,1,2,3,4,5,6,7 * b
dup v1.8h, v4.h[0] // i00
dup v2.8h, v3.h[0] // c
add v1.8h, v1.8h, v0.8h // pix + {0..7}*b
mov x3, #16
// two rows per iteration; the counter therefore decreases by 2
1:
subs x3, x3, #2
sqrshrun v4.8b, v1.8h, #5
add v1.8h, v1.8h, v2.8h
sqrshrun v5.8b, v1.8h, #5
st1 {v4.8b}, [x0], x1
add v1.8h, v1.8h, v2.8h
st1 {v5.8b}, [x0], x1
b.ne 1b
ret
endfunc
// void predict_8x16c_dc_neon( uint8_t *src )
// x0 = src. DC prediction of an 8x16 chroma block from the row above and the left
// column. s0/s1 are the sums of top[0..3] / top[4..7]; s2..s5 are the sums of
// the left column in groups of four rows (computed by the loadsum4 macro).
// Resulting values per 4x4 quadrant (left half | right half), all rounded:
//   rows  0-3 : (s0 + s2) / 8 | s1 / 4
//   rows  4-7 : s3 / 4        | (s1 + s3) / 8
//   rows  8-11: s4 / 4        | (s1 + s4) / 8
//   rows 12-15: s5 / 4        | (s1 + s5) / 8
// The single-source cases fall out of adding the sum to itself before the shift by 3.
function predict_8x16c_dc_neon, export=1
mov x1, #FDEC_STRIDE
sub x10, x0, #FDEC_STRIDE
loadsum4 w2, w3, w4, w5, x0, 0
ld1 {v6.8b}, [x10]
loadsum4 w6, w7, w8, w9, x0, 4
uaddlp v6.4h, v6.8b
dup v22.8h, w2 // s2
dup v23.8h, w6 // s3
loadsum4 w2, w3, w4, w5, x0, 8
addp v6.4h, v6.4h, v6.4h // s0, s1
loadsum4 w6, w7, w8, w9, x0, 12
dup v20.8h, v6.h[0] // s0
dup v21.8h, v6.h[1] // s1
dup v24.8h, w2 // s4
dup v25.8h, w6 // s5
// each ext yields { first operand x4 halfwords, s1 x4 halfwords }
ext v16.16b, v20.16b, v21.16b, #8
ext v17.16b, v22.16b, v21.16b, #8
ext v1.16b, v23.16b, v21.16b, #8
ext v2.16b, v24.16b, v21.16b, #8
ext v3.16b, v25.16b, v21.16b, #8
add v0.8h, v16.8h, v17.8h
add v1.8h, v1.8h, v23.8h
add v2.8h, v2.8h, v24.8h
add v3.8h, v3.8h, v25.8h
rshrn v0.8b, v0.8h, #3
rshrn v1.8b, v1.8h, #3
rshrn v2.8b, v2.8h, #3
rshrn v3.8b, v3.8h, #3
// one row pointer per group of four rows
add x11, x0, #4 * FDEC_STRIDE
add x12, x0, #8 * FDEC_STRIDE
add x13, x0, #12 * FDEC_STRIDE
.rept 4
st1 {v0.8b}, [x0], x1
st1 {v1.8b}, [x11], x1
st1 {v2.8b}, [x12], x1
st1 {v3.8b}, [x13], x1
.endr
ret
endfunc
// void predict_8x16c_dc_left_neon( uint8_t *src )
// x0 = src. DC prediction of an 8x16 chroma block from the left column only.
// The 16 left neighbours (src[row * FDEC_STRIDE - 1]) are summed in four groups of
// four rows; each group of four output rows is filled with (sum + 2) >> 2 of its
// own group (v0..v3). Loads, scalar adds and stores are interleaved.
function predict_8x16c_dc_left_neon, export=1
mov x1, #FDEC_STRIDE
// rows 0-3 -> w2 -> v0
ldurb w2, [x0, # 0 * FDEC_STRIDE - 1]
ldrb w3, [x0, # 1 * FDEC_STRIDE - 1]
ldrb w4, [x0, # 2 * FDEC_STRIDE - 1]
ldrb w5, [x0, # 3 * FDEC_STRIDE - 1]
add w2, w2, w3
ldrb w6, [x0, # 4 * FDEC_STRIDE - 1]
add w4, w4, w5
ldrb w7, [x0, # 5 * FDEC_STRIDE - 1]
add w2, w2, w4
ldrb w8, [x0, # 6 * FDEC_STRIDE - 1]
ldrb w9, [x0, # 7 * FDEC_STRIDE - 1]
dup v0.8h, w2
// rows 4-7 -> w6 -> v1
add w6, w6, w7
rshrn v0.8b, v0.8h, #2
add w8, w8, w9
ldrb w10, [x0, # 8 * FDEC_STRIDE - 1]
ldrb w11, [x0, # 9 * FDEC_STRIDE - 1]
add w6, w6, w8
ldrb w12, [x0, #10 * FDEC_STRIDE - 1]
ldrb w13, [x0, #11 * FDEC_STRIDE - 1]
dup v1.8h, w6
// rows 8-11 -> w10 -> v2
add w10, w10, w11
rshrn v1.8b, v1.8h, #2
add w12, w12, w13
ldrb w2, [x0, #12 * FDEC_STRIDE - 1]
ldrb w3, [x0, #13 * FDEC_STRIDE - 1]
add w10, w10, w12
ldrb w4, [x0, #14 * FDEC_STRIDE - 1]
ldrb w5, [x0, #15 * FDEC_STRIDE - 1]
dup v2.8h, w10
// rows 12-15 -> w2 (reused) -> v3; all loads are done, so x0 may now advance
add w2, w2, w3
rshrn v2.8b, v2.8h, #2
add w4, w4, w5
st1 {v0.8b}, [x0], x1
st1 {v0.8b}, [x0], x1
add w2, w2, w4
st1 {v0.8b}, [x0], x1
dup v3.8h, w2
st1 {v0.8b}, [x0], x1
rshrn v3.8b, v3.8h, #2
// remaining 12 rows: four stores each of v1, v2 and v3
.irp idx, 1, 2, 3
.rept 4
st1 {v\idx\().8b}, [x0], x1
.endr
.endr
ret
endfunc
// void predict_8x16c_dc_top_neon( uint8_t *src )
// x0 = src. DC prediction of an 8x16 chroma block from the row above only.
// Columns 0-3 of every row get (top[0..3] summed + 2) >> 2, columns 4-7 get
// (top[4..7] summed + 2) >> 2. The same 8-byte pattern is stored to all 16 rows.
function predict_8x16c_dc_top_neon, export=1
sub x2, x0, #FDEC_STRIDE
mov x1, #FDEC_STRIDE
ld1 {v0.8b}, [x2]
uaddlp v0.4h, v0.8b
addp v0.4h, v0.4h, v0.4h
rshrn v4.8b, v0.8h, #2
dup v0.8b, v4.b[0]
dup v1.8b, v4.b[1]
// v0 = { left dc x4, right dc x4 }
ext v0.8b, v0.8b, v1.8b, #4
.rept 16
st1 {v0.8b}, [x0], x1
.endr
ret
endfunc
// void predict_16x16_dc_top_neon( uint8_t *src )
// x0 = src. DC prediction of a 16x16 block from the row above only: the 16 bytes
// at src - FDEC_STRIDE are summed, the sum is rounded and divided by 16, and the
// result is broadcast. Finishes in the shared store tail pred16x16_dc_end (inside
// predict_16x16_dc_neon), which expects x1 = FDEC_STRIDE and the fill value in v0.
function predict_16x16_dc_top_neon, export=1
sub x2, x0, #FDEC_STRIDE
mov x1, #FDEC_STRIDE
ld1 {v0.16b}, [x2]
// horizontal widening sum of all 16 bytes into h0
uaddlv h0, v0.16b
rshrn v0.8b, v0.8h, #4
dup v0.16b, v0.b[0]
b pred16x16_dc_end
endfunc
// void predict_16x16_dc_left_neon( uint8_t *src )
// x0 = src. DC prediction of a 16x16 block from the left column only: the sum of
// the 16 gathered bytes is rounded, divided by 16 and broadcast. Finishes in the
// shared store tail pred16x16_dc_end (x1 = FDEC_STRIDE, fill value in v0).
function predict_16x16_dc_left_neon, export=1
sub x2, x0, #1
mov x1, #FDEC_STRIDE
// ldcol.16 is a macro defined outside this chunk; presumably it gathers the 16
// bytes at x2 + k * x1 into v0 -- confirm against its definition
ldcol.16 v0, x2, x1
uaddlv h0, v0.16b
rshrn v0.8b, v0.8h, #4
dup v0.16b, v0.b[0]
b pred16x16_dc_end
endfunc
// void predict_16x16_dc_neon( uint8_t *src )
// x0 = src. DC prediction of a 16x16 block from both the row above and the left
// column: the two 16-byte sums are added, rounded and divided by 32.
// The label pred16x16_dc_end is a store tail shared with the dc_top and dc_left
// variants: it writes v0 to 16 consecutive rows and needs x1 = FDEC_STRIDE.
function predict_16x16_dc_neon, export=1
sub x3, x0, #FDEC_STRIDE
sub x2, x0, #1
mov x1, #FDEC_STRIDE
ld1 {v0.16b}, [x3]
// ldcol.16 is a macro defined outside this chunk; presumably it gathers the
// left column into v1 -- confirm against its definition
ldcol.16 v1, x2, x1
uaddlv h0, v0.16b
uaddlv h1, v1.16b
add v0.4h, v0.4h, v1.4h
rshrn v0.8b, v0.8h, #5
dup v0.16b, v0.b[0]
pred16x16_dc_end:
.rept 16
st1 {v0.16b}, [x0], x1
.endr
ret
endfunc
// void predict_16x16_h_neon( uint8_t *src )
// x0 = src. Horizontal prediction of a 16x16 block: every row is filled with the
// byte immediately to its left. x1 walks the left column, x0 the destination;
// two rows per iteration, 8 iterations.
function predict_16x16_h_neon, export=1
sub x1, x0, #1
mov x7, #FDEC_STRIDE
.rept 8
ld1r {v0.16b}, [x1], x7
ld1r {v1.16b}, [x1], x7
st1 {v0.16b}, [x0], x7
st1 {v1.16b}, [x0], x7
.endr
ret
endfunc
// void predict_16x16_v_neon( uint8_t *src )
// x0 = src. Vertical prediction of a 16x16 block: the 16 bytes of the row above
// are copied into all 16 rows.
// x0 itself is moved up one row for the load; the post-increment brings it back
// to row 0 before the stores start.
function predict_16x16_v_neon, export=1
sub x0, x0, #FDEC_STRIDE
mov x7, #FDEC_STRIDE
ld1 {v0.16b}, [x0], x7
.rept 16
st1 {v0.16b}, [x0], x7
.endr
ret
endfunc
// void predict_16x16_p_neon( uint8_t *src )
// x0 = src. Plane prediction of a 16x16 block.
// Gradients b and c come from weighted differences of the top row and the left
// column (weights from the p16weight table), each scaled as (5 * x + 32) >> 6.
// Two accumulators hold one row: v1 for columns 0-7 and v3 for columns 8-15
// (v1 + 8*b). Each loop iteration narrows both with saturation, stores 16 bytes
// and steps both accumulators by c.
// ldcol.8 and movrel are macros and p16weight is a constant table, all defined
// outside this chunk.
function predict_16x16_p_neon, export=1
// x3 -> top-left neighbour (row above, column -1); x2 -> row above, column 8
sub x3, x0, #FDEC_STRIDE
mov x1, #FDEC_STRIDE
add x2, x3, #8
sub x3, x3, #1
ld1 {v0.8b}, [x3]
ld1 {v2.8b}, [x2], x1
// presumably v1 / v3 receive the upper / lower eight left-column bytes --
// confirm against the ldcol.8 macro definition
ldcol.8 v1, x3, x1
add x3, x3, x1
ldcol.8 v3, x3, x1
rev64 v0.8b, v0.8b
rev64 v1.8b, v1.8b
movrel x4, p16weight
uaddl v4.8h, v2.8b, v3.8b
ld1 {v7.8h}, [x4]
usubl v2.8h, v2.8b, v0.8b
usubl v3.8h, v3.8b, v1.8b
mul v2.8h, v2.8h, v7.8h
mul v3.8h, v3.8h, v7.8h
saddlp v2.4s, v2.8h
saddlp v3.4s, v3.8h
addp v2.4s, v2.4s, v3.4s
addp v2.4s, v2.4s, v2.4s
// multiply by 5 (x + 4*x), then rounding shift by 6
shl v3.2s, v2.2s, #2
add v2.2s, v2.2s, v3.2s
rshrn v5.4h, v2.4s, #6 // b, c, x, x
addp v2.4h, v5.4h, v5.4h
shl v3.4h, v2.4h, #3
sub v3.4h, v3.4h, v2.4h // 7 * (b + c)
ext v4.16b, v4.16b, v4.16b, #14
add v4.4h, v4.4h, v7.4h
shl v2.4h, v4.4h, #4 // a
sub v2.4h, v2.4h, v3.4h // a - 7 * (b + c) + 16
// rotate the weight table by one halfword and zero lane 0 (see the 0..7 comment below)
ext v7.16b, v7.16b, v7.16b, #14
mov v7.h[0], wzr
dup v3.8h, v5.h[0]
mul v0.8h, v7.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
dup v1.8h, v2.h[0] // pix
dup v2.8h, v5.h[1] // c
// v3 = 8 * b, the offset between the two 8-column halves
shl v3.8h, v3.8h, #3
add v1.8h, v1.8h, v0.8h // pix + x*b
add v3.8h, v3.8h, v1.8h // pix + x{8-15}*b
mov x3, #16
1:
subs x3, x3, #1
sqshrun v0.8b, v1.8h, #5
add v1.8h, v1.8h, v2.8h
sqshrun2 v0.16b, v3.8h, #5
add v3.8h, v3.8h, v2.8h
st1 {v0.16b}, [x0], x1
b.ne 1b
ret
endfunc

116
common/aarch64/predict-c.c Normal file
View File

@@ -0,0 +1,116 @@
/*****************************************************************************
* predict-c.c: aarch64 intra prediction
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "predict.h"
#include "pixel.h"
/* Install the AArch64 4x4 intra predictors that `cpu` supports into pf[].
 * Entries without an assembly version are left as they were.
 * When HIGH_BIT_DEPTH is set nothing is installed. */
void x264_predict_4x4_init_aarch64( uint32_t cpu, x264_predict_t pf[12] )
{
#if !HIGH_BIT_DEPTH
    const int has_armv8 = ( cpu & X264_CPU_ARMV8 ) != 0;
    const int has_neon  = ( cpu & X264_CPU_NEON ) != 0;

    /* predictors gated on the NEON flag */
    if( has_neon )
    {
        pf[I_PRED_4x4_DDR]    = x264_predict_4x4_ddr_neon;
        pf[I_PRED_4x4_DDL]    = x264_predict_4x4_ddl_neon;
        pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon;
        pf[I_PRED_4x4_DC]     = x264_predict_4x4_dc_neon;
    }

    /* predictors gated on the ARMv8 flag alone */
    if( has_armv8 )
    {
        pf[I_PRED_4x4_V] = x264_predict_4x4_v_aarch64;
        pf[I_PRED_4x4_H] = x264_predict_4x4_h_aarch64;
    }
#endif // !HIGH_BIT_DEPTH
}
/* Install the AArch64 8x8 chroma intra predictors that `cpu` supports into pf[].
 * When HIGH_BIT_DEPTH is set nothing is installed. */
void x264_predict_8x8c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] )
{
#if !HIGH_BIT_DEPTH
    /* the vertical predictor is gated on the ARMv8 flag alone */
    if( cpu & X264_CPU_ARMV8 )
        pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_aarch64;

    /* every other predictor is gated on the NEON flag */
    if( cpu & X264_CPU_NEON )
    {
        pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_neon;
        pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_neon;
        pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
        pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
        pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
    }
#endif // !HIGH_BIT_DEPTH
}
/* Install the NEON 8x16 chroma intra predictors into pf[] when `cpu` has NEON.
 * When HIGH_BIT_DEPTH is set nothing is installed. */
void x264_predict_8x16c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] )
{
    if( cpu & X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        /* DC family */
        pf[I_PRED_CHROMA_DC]      = x264_predict_8x16c_dc_neon;
        pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x16c_dc_top_neon;
        pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x16c_dc_left_neon;
        /* directional and plane */
        pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_neon;
        pf[I_PRED_CHROMA_V]       = x264_predict_8x16c_v_neon;
        pf[I_PRED_CHROMA_P]       = x264_predict_8x16c_p_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
/* Install the NEON 8x8 luma intra predictors into pf[] when `cpu` has NEON.
 * predict_filter is accepted for interface compatibility but is not touched here.
 * When HIGH_BIT_DEPTH is set nothing is installed. */
void x264_predict_8x8_init_aarch64( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
{
    if( cpu & X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        /* DC and axis-aligned modes */
        pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
        pf[I_PRED_8x8_V]   = x264_predict_8x8_v_neon;
        pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
        /* diagonal modes */
        pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon;
        pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
        pf[I_PRED_8x8_VL]  = x264_predict_8x8_vl_neon;
        pf[I_PRED_8x8_VR]  = x264_predict_8x8_vr_neon;
        pf[I_PRED_8x8_HD]  = x264_predict_8x8_hd_neon;
        pf[I_PRED_8x8_HU]  = x264_predict_8x8_hu_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
/* Install the NEON 16x16 intra predictors into pf[] when `cpu` has NEON.
 * When HIGH_BIT_DEPTH is set nothing is installed. */
void x264_predict_16x16_init_aarch64( uint32_t cpu, x264_predict_t pf[7] )
{
    if( cpu & X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        /* directional and plane */
        pf[I_PRED_16x16_V]       = x264_predict_16x16_v_neon;
        pf[I_PRED_16x16_H]       = x264_predict_16x16_h_neon;
        pf[I_PRED_16x16_P]       = x264_predict_16x16_p_neon;
        /* DC family */
        pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_neon;
        pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_neon;
        pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_neon;
#endif // !HIGH_BIT_DEPTH
    }
}

119
common/aarch64/predict.h Normal file
View File

@@ -0,0 +1,119 @@
/*****************************************************************************
* predict.h: aarch64 intra prediction
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_AARCH64_PREDICT_H
#define X264_AARCH64_PREDICT_H
/* Prototypes of the AArch64 assembly intra predictors and of the init functions
 * that install them. Every public name is first routed through x264_template(),
 * a macro defined outside this header, and then declared.
 * Functions taking only `src` predict in place from neighbouring pixels; the 8x8
 * luma predictors additionally receive a 36-byte `edge` buffer. */

/* Predictors with the _aarch64 suffix (installed under X264_CPU_ARMV8 by the
 * init code in predict-c.c). */
#define x264_predict_4x4_h_aarch64 x264_template(predict_4x4_h_aarch64)
void x264_predict_4x4_h_aarch64( uint8_t *src );
#define x264_predict_4x4_v_aarch64 x264_template(predict_4x4_v_aarch64)
void x264_predict_4x4_v_aarch64( uint8_t *src );
#define x264_predict_8x8c_v_aarch64 x264_template(predict_8x8c_v_aarch64)
void x264_predict_8x8c_v_aarch64( uint8_t *src );
// for the merged 4x4 intra sad/satd which expects unified suffix
#define x264_predict_4x4_h_neon x264_predict_4x4_h_aarch64
#define x264_predict_4x4_v_neon x264_predict_4x4_v_aarch64
#define x264_predict_8x8c_v_neon x264_predict_8x8c_v_aarch64

/* NEON predictors: 4x4 luma */
#define x264_predict_4x4_dc_top_neon x264_template(predict_4x4_dc_top_neon)
void x264_predict_4x4_dc_top_neon( uint8_t *src );
#define x264_predict_4x4_ddr_neon x264_template(predict_4x4_ddr_neon)
void x264_predict_4x4_ddr_neon( uint8_t *src );
#define x264_predict_4x4_ddl_neon x264_template(predict_4x4_ddl_neon)
void x264_predict_4x4_ddl_neon( uint8_t *src );

/* NEON predictors: 8x8 and 8x16 chroma */
#define x264_predict_8x8c_dc_top_neon x264_template(predict_8x8c_dc_top_neon)
void x264_predict_8x8c_dc_top_neon( uint8_t *src );
#define x264_predict_8x8c_dc_left_neon x264_template(predict_8x8c_dc_left_neon)
void x264_predict_8x8c_dc_left_neon( uint8_t *src );
#define x264_predict_8x8c_p_neon x264_template(predict_8x8c_p_neon)
void x264_predict_8x8c_p_neon( uint8_t *src );
#define x264_predict_8x16c_dc_left_neon x264_template(predict_8x16c_dc_left_neon)
void x264_predict_8x16c_dc_left_neon( uint8_t *src );
#define x264_predict_8x16c_dc_top_neon x264_template(predict_8x16c_dc_top_neon)
void x264_predict_8x16c_dc_top_neon( uint8_t *src );
#define x264_predict_8x16c_p_neon x264_template(predict_8x16c_p_neon)
void x264_predict_8x16c_p_neon( uint8_t *src );

/* NEON predictors: 8x8 luma (take the edge buffer) */
#define x264_predict_8x8_ddl_neon x264_template(predict_8x8_ddl_neon)
void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_ddr_neon x264_template(predict_8x8_ddr_neon)
void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_vl_neon x264_template(predict_8x8_vl_neon)
void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_vr_neon x264_template(predict_8x8_vr_neon)
void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_hd_neon x264_template(predict_8x8_hd_neon)
void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_hu_neon x264_template(predict_8x8_hu_neon)
void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );

/* NEON predictors: 16x16 luma */
#define x264_predict_16x16_dc_top_neon x264_template(predict_16x16_dc_top_neon)
void x264_predict_16x16_dc_top_neon( uint8_t *src );
#define x264_predict_16x16_dc_left_neon x264_template(predict_16x16_dc_left_neon)
void x264_predict_16x16_dc_left_neon( uint8_t *src );
#define x264_predict_16x16_p_neon x264_template(predict_16x16_p_neon)
void x264_predict_16x16_p_neon( uint8_t *src );

/* Remaining NEON predictors, all block sizes */
#define x264_predict_4x4_dc_neon x264_template(predict_4x4_dc_neon)
void x264_predict_4x4_dc_neon( uint8_t *src );
#define x264_predict_8x8_v_neon x264_template(predict_8x8_v_neon)
void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_h_neon x264_template(predict_8x8_h_neon)
void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_dc_neon x264_template(predict_8x8_dc_neon)
void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8c_dc_neon x264_template(predict_8x8c_dc_neon)
void x264_predict_8x8c_dc_neon( uint8_t *src );
#define x264_predict_8x8c_h_neon x264_template(predict_8x8c_h_neon)
void x264_predict_8x8c_h_neon( uint8_t *src );
#define x264_predict_8x16c_v_neon x264_template(predict_8x16c_v_neon)
void x264_predict_8x16c_v_neon( uint8_t *src );
#define x264_predict_8x16c_h_neon x264_template(predict_8x16c_h_neon)
void x264_predict_8x16c_h_neon( uint8_t *src );
#define x264_predict_8x16c_dc_neon x264_template(predict_8x16c_dc_neon)
void x264_predict_8x16c_dc_neon( uint8_t *src );
#define x264_predict_16x16_v_neon x264_template(predict_16x16_v_neon)
void x264_predict_16x16_v_neon( uint8_t *src );
#define x264_predict_16x16_h_neon x264_template(predict_16x16_h_neon)
void x264_predict_16x16_h_neon( uint8_t *src );
#define x264_predict_16x16_dc_neon x264_template(predict_16x16_dc_neon)
void x264_predict_16x16_dc_neon( uint8_t *src );

/* Init functions: write the predictors supported by `cpu` (a mask of X264_CPU_*
 * flags) into the function-pointer table pf[]. */
#define x264_predict_4x4_init_aarch64 x264_template(predict_4x4_init_aarch64)
void x264_predict_4x4_init_aarch64( uint32_t cpu, x264_predict_t pf[12] );
#define x264_predict_8x8_init_aarch64 x264_template(predict_8x8_init_aarch64)
void x264_predict_8x8_init_aarch64( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
#define x264_predict_8x8c_init_aarch64 x264_template(predict_8x8c_init_aarch64)
void x264_predict_8x8c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] );
#define x264_predict_8x16c_init_aarch64 x264_template(predict_8x16c_init_aarch64)
void x264_predict_8x16c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] );
#define x264_predict_16x16_init_aarch64 x264_template(predict_16x16_init_aarch64)
void x264_predict_16x16_init_aarch64( uint32_t cpu, x264_predict_t pf[7] );
#endif /* X264_AARCH64_PREDICT_H */

1169
common/aarch64/quant-a.S Normal file

File diff suppressed because it is too large Load Diff

95
common/aarch64/quant.h Normal file
View File

@@ -0,0 +1,95 @@
/*****************************************************************************
* quant.h: aarch64 quantization and level-run
*****************************************************************************
* Copyright (C) 2005-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_AARCH64_QUANT_H
#define X264_AARCH64_QUANT_H
/* Prototypes of the AArch64 assembly quantization, dequantization, decimation
 * and coefficient-scanning routines. Every public name is first routed through
 * x264_template(), a macro defined outside this header, and then declared.
 * dctcoef, udctcoef and x264_run_level_t are project types declared elsewhere. */

/* Quantization: the int return value is presumably a "block has nonzero
 * coefficients" indicator -- confirm against the C reference. */
#define x264_quant_2x2_dc_aarch64 x264_template(quant_2x2_dc_aarch64)
int x264_quant_2x2_dc_aarch64( int16_t dct[4], int mf, int bias );
#define x264_quant_2x2_dc_neon x264_template(quant_2x2_dc_neon)
int x264_quant_2x2_dc_neon( dctcoef dct[4], int mf, int bias );
#define x264_quant_4x4_dc_neon x264_template(quant_4x4_dc_neon)
int x264_quant_4x4_dc_neon( dctcoef dct[16], int mf, int bias );
#define x264_quant_4x4_neon x264_template(quant_4x4_neon)
int x264_quant_4x4_neon( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
#define x264_quant_4x4x4_neon x264_template(quant_4x4x4_neon)
int x264_quant_4x4x4_neon( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
#define x264_quant_8x8_neon x264_template(quant_8x8_neon)
int x264_quant_8x8_neon( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );

/* Dequantization, in place on dct[] */
#define x264_dequant_4x4_dc_neon x264_template(dequant_4x4_dc_neon)
void x264_dequant_4x4_dc_neon( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_4x4_neon x264_template(dequant_4x4_neon)
void x264_dequant_4x4_neon( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_8x8_neon x264_template(dequant_8x8_neon)
void x264_dequant_8x8_neon( dctcoef dct[64], int dequant_mf[6][64], int i_qp );

/* Decimation scores for blocks of 15, 16 and 64 coefficients */
#define x264_decimate_score15_neon x264_template(decimate_score15_neon)
int x264_decimate_score15_neon( dctcoef * );
#define x264_decimate_score16_neon x264_template(decimate_score16_neon)
int x264_decimate_score16_neon( dctcoef * );
#define x264_decimate_score64_neon x264_template(decimate_score64_neon)
int x264_decimate_score64_neon( dctcoef * );

/* coeff_last: the numeric suffix is the number of coefficients in the block */
// BIT_DEPTH = 8
#define x264_coeff_last4_aarch64 x264_template(coeff_last4_aarch64)
int x264_coeff_last4_aarch64( dctcoef * );
#define x264_coeff_last8_aarch64 x264_template(coeff_last8_aarch64)
int x264_coeff_last8_aarch64( dctcoef * );
// BIT_DEPTH = 10
#define x264_coeff_last4_neon x264_template(coeff_last4_neon)
int x264_coeff_last4_neon( dctcoef * );
#define x264_coeff_last8_neon x264_template(coeff_last8_neon)
int x264_coeff_last8_neon( dctcoef * );
#define x264_coeff_last15_neon x264_template(coeff_last15_neon)
int x264_coeff_last15_neon( dctcoef * );
#define x264_coeff_last16_neon x264_template(coeff_last16_neon)
int x264_coeff_last16_neon( dctcoef * );
#define x264_coeff_last64_neon x264_template(coeff_last64_neon)
int x264_coeff_last64_neon( dctcoef * );

/* coeff_level_run: fills an x264_run_level_t from a coefficient block */
// BIT_DEPTH = 8
#define x264_coeff_level_run4_aarch64 x264_template(coeff_level_run4_aarch64)
int x264_coeff_level_run4_aarch64( dctcoef *, x264_run_level_t * );
// BIT_DEPTH = 10
#define x264_coeff_level_run4_neon x264_template(coeff_level_run4_neon)
int x264_coeff_level_run4_neon( dctcoef *, x264_run_level_t * );
#define x264_coeff_level_run8_neon x264_template(coeff_level_run8_neon)
int x264_coeff_level_run8_neon( dctcoef *, x264_run_level_t * );
#define x264_coeff_level_run15_neon x264_template(coeff_level_run15_neon)
int x264_coeff_level_run15_neon( dctcoef *, x264_run_level_t * );
#define x264_coeff_level_run16_neon x264_template(coeff_level_run16_neon)
int x264_coeff_level_run16_neon( dctcoef *, x264_run_level_t * );

/* DCT-domain denoising; parameter meanings are not visible from this header */
#define x264_denoise_dct_neon x264_template(denoise_dct_neon)
void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
#endif

263
common/arm/asm.S Normal file
View File

@@ -0,0 +1,263 @@
/*****************************************************************************
* asm.S: arm utility macros
*****************************************************************************
* Copyright (C) 2008-2025 x264 project
*
* Authors: Mans Rullgard <mans@mansr.com>
* David Conrad <lessen42@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
// Common prologue for the 32-bit ARM assembly sources: assembler mode selection,
// symbol-name construction helpers and per-platform "line enable" macros.
#include "config.h"
.syntax unified
// only ELF targets get the explicit architecture / FPU selection
#ifdef __ELF__
.arch armv7-a
.fpu neon
#endif
// two-level token pasting so that macro arguments are expanded before pasting
#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)
// when PREFIX is defined, symbols get a leading underscore
#ifdef PREFIX
# define BASE _x264_
# define SYM_PREFIX _
#else
# define BASE x264_
# define SYM_PREFIX
#endif
// EXTERN_ASM is the prefix of exported functions: BASE followed by the bit depth
// and an underscore when BIT_DEPTH is defined, plain BASE otherwise
#ifdef BIT_DEPTH
# define EXTERN_ASM JOIN(JOIN(BASE, BIT_DEPTH), _)
#else
# define EXTERN_ASM BASE
#endif
// X(s): bit-depth-specific symbol, X264(s): bit-depth-independent symbol,
// EXT(s): external symbol carrying only the platform prefix
#define X(s) JOIN(EXTERN_ASM, s)
#define X264(s) JOIN(BASE, s)
#define EXT(s) JOIN(SYM_PREFIX, s)
// ELF, MACH, NONMACH and FUNC are placed at the start of a line. Each expands to
// nothing when its condition holds (the line is assembled) and to the comment
// character @ otherwise (the line is ignored).
#ifdef __ELF__
# define ELF
#else
# define ELF @
#endif
#ifdef __MACH__
# define MACH
# define NONMACH @
#else
# define MACH @
# define NONMACH
#endif
#if HAVE_AS_FUNC
# define FUNC
#else
# define FUNC @
#endif
// .data.rel.ro is only used on Linux and OpenBSD (see the const macro)
#if SYS_LINUX || SYS_OPENBSD
#define HAVE_SECTION_DATA_REL_RO 1
#else
#define HAVE_SECTION_DATA_REL_RO 0
#endif
// require8 [val]: on ELF targets, emit EABI build attribute 24 with the given value
// (default 1). Per the ARM EABI this is presumably Tag_ABI_align_needed, i.e. the
// code requires 8-byte stack alignment -- confirm against the EABI addenda.
.macro require8, val=1
ELF .eabi_attribute 24, \val
.endm
// preserve8 [val]: on ELF targets, emit EABI build attribute 25 with the given value
// (default 1). Per the ARM EABI this is presumably Tag_ABI_align_preserved, i.e. the
// code preserves 8-byte stack alignment -- confirm against the EABI addenda.
.macro preserve8, val=1
ELF .eabi_attribute 25, \val
.endm
// function name [, export=1]
// Opens a function in .text with .align 2 and defines its label.
//   export == 1: the label is EXTERN_ASM\name, declared .global and, on ELF, .hidden
//   otherwise  : the label is \name as given, not .global, and .hidden on ELF
// On ELF the symbol is typed %function; FUNC lines emit .func/.endfunc when enabled.
// As a side effect a one-shot endfunc macro is defined. It emits the ELF .size
// directive for the same symbol and then removes itself with .purgem, so every
// function must be closed with endfunc before the next one is opened.
// NOTE(review): endfunc tests `.if \export` while the body tests `.if \export == 1`;
// the two agree for 0 and 1 but would differ for any other value.
.macro function name, export=1
.macro endfunc
.if \export
ELF .size EXTERN_ASM\name, . - EXTERN_ASM\name
.else
ELF .size \name, . - \name
.endif
FUNC .endfunc
.purgem endfunc
.endm
.text
.align 2
.if \export == 1
.global EXTERN_ASM\name
ELF .hidden EXTERN_ASM\name
ELF .type EXTERN_ASM\name, %function
FUNC .func EXTERN_ASM\name
EXTERN_ASM\name:
.else
ELF .hidden \name
ELF .type \name, %function
FUNC .func \name
\name:
.endif
.endm
// const name [, align=2] [, relocate=0]
// Opens a read-only data object labelled \name, aligned with `.align \align`.
// Section choice:
//   relocate != 0 and HAVE_SECTION_DATA_REL_RO : .data.rel.ro
//   otherwise                                  : .const_data on Mach-O, .rodata elsewhere
// As a side effect a one-shot endconst macro is defined. It emits the ELF .size
// directive for \name and then removes itself with .purgem.
.macro const name, align=2, relocate=0
.macro endconst
ELF .size \name, . - \name
.purgem endconst
.endm
.if HAVE_SECTION_DATA_REL_RO && \relocate
.section .data.rel.ro
.else
NONMACH .section .rodata
MACH .const_data
.endif
.align \align
\name:
.endm
// movrel rd, val
// Loads the address of symbol \val into register \rd. Three strategies, chosen at
// preprocessing time:
//   PIC          : load a pc-relative offset from an inline literal word (branching
//                  over it) and add pc. The "+ 8" in the literal presumably accounts
//                  for how far ahead pc reads in ARM state; as the FIXME says, this
//                  is not adjusted for Thumb.
//   HAVE_ARMV6T2 : movw/movt pair with the absolute address
//   otherwise    : literal-pool load via the ldr = pseudo-instruction
// The PIC path uses the local labels 1 and 2.
.macro movrel rd, val
#if defined(PIC)
ldr \rd, 1f
b 2f
1:
@ FIXME: thumb
.word \val - (2f + 8)
2:
add \rd, \rd, pc
#elif HAVE_ARMV6T2
movw \rd, #:lower16:\val
movt \rd, #:upper16:\val
#else
ldr \rd, =\val
#endif
.endm
// movrelx rd, val, got
// Loads the address of symbol \val into \rd by way of an indirection table when
// building position-independent code, and falls back to movrel otherwise.
//   PIC on ELF   : \got receives the address of _GLOBAL_OFFSET_TABLE_ (pc-relative),
//                  \rd receives the \val(GOT) offset, then \rd is loaded from
//                  [\got, \rd]. \got is clobbered.
//   PIC on Apple : a .non_lazy_symbol_pointer entry for \val is emitted and \rd is
//                  loaded from it pc-relatively; the macro then switches back to .text.
//                  \got is unused on this path.
//   otherwise    : plain movrel; \got is unused.
// Uses the local labels 1, 2 and 3. The FIXME notes that the pc offsets are not
// adjusted for Thumb.
.macro movrelx rd, val, got
#if defined(PIC) && defined(__ELF__)
ldr \got, 2f
ldr \rd, 1f
b 3f
1:
@ FIXME: thumb
.word \val(GOT)
2:
.word _GLOBAL_OFFSET_TABLE_ - (3f + 8)
3:
add \got, \got, pc
ldr \rd, [\got, \rd]
#elif defined(PIC) && defined(__APPLE__)
ldr \rd, 1f
b 2f
1:
@ FIXME: thumb
.word 3f - (2f + 8)
2:
ldr \rd, [pc, \rd]
.non_lazy_symbol_pointer
3:
.indirect_symbol \val
.word 0
.text
#else
movrel \rd, \val
#endif
.endm
// movconst rd, val
// Loads the constant \val into \rd.
//   HAVE_ARMV6T2 : movw for the low 16 bits, plus movt only when \val >> 16 is nonzero
//   otherwise    : literal-pool load via the ldr = pseudo-instruction
.macro movconst rd, val
#if HAVE_ARMV6T2
movw \rd, #:lower16:\val
.if \val >> 16
movt \rd, #:upper16:\val
.endif
#else
ldr \rd, =\val
#endif
.endm
// Row strides used by the assembly when addressing pixel blocks. Presumably these
// are the strides of the encoder source (fenc) and reconstruction (fdec) buffers
// and must match the values used by the C code -- confirm against common.h.
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
// HORIZ_ADD dest, a [, b]
// Horizontal sum of unsigned 16-bit lanes. If \b is given it is first added
// lane-wise into \a. \a is then pairwise-widened in place to 32-bit sums, and a
// second pairwise widening writes the 64-bit sums to \dest. \a is clobbered.
.macro HORIZ_ADD dest, a, b
.ifnb \b
vadd.u16 \a, \a, \b
.endif
vpaddl.u16 \a, \a
vpaddl.u32 \dest, \a
.endm
// SUMSUB_AB sum, diff, a, b
// 16-bit butterfly: \sum = \a + \b, \diff = \a - \b.
// \sum is written before \diff is computed, so \sum must not alias \a or \b.
.macro SUMSUB_AB sum, diff, a, b
vadd.s16 \sum, \a, \b
vsub.s16 \diff, \a, \b
.endm
// SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
// Two butterflies: (s1, d1) = (a + b, a - b) followed by (s2, d2) = (c + d, c - d).
.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
SUMSUB_AB \s1, \d1, \a, \b
SUMSUB_AB \s2, \d2, \c, \d
.endm
// ABS2 a b
// Replaces both registers with the absolute values of their signed 16-bit lanes.
.macro ABS2 a b
vabs.s16 \a, \a
vabs.s16 \b, \b
.endm
// One Hadamard stage on signed 16-bit lanes.
// dist = distance in elements (0 for vertical pass, 1/2 for horizontal passes)
// op = sumsub/amax (sum and diff / maximum of absolutes)
// d1/2 = destination registers
// s1/2 = source registers
// The sources are first transposed against each other in place: vtrn.16 when
// dist == 1, vtrn.32 for every other value of dist (including 0). Both sources
// are therefore always clobbered. With op = sumsub, d1 = s1 + s2 and d2 = s1 - s2.
// With any other op, the sources are replaced by their absolute values and only
// d1 (their lane-wise maximum) is written; d2 is left untouched.
.macro HADAMARD dist, op, d1, d2, s1, s2
.if \dist == 1
vtrn.16 \s1, \s2
.else
vtrn.32 \s1, \s2
.endif
.ifc \op, sumsub
SUMSUB_AB \d1, \d2, \s1, \s2
.else
vabs.s16 \s1, \s1
vabs.s16 \s2, \s2
vmax.s16 \d1, \s1, \s2
.endif
.endm
// TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7
// In-place transpose across eight registers in three vtrn stages of decreasing
// element size: 32-bit between registers four apart, 16-bit between registers two
// apart, 8-bit between neighbours. As the name says this is intended as an 8x8
// byte-matrix transpose; presumably the arguments are 64-bit d registers holding
// one row each -- confirm at the call sites.
.macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7
vtrn.32 \r0, \r4
vtrn.32 \r1, \r5
vtrn.32 \r2, \r6
vtrn.32 \r3, \r7
vtrn.16 \r0, \r2
vtrn.16 \r1, \r3
vtrn.16 \r4, \r6
vtrn.16 \r5, \r7
vtrn.8 \r0, \r1
vtrn.8 \r2, \r3
vtrn.8 \r4, \r5
vtrn.8 \r6, \r7
.endm
// TRANSPOSE4x4 r0 r1 r2 r3
// In-place transpose across four registers in two vtrn stages: 16-bit between
// registers two apart, then 8-bit between neighbours. Intended, per the name, for
// 4x4 blocks of bytes; the exact register layout expected is presumably several
// 4x4 blocks side by side -- confirm at the call sites.
.macro TRANSPOSE4x4 r0 r1 r2 r3
vtrn.16 \r0, \r2
vtrn.16 \r1, \r3
vtrn.8 \r0, \r1
vtrn.8 \r2, \r3
.endm
// TRANSPOSE4x4_16 d0 d1 d2 d3
// In-place transpose across four registers in two vtrn stages: 32-bit between
// registers two apart, then 16-bit between neighbours. Intended, per the name, for
// a 4x4 matrix of 16-bit elements with one row per register (presumably d
// registers -- confirm at the call sites).
.macro TRANSPOSE4x4_16 d0 d1 d2 d3
vtrn.32 \d0, \d2
vtrn.32 \d1, \d3
vtrn.16 \d0, \d1
vtrn.16 \d2, \d3
.endm

84
common/arm/bitstream-a.S Normal file
View File

@@ -0,0 +1,84 @@
/*****************************************************************************
* bitstream-a.S: arm bitstream functions
*****************************************************************************
* Copyright (C) 2014-2025 x264 project
*
* Authors: Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
// uint8_t *nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end )
// (prototype as declared in common/arm/bitstream.h)
//
// Copies the bytes in [src, end) to dst and inserts a 0x03 byte in front of
// every byte <= 3 that is preceded by two zero bytes.
//   r0 = dst (post-incremented by every store, left in r0 on return)
//   r1 = src, r2 = end
// Register roles:
//   q0 = the previous 16 bytes (starts as all 0xff so the first bytes never
//        see a zero history); d1[6]/d1[7] are the two most recent bytes
//   q8 = 4 (threshold for "byte <= 3"), r3 = 3 (the escape byte)
//   lr = negative count of bytes still to process
//   r5 = byte history for the scalar loop (last two bytes in its low 16 bits)
// Blocks of 16 bytes that need no escape are copied with one vector store;
// any block that needs one, and the final < 16 byte tail, go through the
// byte-by-byte loop at 101.
function nal_escape_neon
push {r4-r5,lr}
vmov.u8 q0, #0xff
vmov.u8 q8, #4
mov r3, #3
subs lr, r1, r2
// nothing to do when src == end
beq 99f
0:
// lr + 15 < 0 means at least 16 bytes remain: take the vector path
cmn lr, #15
blt 16f
// fewer than 16 bytes left: point r1 at end so [r1, lr] walks the tail
mov r1, r2
b 100f
16:
vld1.8 {q1}, [r1]!
// q2[i] = byte two positions before q1[i], q3[i] = byte one position before
vext.8 q2, q0, q1, #14
vext.8 q3, q0, q1, #15
// q11 = (byte < 4), q9 = (prev2 == 0), q10 = (prev1 == 0)
vcgt.u8 q11, q8, q1
vceq.u8 q9, q2, #0
vceq.u8 q10, q3, #0
vand q9, q9, q11
vand q9, q9, q10
// narrow the 128-bit mask to 64 bits and test it for any set bit
vshrn.u16 d22, q9, #4
vmov ip, lr, d22
orrs ip, ip, lr
// no byte in this block needs escaping: bulk copy at the second 16: label
beq 16f
// otherwise redo these 16 bytes one at a time (r1 already points past them)
mov lr, #-16
100:
// seed the scalar history from the last two bytes of q0
vmov.u8 r5, d1[6]
vmov.u8 r4, d1[7]
orr r5, r4, r5, lsl #8
101:
ldrb r4, [r1, lr]
// ip = (last two bytes << 16) | current byte; this is <= 3 only when both
// history bytes are zero and the current byte is 0..3
orr ip, r4, r5, lsl #16
cmp ip, #3
bhi 102f
// emit the escape byte and shift it into the history
strb r3, [r0], #1
orr r5, r3, r5, lsl #8
102:
// flags from adds survive the strb/orr below and drive the blt
adds lr, lr, #1
strb r4, [r0], #1
orr r5, r4, r5, lsl #8
blt 101b
// flags from this subs drive the blt 0b after the history write-back
subs lr, r1, r2
lsr ip, r5, #8
vmov.u8 d1[6], ip
vmov.u8 d1[7], r5
blt 0b
pop {r4-r5,pc}
16:
// clean block: store it unchanged and make it the new history
subs lr, r1, r2
vst1.8 {q1}, [r0]!
vmov q0, q1
blt 0b
99:
pop {r4-r5,pc}
endfunc

32
common/arm/bitstream.h Normal file
View File

@@ -0,0 +1,32 @@
/*****************************************************************************
* bitstream.h: arm bitstream functions
*****************************************************************************
* Copyright (C) 2017-2025 x264 project
*
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_ARM_BITSTREAM_H
#define X264_ARM_BITSTREAM_H
/* NEON NAL escaping routine, implemented in common/arm/bitstream-a.S.
 * The symbol name is passed through x264_template() (defined elsewhere).
 * dst: output buffer, src: first input byte, end: one past the last input
 * byte. The assembly leaves the advanced dst pointer in r0, so the return
 * value is the position just after the last byte written. */
#define x264_nal_escape_neon x264_template(nal_escape_neon)
uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end );
#endif

108
common/arm/cpu-a.S Normal file
View File

@@ -0,0 +1,108 @@
/*****************************************************************************
* cpu-a.S: arm cpu detection
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
.align 2
// Executes one NEON instruction (vadd.i16 on q0) and returns.
// NOTE(review): presumably called under a signal handler so that an illegal
// instruction trap reveals missing NEON support -- confirm against the caller.
// done in gas because .fpu neon overrides the refusal to assemble
// instructions the selected -march/-mcpu doesn't support
function cpu_neon_test
vadd.i16 q0, q0, q0
bx lr
endfunc
// Enables the ARMv7 performance counters and the cycle counter.
// Not exported (export=0); used by cpu_fast_neon_mrc_test in this file.
// Clobbers r2.
// return: 0 on success
// 1 if counters were already enabled
// 9 if lo-res counters were already enabled
function cpu_enable_armv7_counter, export=0
mrc p15, 0, r2, c9, c12, 0 // read PMNC
// r0 = enable bit; if it was set, r0 = enable bit | bit 3 (the lo-res bit)
ands r0, r2, #1
andne r0, r2, #9
orr r2, r2, #1 // enable counters
bic r2, r2, #8 // full resolution
// PMNC is only rewritten when the counters were not already enabled
// (eq still reflects the ands above; orr/bic do not touch the flags)
mcreq p15, 0, r2, c9, c12, 0 // write PMNC
mov r2, #1 << 31 // enable cycle counter
mcr p15, 0, r2, c9, c12, 1 // write CNTENS
bx lr
endfunc
// Clears the enable bit (bit 0) of PMNC, leaving its other bits unchanged.
// Not exported (export=0). Clobbers r0.
function cpu_disable_armv7_counter, export=0
mrc p15, 0, r0, c9, c12, 0 // read PMNC
bic r0, r0, #1 // disable counters
mcr p15, 0, r0, c9, c12, 0 // write PMNC
bx lr
endfunc
// Reads CP15 c9/c13/0 (the ARMv7 PMU cycle count register) into register r.
.macro READ_TIME r
mrc p15, 0, \r, c9, c13, 0
.endm
// Times NEON -> ARM register transfers with the cycle counter.
// return: 0 if neon -> arm transfers take more than 10 cycles
// nonzero otherwise
// Also returns 0 straight away when the CP15 c9/c14/0 read below yields 0.
// Register roles:
//   r0 = result of cpu_enable_armv7_counter (0, 1 or 9)
//   r5 = inner repeat count: 1 normally, 64 when the counters were already
//        running in lo-res mode (bit 3 of r0 set)
//   r3 = sum of accepted samples, ip = samples still wanted (4)
//   r6 = countdown (4) of iterations in which a sample may be rejected
function cpu_fast_neon_mrc_test
// check for user access to performance counters
mrc p15, 0, r0, c9, c14, 0
cmp r0, #0
bxeq lr
push {r4-r6,lr}
bl cpu_enable_armv7_counter
ands r1, r0, #8
mov r3, #0
mov ip, #4
mov r6, #4
moveq r5, #1
movne r5, #64
average_loop:
mov r4, r5
READ_TIME r1
// timed section: r5 passes of 8 dependent vmov + add pairs
1: subs r4, r4, #1
.rept 8
vmov.u32 lr, d0[0]
add lr, lr, lr
.endr
bgt 1b
READ_TIME r2
// While r6 is still positive after the decrement, a sample above the
// threshold is discarded and the loop retries (cmpgt leaves gt set).
// Once r6 has reached zero the cmpgt is skipped and every sample is taken.
subs r6, r6, #1
sub r2, r2, r1
cmpgt r2, #30 << 3 // assume context switch if it took over 30 cycles
addle r3, r3, r2
subsle ip, ip, #1
bgt average_loop
// disable counters if we enabled them
ands r0, r0, #1
bleq cpu_disable_armv7_counter
// 4 samples x 8 transfers = 32, so r3 >> 5 is the average per transfer
lsr r0, r3, #5
cmp r0, #10
movgt r0, #0
pop {r4-r6,pc}
endfunc

764
common/arm/dct-a.S Normal file
View File

@@ -0,0 +1,764 @@
/****************************************************************************
* dct-a.S: arm transform and zigzag
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Martin Storsjo <martin@martin.st>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
// vtbl byte-index table used by zigzag_scan_4x4_frame_neon.
// Each pair of bytes selects one 16-bit coefficient; each row of 8 bytes is
// the index vector for one vtbl, relative to the first register of that
// vtbl's table list ({d0-d1}, {d1-d3}, {d0-d2}, {d2-d3} respectively).
// Resolved to coefficient numbers of the input block, the output order is
// 0,4,1,2, 5,8,12,9, 6,3,7,10, 13,14,11,15.
const scan4x4_frame, align=4
.byte 0,1, 8,9, 2,3, 4,5
.byte 2,3, 8,9, 16,17, 10,11
.byte 12,13, 6,7, 14,15, 20,21
.byte 10,11, 12,13, 6,7, 14,15
endconst
.text
// sum = a + (b>>shift) sub = (a>>shift) - b
// Signed 16-bit lanes, arithmetic shifts. t0 and t1 are scratch registers.
// Both shifts are taken before sum/sub are written, so sum or sub may be the
// same register as a or b.
.macro SUMSUB_SHR shift sum sub a b t0 t1
vshr.s16 \t0, \b, #\shift
vshr.s16 \t1, \a, #\shift
vadd.s16 \sum, \a, \t0
vsub.s16 \sub, \t1, \b
.endm
// sum = (a>>shift) + b sub = a - (b>>shift)
// Signed 16-bit lanes, arithmetic shifts. t0 and t1 are scratch registers.
// Both shifts are taken before sum/sub are written.
.macro SUMSUB_SHR2 shift sum sub a b t0 t1
vshr.s16 \t0, \a, #\shift
vshr.s16 \t1, \b, #\shift
vadd.s16 \sum, \t0, \b
vsub.s16 \sub, \a, \t1
.endm
// a += 1.5*ma b -= 1.5*mb
// The 1.5*x term is computed as x + (x >> 1) on signed 16-bit lanes.
// t0 and t1 are scratch registers.
.macro SUMSUB_15 a b ma mb t0 t1
vshr.s16 \t0, \ma, #1
vshr.s16 \t1, \mb, #1
vadd.s16 \t0, \t0, \ma
vadd.s16 \t1, \t1, \mb
vadd.s16 \a, \a, \t0
vsub.s16 \b, \b, \t1
.endm
// void dct4x4dc_neon( int16_t d[16] )   (prototype from common/arm/dct.h)
// In-place transform of the 16 coefficients at r0 (128-bit aligned access).
// Two SUMSUB_ABCD passes and one HADAMARD stage form the butterflies
// (SUMSUB_ABCD is defined outside this excerpt). The last stage is done with
// halving instructions so every result is (x + 1) >> 1: vrhadd rounds by
// itself, and the vhsub inputs get the +1 from d31 beforehand.
function dct4x4dc_neon
vld1.64 {d0-d3}, [r0,:128]
SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3
SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7
// d31 = 1 in every lane, the rounding term for the vhsub results
vmov.s16 d31, #1
HADAMARD 1, sumsub, q2, q3, q0, q1
vtrn.32 d4, d5
vadd.s16 d16, d4, d31
vtrn.32 d6, d7
vadd.s16 d17, d6, d31
vrhadd.s16 d0, d4, d5
vhsub.s16 d1, d16, d5
vhsub.s16 d2, d17, d7
vrhadd.s16 d3, d6, d7
vst1.64 {d0-d3}, [r0,:128]
bx lr
endfunc
// void idct4x4dc_neon( int16_t d[16] )   (prototype from common/arm/dct.h)
// In-place transform of the 16 coefficients at r0 (128-bit aligned access).
// Same butterfly structure as dct4x4dc_neon, but the final stage uses plain
// HADAMARD sum/difference steps, so there is no halving or rounding.
function idct4x4dc_neon
vld1.64 {d0-d3}, [r0,:128]
SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3
SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7
HADAMARD 1, sumsub, q2, q3, q0, q1
HADAMARD 2, sumsub, d0, d1, d4, d5
HADAMARD 2, sumsub, d3, d2, d6, d7
vst1.64 {d0-d3}, [r0,:128]
bx lr
endfunc
// One 4-point forward transform pass over 16-bit lanes.
// Outputs: d0-d3. Inputs: d4-d7 (all four are overwritten).
// Assuming SUMSUB_AB sum, diff, a, b gives sum = a + b and diff = a - b
// (it is defined outside this excerpt), with inputs i4..i7:
//   d0 = (i4 + i7) + (i5 + i6)
//   d1 = 2*(i4 - i7) + (i5 - i6)
//   d2 = (i4 + i7) - (i5 + i6)
//   d3 = (i4 - i7) - 2*(i5 - i6)
.macro DCT_1D d0 d1 d2 d3 d4 d5 d6 d7
SUMSUB_AB \d1, \d6, \d5, \d6
SUMSUB_AB \d3, \d7, \d4, \d7
vadd.s16 \d0, \d3, \d1
vadd.s16 \d4, \d7, \d7
vadd.s16 \d5, \d6, \d6
vsub.s16 \d2, \d3, \d1
vadd.s16 \d1, \d4, \d6
vsub.s16 \d3, \d7, \d5
.endm
// void sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 )
// (prototype from common/arm/dct.h)
// r0 = dct (output), r1 = pix1 stepped by FENC_STRIDE, r2 = pix2 stepped by
// FDEC_STRIDE. Loads four 4-byte rows from each source, forms the widened
// difference pix1 - pix2, and applies DCT_1D, a 4x4 transpose, and DCT_1D
// again before storing the 16 coefficients.
function sub4x4_dct_neon
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
// each vld1.32 {dN[]} loads one 4-byte row and replicates it to both lanes;
// the loads are interleaved with the widening subtracts
vld1.32 {d0[]}, [r1,:32], r3
vld1.32 {d1[]}, [r2,:32], ip
vld1.32 {d2[]}, [r1,:32], r3
vsubl.u8 q8, d0, d1
vld1.32 {d3[]}, [r2,:32], ip
vld1.32 {d4[]}, [r1,:32], r3
vsubl.u8 q9, d2, d3
vld1.32 {d5[]}, [r2,:32], ip
vld1.32 {d6[]}, [r1,:32], r3
vsubl.u8 q10, d4, d5
vld1.32 {d7[]}, [r2,:32], ip
vsubl.u8 q11, d6, d7
// only the low half of each difference register is used (rows are 4 wide)
DCT_1D d0, d1, d2, d3, d16, d18, d20, d22
TRANSPOSE4x4_16 d0, d1, d2, d3
DCT_1D d4, d5, d6, d7, d0, d1, d2, d3
vst1.64 {d4-d7}, [r0,:128]
bx lr
endfunc
// Internal helper (export=0): difference + 4x4 DCT for an 8-wide, 4-row
// strip, i.e. two side-by-side 4x4 blocks.
// In:  r0 = output, r1 = pix1, r2 = pix2,
//      r3 = pix1 stride and ip = pix2 stride (both set up by the caller)
// Out: r0 advanced by 64 bytes (32 coefficients), r1 and r2 advanced by
//      four rows. Does not touch lr, so callers may tail-call it.
function sub8x4_dct_neon, export=0
vld1.64 {d0}, [r1,:64], r3
vld1.64 {d1}, [r2,:64], ip
vsubl.u8 q8, d0, d1
vld1.64 {d2}, [r1,:64], r3
vld1.64 {d3}, [r2,:64], ip
vsubl.u8 q9, d2, d3
vld1.64 {d4}, [r1,:64], r3
vld1.64 {d5}, [r2,:64], ip
vsubl.u8 q10, d4, d5
vld1.64 {d6}, [r1,:64], r3
vld1.64 {d7}, [r2,:64], ip
vsubl.u8 q11, d6, d7
// first pass on full q registers, then transpose both 4x4 halves at once
DCT_1D q0, q1, q2, q3, q8, q9, q10, q11
TRANSPOSE4x4_16 q0, q1, q2, q3
// second pass written out by hand on d registers: the low halves
// (d16/d18/...) produce the first block in d0-d3, the high halves
// (d17/d19/...) produce the second block in d4-d7
SUMSUB_AB q8, q12, q0, q3
SUMSUB_AB q9, q10, q1, q2
vadd.i16 q13, q12, q12
vadd.i16 q11, q10, q10
vadd.i16 d0, d16, d18
vadd.i16 d1, d26, d20
vsub.i16 d2, d16, d18
vsub.i16 d3, d24, d22
vst1.64 {d0-d1}, [r0,:128]!
vadd.i16 d4, d17, d19
vadd.i16 d5, d27, d21
vst1.64 {d2-d3}, [r0,:128]!
vsub.i16 d6, d17, d19
vsub.i16 d7, d25, d23
vst1.64 {d4-d5}, [r0,:128]!
vst1.64 {d6-d7}, [r0,:128]!
bx lr
endfunc
// void sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 )
// (prototype from common/arm/dct.h)
// Runs sub8x4_dct_neon twice: rows 0-3, then rows 4-7. The helper advances
// r0/r1/r2 itself. The second call is a tail call (plain branch after
// restoring lr), so the helper returns directly to our caller.
function sub8x8_dct_neon
push {lr}
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
bl sub8x4_dct_neon
pop {lr}
b sub8x4_dct_neon
endfunc
// void sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 )
// (prototype from common/arm/dct.h)
// Eight sub8x4_dct_neon calls, two per 8x8 quadrant. The helper advances
// r1/r2 by four rows per call; the pointer adjustments between pairs of
// calls visit the quadrants in the order top-left, top-right, bottom-left,
// bottom-right. The last call is a tail call.
function sub16x16_dct_neon
push {lr}
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
bl sub8x4_dct_neon
bl sub8x4_dct_neon
// back up 8 rows and move 8 pixels right
sub r1, r1, #8*FENC_STRIDE-8
sub r2, r2, #8*FDEC_STRIDE-8
bl sub8x4_dct_neon
bl sub8x4_dct_neon
// already 8 rows down: move 8 pixels left
sub r1, r1, #8
sub r2, r2, #8
bl sub8x4_dct_neon
bl sub8x4_dct_neon
// back up 8 rows and move 8 pixels right
sub r1, r1, #8*FENC_STRIDE-8
sub r2, r2, #8*FDEC_STRIDE-8
bl sub8x4_dct_neon
pop {lr}
b sub8x4_dct_neon
endfunc
// One 8-point forward transform pass over 16-bit lanes, fully in place.
// Input and output: q8-q15. Scratch: q0-q3.
// The type argument (row/col at the call sites) is not referenced in the
// body; both passes expand to the same instructions.
.macro DCT8_1D type
SUMSUB_AB q2, q1, q11, q12 // s34/d34
SUMSUB_AB q3, q11, q10, q13 // s25/d25
SUMSUB_AB q13, q10, q9, q14 // s16/d16
SUMSUB_AB q14, q8, q8, q15 // s07/d07
SUMSUB_AB q9, q2, q14, q2 // a0/a2
SUMSUB_AB q12, q14, q13, q3 // a1/a3
SUMSUB_AB q3, q13, q8, q1 // a6/a5
// subtract x + (x >> 1) of d16 / d25 from a6 / a5
vshr.s16 q0, q10, #1
vshr.s16 q15, q11, #1
vadd.s16 q0, q0, q10
vadd.s16 q15, q15, q11
vsub.s16 q3, q3, q0
vsub.s16 q13, q13, q15
SUMSUB_AB q0, q15, q10, q11 // a4/a7
// add x + (x >> 1) of d07 / d34 to a4 / a7
vshr.s16 q10, q8, #1
vshr.s16 q11, q1, #1
vadd.s16 q10, q10, q8
vadd.s16 q11, q11, q1
vadd.s16 q10, q0, q10
vadd.s16 q15, q15, q11
SUMSUB_AB q8, q12, q9, q12
SUMSUB_SHR 2, q9, q15, q10, q15, q0, q1
SUMSUB_SHR 1, q10, q14, q2, q14, q0, q1
SUMSUB_SHR2 2, q11, q13, q3, q13, q0, q1
.endm
// void sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 )
// (prototype from common/arm/dct.h)
// r0 = dct (output), r1 = pix1 stepped by FENC_STRIDE, r2 = pix2 stepped by
// FDEC_STRIDE. Loads eight 8-byte rows from each source, forms the widened
// differences in q8-q15, then runs DCT8_1D, an 8x8 transpose of 16-bit
// elements, and DCT8_1D again before storing 64 coefficients.
// On return r0 has advanced by 128 bytes and r1/r2 by eight rows, which
// sub16x16_dct8_neon relies on.
function sub8x8_dct8_neon
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
vld1.64 {d16}, [r1,:64], r3
vld1.64 {d17}, [r2,:64], ip
vsubl.u8 q8, d16, d17
vld1.64 {d18}, [r1,:64], r3
vld1.64 {d19}, [r2,:64], ip
vsubl.u8 q9, d18, d19
vld1.64 {d20}, [r1,:64], r3
vld1.64 {d21}, [r2,:64], ip
vsubl.u8 q10, d20, d21
vld1.64 {d22}, [r1,:64], r3
vld1.64 {d23}, [r2,:64], ip
vsubl.u8 q11, d22, d23
vld1.64 {d24}, [r1,:64], r3
vld1.64 {d25}, [r2,:64], ip
vsubl.u8 q12, d24, d25
vld1.64 {d26}, [r1,:64], r3
vld1.64 {d27}, [r2,:64], ip
vsubl.u8 q13, d26, d27
vld1.64 {d28}, [r1,:64], r3
vld1.64 {d29}, [r2,:64], ip
vsubl.u8 q14, d28, d29
vld1.64 {d30}, [r1,:64], r3
vld1.64 {d31}, [r2,:64], ip
vsubl.u8 q15, d30, d31
DCT8_1D row
// 8x8 transpose of 16-bit elements: vswp exchanges the 64-bit quadrants,
// vtrn.32 and vtrn.16 finish the job inside each quadrant
vswp d17, d24 // 8, 12
vswp d21, d28 // 10,14
vtrn.32 q8, q10
vtrn.32 q12, q14
vswp d19, d26 // 9, 13
vswp d23, d30 // 11,15
vtrn.32 q9, q11
vtrn.32 q13, q15
vtrn.16 q10, q11
vtrn.16 q12, q13
vtrn.16 q8, q9
vtrn.16 q14, q15
DCT8_1D col
vst1.64 {d16-d19}, [r0,:128]!
vst1.64 {d20-d23}, [r0,:128]!
vst1.64 {d24-d27}, [r0,:128]!
vst1.64 {d28-d31}, [r0,:128]!
bx lr
endfunc
// void sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
// (prototype from common/arm/dct.h)
// Four calls to sub8x8_dct8_neon, one per 8x8 quadrant, in the order
// top-left, top-right, bottom-left, bottom-right. The callee advances r0 by
// one block and r1/r2 by eight rows; only the horizontal/vertical
// repositioning is done here. The last call is a tail call.
function sub16x16_dct8_neon
push {lr}
bl X(sub8x8_dct8_neon)
// back up 8 rows and move 8 pixels right
sub r1, r1, #FENC_STRIDE*8 - 8
sub r2, r2, #FDEC_STRIDE*8 - 8
bl X(sub8x8_dct8_neon)
// already 8 rows down: move 8 pixels left
sub r1, r1, #8
sub r2, r2, #8
bl X(sub8x8_dct8_neon)
pop {lr}
// back up 8 rows and move 8 pixels right
sub r1, r1, #FENC_STRIDE*8 - 8
sub r2, r2, #FDEC_STRIDE*8 - 8
b X(sub8x8_dct8_neon)
endfunc
// First part of IDCT (minus final SUMSUB_BA)
// Outputs: d4-d7. Inputs: d0-d3 (not modified by the vshr/vsub/vadd lines).
// Assuming SUMSUB_AB sum, diff, a, b gives sum = a + b and diff = a - b
// (it is defined outside this excerpt):
//   d4 = d0 + d2            d5 = d0 - d2
//   d6 = d1 + (d3 >> 1)     d7 = (d1 >> 1) - d3
// Callers complete the pass with their own SUMSUB_AB on these results.
.macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
SUMSUB_AB \d4, \d5, \d0, \d2
vshr.s16 \d7, \d1, #1
vshr.s16 \d6, \d3, #1
vsub.s16 \d7, \d7, \d3
vadd.s16 \d6, \d6, \d1
.endm
// void add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] )
// (prototype from common/arm/dct.h)
// r0 = p_dst (FDEC_STRIDE between rows), r1 = dct.
// Two IDCT_1D + SUMSUB_AB passes with a transpose in between, a rounding
// shift right by 6, then the residual is added to the four 4-byte
// destination rows and stored back with unsigned saturation.
// The destination loads are interleaved with the arithmetic.
function add4x4_idct_neon
mov r2, #FDEC_STRIDE
vld1.64 {d0-d3}, [r1,:128]
IDCT_1D d4, d5, d6, d7, d0, d1, d2, d3
vld1.32 {d30[0]}, [r0,:32], r2
SUMSUB_AB q0, q1, q2, q3
// note the d3, d2 order: the upper pair is carried in swapped order from
// here on, which the d31 loads and d2 stores below account for
TRANSPOSE4x4_16 d0, d1, d3, d2
IDCT_1D d4, d5, d6, d7, d0, d1, d3, d2
vld1.32 {d30[1]}, [r0,:32], r2
SUMSUB_AB q0, q1, q2, q3
vrshr.s16 q0, q0, #6
// third row goes to lane 1 and fourth row to lane 0 of d31
vld1.32 {d31[1]}, [r0,:32], r2
vrshr.s16 q1, q1, #6
vld1.32 {d31[0]}, [r0,:32], r2
// rewind r0 by the four rows just read
sub r0, r0, r2, lsl #2
vaddw.u8 q0, q0, d30
vaddw.u8 q1, q1, d31
vqmovun.s16 d0, q0
vqmovun.s16 d2, q1
vst1.32 {d0[0]}, [r0,:32], r2
vst1.32 {d0[1]}, [r0,:32], r2
vst1.32 {d2[1]}, [r0,:32], r2
vst1.32 {d2[0]}, [r0,:32], r2
bx lr
endfunc
// Internal helper (export=0): inverse 4x4 transform of two coefficient
// blocks, added to an 8-wide, 4-row strip of the destination.
// In:  r0 = destination, r1 = coefficients,
//      r2 = destination stride (set up by the caller)
// Out: r1 advanced by 64 bytes, r0 advanced by four rows.
// Does not touch ip or lr; the callers keep their return address in ip.
function add8x4_idct_neon, export=0
// first pass: one IDCT_1D per 4x4 block on d registers
vld1.64 {d0-d3}, [r1,:128]!
IDCT_1D d16, d18, d20, d22, d0, d1, d2, d3
vld1.64 {d4-d7}, [r1,:128]!
IDCT_1D d17, d19, d21, d23, d4, d5, d6, d7
SUMSUB_AB q0, q3, q8, q10
SUMSUB_AB q1, q2, q9, q11
TRANSPOSE4x4_16 q0, q1, q2, q3
// second pass: both blocks at once on q registers
IDCT_1D q8, q9, q10, q11, q0, q1, q2, q3
SUMSUB_AB q0, q3, q8, q10
SUMSUB_AB q1, q2, q9, q11
// rounding shift by 6, interleaved with the destination row loads
vrshr.s16 q0, q0, #6
vld1.32 {d28}, [r0,:64], r2
vrshr.s16 q1, q1, #6
vld1.32 {d29}, [r0,:64], r2
vrshr.s16 q2, q2, #6
vld1.32 {d30}, [r0,:64], r2
vrshr.s16 q3, q3, #6
vld1.32 {d31}, [r0,:64], r2
// rewind r0 by the four rows just read
sub r0, r0, r2, lsl #2
vaddw.u8 q0, q0, d28
vaddw.u8 q1, q1, d29
vaddw.u8 q2, q2, d30
vaddw.u8 q3, q3, d31
// narrow with unsigned saturation and store
vqmovun.s16 d0, q0
vqmovun.s16 d1, q1
vst1.32 {d0}, [r0,:64], r2
vqmovun.s16 d2, q2
vst1.32 {d1}, [r0,:64], r2
vqmovun.s16 d3, q3
vst1.32 {d2}, [r0,:64], r2
vst1.32 {d3}, [r0,:64], r2
bx lr
endfunc
// void add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] )
// (prototype from common/arm/dct.h)
// Runs add8x4_idct_neon twice: rows 0-3, then rows 4-7. The return address
// is parked in ip instead of on the stack (the helper leaves ip alone), and
// the second call is a tail call.
function add8x8_idct_neon
mov r2, #FDEC_STRIDE
mov ip, lr
bl add8x4_idct_neon
mov lr, ip
b add8x4_idct_neon
endfunc
// void add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] )
// (prototype from common/arm/dct.h)
// Eight add8x4_idct_neon calls, two per 8x8 quadrant, visiting the quadrants
// in the order top-left, top-right, bottom-left, bottom-right. The helper
// advances r0 by four rows and r1 by two blocks per call. The return address
// is parked in ip; the last call is a tail call.
function add16x16_idct_neon
mov r2, #FDEC_STRIDE
mov ip, lr
bl add8x4_idct_neon
bl add8x4_idct_neon
// back up 8 rows and move 8 pixels right
sub r0, r0, #8*FDEC_STRIDE-8
bl add8x4_idct_neon
bl add8x4_idct_neon
// already 8 rows down: move 8 pixels left
sub r0, r0, #8
bl add8x4_idct_neon
bl add8x4_idct_neon
// back up 8 rows and move 8 pixels right
sub r0, r0, #8*FDEC_STRIDE-8
bl add8x4_idct_neon
mov lr, ip
b add8x4_idct_neon
endfunc
// One 8-point inverse transform pass over 16-bit lanes, in place in q8-q15.
// Scratch: q0-q3.
// type == row: expects q8-q13 already loaded, loads the remaining 32 bytes
//              (d28-d31) from r1 with post-increment part-way through, and
//              performs the vtrn.16 q8, q9 step of the following transpose.
// type == col: performs three vswp exchanges (d21/d28, d19/d26, d23/d30)
//              that complete the transpose started by the caller.
// These extra steps are placed between the arithmetic instructions rather
// than grouped before or after them.
.macro IDCT8_1D type
.ifc \type, col
vswp d21, d28
.endif
SUMSUB_AB q0, q1, q8, q12 // a0/a2
.ifc \type, row
vld1.64 {d28-d31}, [r1,:128]!
.else
vswp d19, d26
.endif
SUMSUB_SHR 1, q2, q3, q10, q14, q8, q12 // a6/a4
.ifc \type, col
vswp d23, d30
.endif
SUMSUB_AB q8, q10, q13, q11
SUMSUB_15 q8, q10, q9, q15, q12, q14 // a7/a1
SUMSUB_AB q14, q15, q15, q9
SUMSUB_15 q15, q14, q13, q11, q12, q9 // a5/a3
SUMSUB_SHR 2, q13, q14, q14, q15, q11, q9 // b3/b5
SUMSUB_SHR2 2, q12, q15, q8, q10, q11, q9 // b1/b7
SUMSUB_AB q10, q2, q0, q2 // b0/b6
SUMSUB_AB q11, q3, q1, q3 // b2/b4
SUMSUB_AB q8, q15, q10, q15
SUMSUB_AB q9, q14, q11, q14
SUMSUB_AB q10, q13, q3, q13
.ifc \type, row
vtrn.16 q8, q9
.endif
SUMSUB_AB q11, q12, q2, q12
.endm
// void add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] )
// (prototype from common/arm/dct.h)
// r0 = p_dst (FDEC_STRIDE between rows), r1 = dct.
// Loads 96 bytes of coefficients here (IDCT8_1D row loads the last 32),
// runs the row pass, transposes, runs the column pass, applies a rounding
// shift right by 6, adds the result to eight 8-byte destination rows and
// stores them with unsigned saturation.
// On return r1 has advanced by 128 bytes and r0 by eight rows, which
// add16x16_idct8_neon relies on. Does not touch ip or lr.
function add8x8_idct8_neon
mov r2, #FDEC_STRIDE
vld1.64 {d16-d19}, [r1,:128]!
vld1.64 {d20-d23}, [r1,:128]!
vld1.64 {d24-d27}, [r1,:128]!
IDCT8_1D row
// transpose; the vtrn.16 q8, q9 step was done inside the row macro and the
// remaining vswp steps are done inside the col macro
vtrn.16 q10, q11
vtrn.16 q12, q13
vtrn.16 q14, q15
vtrn.32 q8, q10
vtrn.32 q9, q11
vtrn.32 q12, q14
vtrn.32 q13, q15
vswp d17, d24
IDCT8_1D col
// destination row loads interleaved with the rounding shifts
vld1.64 {d0}, [r0,:64], r2
vrshr.s16 q8, q8, #6
vld1.64 {d1}, [r0,:64], r2
vrshr.s16 q9, q9, #6
vld1.64 {d2}, [r0,:64], r2
vrshr.s16 q10, q10, #6
vld1.64 {d3}, [r0,:64], r2
vrshr.s16 q11, q11, #6
vld1.64 {d4}, [r0,:64], r2
vrshr.s16 q12, q12, #6
vld1.64 {d5}, [r0,:64], r2
vrshr.s16 q13, q13, #6
vld1.64 {d6}, [r0,:64], r2
vrshr.s16 q14, q14, #6
vld1.64 {d7}, [r0,:64], r2
vrshr.s16 q15, q15, #6
// rewind r0 by the eight rows just read
sub r0, r0, r2, lsl #3
// add, narrow with unsigned saturation, and store (interleaved)
vaddw.u8 q8, q8, d0
vaddw.u8 q9, q9, d1
vaddw.u8 q10, q10, d2
vqmovun.s16 d0, q8
vqmovun.s16 d1, q9
vqmovun.s16 d2, q10
vaddw.u8 q11, q11, d3
vst1.64 {d0}, [r0,:64], r2
vaddw.u8 q12, q12, d4
vst1.64 {d1}, [r0,:64], r2
vaddw.u8 q13, q13, d5
vst1.64 {d2}, [r0,:64], r2
vqmovun.s16 d3, q11
vqmovun.s16 d4, q12
vaddw.u8 q14, q14, d6
vaddw.u8 q15, q15, d7
vst1.64 {d3}, [r0,:64], r2
vqmovun.s16 d5, q13
vst1.64 {d4}, [r0,:64], r2
vqmovun.s16 d6, q14
vqmovun.s16 d7, q15
vst1.64 {d5}, [r0,:64], r2
vst1.64 {d6}, [r0,:64], r2
vst1.64 {d7}, [r0,:64], r2
bx lr
endfunc
// void add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] )
// (prototype from common/arm/dct.h)
// Four calls to add8x8_idct8_neon, one per 8x8 quadrant, in the order
// top-left, top-right, bottom-left, bottom-right. The callee advances r1 by
// one block and r0 by eight rows. The return address is parked in ip (the
// callee leaves ip alone); the last call is a tail call.
function add16x16_idct8_neon
mov ip, lr
bl X(add8x8_idct8_neon)
// back up 8 rows and move 8 pixels right
sub r0, r0, #8*FDEC_STRIDE-8
bl X(add8x8_idct8_neon)
// already 8 rows down: move 8 pixels left
sub r0, r0, #8
bl X(add8x8_idct8_neon)
// back up 8 rows and move 8 pixels right
sub r0, r0, #8*FDEC_STRIDE-8
mov lr, ip
b X(add8x8_idct8_neon)
endfunc
// void add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] )
// (prototype from common/arm/dct.h)
// r0 = p_dst (FDEC_STRIDE between rows), r1 = four DC values.
// Each DC is rounded with (dc + 32) >> 6 and added to one 4x4 quadrant of
// the 8x8 block: DC 0/1 to the left/right halves of rows 0-3, DC 2/3 to the
// left/right halves of rows 4-7.
// The signed add is done entirely in unsigned bytes: +dc and -dc are both
// narrowed with unsigned saturation (so one of the two becomes zero), then
// the pixels get a saturating add of the first and a saturating subtract of
// the second.
function add8x8_idct_dc_neon
mov r2, #FDEC_STRIDE
vld1.64 {d16}, [r1,:64]
vrshr.s16 d16, d16, #6
vld1.64 {d0}, [r0,:64], r2
// q15 = 0, used to negate the DC values below
vmov.i16 q15, #0
vld1.64 {d1}, [r0,:64], r2
vld1.64 {d2}, [r0,:64], r2
vdup.16 d20, d16[0]
vld1.64 {d3}, [r0,:64], r2
vdup.16 d21, d16[1]
vld1.64 {d4}, [r0,:64], r2
vdup.16 d22, d16[2]
vld1.64 {d5}, [r0,:64], r2
vdup.16 d23, d16[3]
vld1.64 {d6}, [r0,:64], r2
vsub.s16 q12, q15, q10
vld1.64 {d7}, [r0,:64], r2
vsub.s16 q13, q15, q11
// rewind r0 by the eight rows just read
sub r0, r0, #8*FDEC_STRIDE
// d20/d22 = clamped +dc per column, d24/d26 = clamped -dc per column
vqmovun.s16 d20, q10
vqmovun.s16 d22, q11
vqmovun.s16 d24, q12
vqmovun.s16 d26, q13
// duplicate each 8-byte pattern so one q register covers two rows
vmov d21, d20
vqadd.u8 q0, q0, q10
vmov d23, d22
vqadd.u8 q1, q1, q10
vmov d25, d24
vqadd.u8 q2, q2, q11
vmov d27, d26
vqadd.u8 q3, q3, q11
vqsub.u8 q0, q0, q12
vqsub.u8 q1, q1, q12
vqsub.u8 q2, q2, q13
vst1.64 {d0}, [r0,:64], r2
vqsub.u8 q3, q3, q13
vst1.64 {d1}, [r0,:64], r2
vst1.64 {d2}, [r0,:64], r2
vst1.64 {d3}, [r0,:64], r2
vst1.64 {d4}, [r0,:64], r2
vst1.64 {d5}, [r0,:64], r2
vst1.64 {d6}, [r0,:64], r2
vst1.64 {d7}, [r0,:64], r2
bx lr
endfunc
// Adds four DC values to a 16-wide, 4-row strip of pixels.
// dc = d register holding four already-rounded 16-bit DC values, one per
//      4-pixel column group.
// Expects: r0 = read pointer, r2 = write pointer (the caller sets both to
// the same address), r3 = stride, q15 = 0. Both pointers advance by four
// rows. Uses the same clamp-then-saturating-add/subtract scheme as
// add8x8_idct_dc_neon: q2 holds the clamped +dc bytes, q3 the clamped -dc
// bytes. Clobbers q2, q3, q8-q13.
.macro ADD16x4_IDCT_DC dc
vld1.64 {d16-d17}, [r0,:128], r3
vld1.64 {d18-d19}, [r0,:128], r3
vdup.16 d4, \dc[0]
vdup.16 d5, \dc[1]
vld1.64 {d20-d21}, [r0,:128], r3
vdup.16 d6, \dc[2]
vdup.16 d7, \dc[3]
vld1.64 {d22-d23}, [r0,:128], r3
vsub.s16 q12, q15, q2
vsub.s16 q13, q15, q3
vqmovun.s16 d4, q2
vqmovun.s16 d5, q3
vqmovun.s16 d6, q12
vqmovun.s16 d7, q13
vqadd.u8 q8, q8, q2
vqadd.u8 q9, q9, q2
vqadd.u8 q10, q10, q2
vqadd.u8 q11, q11, q2
vqsub.u8 q8, q8, q3
vqsub.u8 q9, q9, q3
vqsub.u8 q10, q10, q3
vst1.64 {d16-d17}, [r2,:128], r3
vqsub.u8 q11, q11, q3
vst1.64 {d18-d19}, [r2,:128], r3
vst1.64 {d20-d21}, [r2,:128], r3
vst1.64 {d22-d23}, [r2,:128], r3
.endm
// void add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] )
// (prototype from common/arm/dct.h)
// r0 = p_dst, r1 = sixteen DC values. Rounds every DC with (dc + 32) >> 6,
// then applies ADD16x4_IDCT_DC four times, each time consuming four DC
// values (d0, d1, d2, d3) for the next four pixel rows.
function add16x16_idct_dc_neon
// r2 = separate write pointer for the macro, r3 = stride, q15 = 0
mov r2, r0
mov r3, #FDEC_STRIDE
vmov.i16 q15, #0
vld1.64 {d0-d3}, [r1,:64]
vrshr.s16 q0, #6
vrshr.s16 q1, #6
ADD16x4_IDCT_DC d0
ADD16x4_IDCT_DC d1
ADD16x4_IDCT_DC d2
ADD16x4_IDCT_DC d3
bx lr
endfunc
// void sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
// (prototype from common/arm/dct.h)
// r0 = dct (output, four values), r1 = pix1 stepped by FENC_STRIDE,
// r2 = pix2 stepped by FDEC_STRIDE.
// Accumulates the per-column sums of pix1 - pix2 for rows 0-3 (q0) and rows
// 4-7 (q1), combines the left/right and top/bottom halves with add/subtract
// butterflies, and reduces to four 16-bit results with pairwise adds.
function sub8x8_dct_dc_neon
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
vld1.64 {d16}, [r1,:64], r3
vld1.64 {d17}, [r2,:64], ip
vsubl.u8 q8, d16, d17
vld1.64 {d18}, [r1,:64], r3
vld1.64 {d19}, [r2,:64], ip
vsubl.u8 q9, d18, d19
vld1.64 {d20}, [r1,:64], r3
vld1.64 {d21}, [r2,:64], ip
vsubl.u8 q10, d20, d21
vld1.64 {d22}, [r1,:64], r3
vadd.s16 q0, q8, q9
vld1.64 {d23}, [r2,:64], ip
vsubl.u8 q11, d22, d23
vld1.64 {d24}, [r1,:64], r3
vadd.s16 q0, q0, q10
vld1.64 {d25}, [r2,:64], ip
vsubl.u8 q12, d24, d25
vld1.64 {d26}, [r1,:64], r3
vadd.s16 q0, q0, q11
vld1.64 {d27}, [r2,:64], ip
vsubl.u8 q13, d26, d27
vld1.64 {d28}, [r1,:64], r3
vld1.64 {d29}, [r2,:64], ip
vsubl.u8 q14, d28, d29
vld1.64 {d30}, [r1,:64], r3
vadd.s16 q1, q12, q13
vld1.64 {d31}, [r2,:64], ip
vsubl.u8 q15, d30, d31
vadd.s16 q1, q1, q14
// d0/d1 = left/right column sums of rows 0-3, d2/d3 = same for rows 4-7
vadd.s16 d4, d0, d1
vadd.s16 q1, q1, q15
vsub.s16 d5, d0, d1
vadd.s16 d6, d2, d3
vsub.s16 d7, d2, d3
vadd.s16 q0, q2, q3
vsub.s16 q1, q2, q3
// horizontal reduction down to one value per output
vpadd.s16 d0, d0, d2
vpadd.s16 d1, d1, d3
vpadd.s16 d0, d0, d1
vst1.64 {d0}, [r0,:64]
bx lr
endfunc
// void sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 )
// (prototype from common/arm/dct.h)
// r0 = dct (output, eight values), r1 = pix1 stepped by FENC_STRIDE,
// r2 = pix2 stepped by FDEC_STRIDE.
// Accumulates the per-column sums of pix1 - pix2 for four groups of four
// rows (q0, q1, q2, q3 = rows 0-3, 4-7, 8-11, 12-15), runs the add/subtract
// butterflies annotated below (b*, a* names are from the original
// comments), and reduces to eight 16-bit results with pairwise adds.
// q8-q15 are reused for the second half of the rows once the first half has
// been accumulated.
function sub8x16_dct_dc_neon
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
vld1.64 {d16}, [r1,:64], r3
vld1.64 {d17}, [r2,:64], ip
vsubl.u8 q8, d16, d17
vld1.64 {d18}, [r1,:64], r3
vld1.64 {d19}, [r2,:64], ip
vsubl.u8 q9, d18, d19
vld1.64 {d20}, [r1,:64], r3
vld1.64 {d21}, [r2,:64], ip
vsubl.u8 q10, d20, d21
vld1.64 {d22}, [r1,:64], r3
vadd.s16 q0, q8, q9
vld1.64 {d23}, [r2,:64], ip
vsubl.u8 q11, d22, d23
vld1.64 {d24}, [r1,:64], r3
vadd.s16 q0, q0, q10
vld1.64 {d25}, [r2,:64], ip
vsubl.u8 q12, d24, d25
vld1.64 {d26}, [r1,:64], r3
vadd.s16 q0, q0, q11
vld1.64 {d27}, [r2,:64], ip
vsubl.u8 q13, d26, d27
vld1.64 {d28}, [r1,:64], r3
vld1.64 {d29}, [r2,:64], ip
vsubl.u8 q14, d28, d29
vld1.64 {d30}, [r1,:64], r3
vadd.s16 q1, q12, q13
vld1.64 {d31}, [r2,:64], ip
vsubl.u8 q15, d30, d31
vld1.64 {d16}, [r1,:64], r3
vadd.s16 q1, q1, q14
vld1.64 {d17}, [r2,:64], ip
vadd.s16 q1, q1, q15
vld1.64 {d18}, [r1,:64], r3
vsubl.u8 q8, d16, d17
vld1.64 {d19}, [r2,:64], ip
vsubl.u8 q9, d18, d19
vld1.64 {d20}, [r1,:64], r3
vld1.64 {d21}, [r2,:64], ip
vsubl.u8 q10, d20, d21
vld1.64 {d22}, [r1,:64], r3
vadd.s16 q2, q8, q9
vld1.64 {d23}, [r2,:64], ip
vsubl.u8 q11, d22, d23
vld1.64 {d24}, [r1,:64], r3
vadd.s16 q2, q2, q10
vld1.64 {d25}, [r2,:64], ip
vsubl.u8 q12, d24, d25
vld1.64 {d26}, [r1,:64], r3
vadd.s16 q2, q2, q11
vld1.64 {d27}, [r2,:64], ip
vsubl.u8 q13, d26, d27
vld1.64 {d28}, [r1,:64], r3
vld1.64 {d29}, [r2,:64], ip
vsubl.u8 q14, d28, d29
vld1.64 {d30}, [r1,:64], r3
vadd.s16 q3, q12, q13
vld1.64 {d31}, [r2,:64], ip
vsubl.u8 q15, d30, d31
vadd.s16 q3, q3, q14
vadd.s16 d16, d0, d1 @ b0
vadd.s16 q3, q3, q15
vsub.s16 d17, d0, d1 @ b4
vadd.s16 d18, d2, d3 @ b1
vsub.s16 d19, d2, d3 @ b5
vadd.s16 d20, d4, d5 @ b2
vsub.s16 d21, d4, d5 @ b6
vadd.s16 d22, d6, d7 @ b3
vsub.s16 d23, d6, d7 @ b7
vadd.s16 q0, q8, q9 @ b0 + b1, b4 + b5; a0, a2
vsub.s16 q1, q8, q9 @ b0 - b1, b4 - b5; a4, a6
vadd.s16 q2, q10, q11 @ b2 + b3, b6 + b7; a1, a3
vsub.s16 q3, q10, q11 @ b2 - b3, b6 - b7; a5, a7
vadd.s16 q8, q0, q2 @ a0 + a1, a2 + a3
vsub.s16 q9, q0, q2 @ a0 - a1, a2 - a3
vsub.s16 q10, q1, q3 @ a4 - a5, a6 - a7
vadd.s16 q11, q1, q3 @ a4 + a5, a6 + a7
// horizontal reduction down to one value per output
vpadd.s16 d0, d16, d17
vpadd.s16 d1, d18, d19
vpadd.s16 d2, d20, d21
vpadd.s16 d3, d22, d23
vpadd.s16 d0, d0, d1
vpadd.s16 d1, d2, d3
vst1.64 {q0}, [r0,:64]
bx lr
endfunc
// void zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] )
// (prototype from common/arm/dct.h)
// r0 = level (output), r1 = dct (input). Reorders the 16 coefficients with
// four vtbl byte-table lookups driven by the scan4x4_frame index table.
// Each lookup uses a different window of the input registers d0-d3, and the
// indices in the matching table row are relative to that window's first
// register.
function zigzag_scan_4x4_frame_neon
movrel r2, scan4x4_frame
vld1.64 {d0-d3}, [r1,:128]
vld1.64 {d16-d19}, [r2,:128]
vtbl.8 d4, {d0-d1}, d16
vtbl.8 d5, {d1-d3}, d17
vtbl.8 d6, {d0-d2}, d18
vtbl.8 d7, {d2-d3}, d19
vst1.64 {d4-d7}, [r0,:128]
bx lr
endfunc

70
common/arm/dct.h Normal file
View File

@@ -0,0 +1,70 @@
/*****************************************************************************
* dct.h: arm transform and zigzag
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_ARM_DCT_H
#define X264_ARM_DCT_H
/* Prototypes for the NEON transform routines implemented in
 * common/arm/dct-a.S. Every symbol name is passed through x264_template()
 * (defined elsewhere). In the assembly, pix1 is stepped by FENC_STRIDE and
 * pix2 / p_dst by FDEC_STRIDE. */

/* In-place 4x4 DC transforms; the forward one halves its results with
 * rounding, the inverse one does not. */
#define x264_dct4x4dc_neon x264_template(dct4x4dc_neon)
void x264_dct4x4dc_neon( int16_t d[16] );
#define x264_idct4x4dc_neon x264_template(idct4x4dc_neon)
void x264_idct4x4dc_neon( int16_t d[16] );

/* dct = 4x4 forward transform of (pix1 - pix2), for one, four or sixteen
 * 4x4 blocks. */
#define x264_sub4x4_dct_neon x264_template(sub4x4_dct_neon)
void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x8_dct_neon x264_template(sub8x8_dct_neon)
void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub16x16_dct_neon x264_template(sub16x16_dct_neon)
void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );

/* p_dst += 4x4 inverse transform of dct, stored with unsigned saturation. */
#define x264_add4x4_idct_neon x264_template(add4x4_idct_neon)
void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
#define x264_add8x8_idct_neon x264_template(add8x8_idct_neon)
void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
#define x264_add16x16_idct_neon x264_template(add16x16_idct_neon)
void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );

/* DC-only variants: each DC value is rounded with (dc + 32) >> 6 and added
 * to one 4x4 block of p_dst with saturation. */
#define x264_add8x8_idct_dc_neon x264_template(add8x8_idct_dc_neon)
void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
#define x264_add16x16_idct_dc_neon x264_template(add16x16_idct_dc_neon)
void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );

/* DC-only forward variants: sums of (pix1 - pix2) combined by add/subtract
 * butterflies into four (8x8) or eight (8x16) values. */
#define x264_sub8x8_dct_dc_neon x264_template(sub8x8_dct_dc_neon)
void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x16_dct_dc_neon x264_template(sub8x16_dct_dc_neon)
void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 );

/* 8x8 transform counterparts of the routines above. */
#define x264_sub8x8_dct8_neon x264_template(sub8x8_dct8_neon)
void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub16x16_dct8_neon x264_template(sub16x16_dct8_neon)
void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
#define x264_add8x8_idct8_neon x264_template(add8x8_idct8_neon)
void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
#define x264_add16x16_idct8_neon x264_template(add16x16_idct8_neon)
void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );

/* level = dct reordered through the scan4x4_frame lookup table. */
#define x264_zigzag_scan_4x4_frame_neon x264_template(zigzag_scan_4x4_frame_neon)
void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
#endif

795
common/arm/deblock-a.S Normal file
View File

@@ -0,0 +1,795 @@
/*****************************************************************************
* deblock-a.S: arm deblocking
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: Mans Rullgard <mans@mansr.com>
* Martin Storsjo <martin@martin.st>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
@ Common prologue of the tc0-based luma filters in this file.
@ Loads the pointer passed as the first stack argument and reads one 32-bit
@ word through it (presumably the four int8_t tc0 values -- the C prototype
@ is not visible here; h264_loop_filter_luma treats them as signed bytes).
@ The word is broadcast to both lanes of d24 for the filter macro.
@ The two AND steps fold the four bytes together so that bit 31 of ip is
@ the AND of all four sign bits; when every byte is negative there is
@ nothing to filter and the macro returns from the enclosing function.
@ Must be expanded before anything is pushed, because it reads [sp] directly
@ and returns with a bare bx lr. Clobbers ip and d24.
.macro h264_loop_filter_start
ldr ip, [sp]
ldr ip, [ip]
vdup.32 d24, ip
and ip, ip, ip, lsl #16
ands ip, ip, ip, lsl #8
@ Test the sign bit with MI (N set). A flag-setting AND updates N, Z and C
@ but leaves V as the caller left it, so LT (N != V) would depend on stale
@ state and could return early when filtering is required.
bxmi lr
.endm
@ Saves d8-d15 on the stack at a 16-byte aligned address.
@ ip = (sp & 15) + 32 is subtracted from sp, which both aligns sp and
@ reserves the first 32 bytes; a further 32 bytes are reserved for d8-d11.
@ On exit sp is 16-byte aligned and points at the saved d8-d11.
@ ip must stay intact until align_pop_regs, which uses it to undo the
@ alignment adjustment.
.macro align_push_regs
and ip, sp, #15
add ip, ip, #32
sub sp, sp, ip
vst1.64 {d12-d15}, [sp,:128]
sub sp, sp, #32
vst1.64 {d8-d11}, [sp,:128]
.endm
@ Restores d8-d15 saved by align_push_regs and puts sp back where it was.
@ The first load steps sp past d8-d11 (writeback of 32 bytes); the second
@ post-increments sp by ip, the amount align_push_regs subtracted first.
@ Requires ip to be unchanged since align_push_regs.
.macro align_pop_regs
vld1.64 {d8-d11}, [sp,:128]!
vld1.64 {d12-d15}, [sp,:128], ip
.endm
@ Normal (tc0-based) luma edge filter for 16 pixel positions at once.
@ Inputs (names follow the inline comments):
@   q10 = p2, q9 = p1, q8 = p0, q0 = q0, q1 = q1, q2 = q2
@   r2 = alpha, r3 = beta
@   d24 = the four tc0 bytes as set up by h264_loop_filter_start
@ Outputs as consumed by deblock_v_luma_neon / deblock_h_luma_neon:
@   q4 = p1', q8 = p0', q0 = q0', q5 = q1'
@ Overwrites q2, q4-q7 and q10-q15. q4-q7 are d8-d15, which is why the
@ callers wrap this macro in align_push_regs / align_pop_regs.
.macro h264_loop_filter_luma
@ The vmovl/vsli sequence interleaved below widens each tc0 byte so that it
@ is repeated four times, giving one tc0 byte per pixel position in q12.
vdup.8 q11, r2 @ alpha
vmovl.u8 q12, d24
vabd.u8 q6, q8, q0 @ abs(p0 - q0)
vmovl.u16 q12, d24
vabd.u8 q14, q9, q8 @ abs(p1 - p0)
vsli.16 q12, q12, #8
vabd.u8 q15, q1, q0 @ abs(q1 - q0)
vsli.32 q12, q12, #16
vclt.u8 q6, q6, q11 @ < alpha
vdup.8 q11, r3 @ beta
@ q7 = positions whose tc0 is negative; they are removed from the mask q6
vclt.s8 q7, q12, #0
vclt.u8 q14, q14, q11 @ < beta
vclt.u8 q15, q15, q11 @ < beta
vbic q6, q6, q7
vabd.u8 q4, q10, q8 @ abs(p2 - p0)
vand q6, q6, q14
vabd.u8 q5, q2, q0 @ abs(q2 - q0)
vclt.u8 q4, q4, q11 @ < beta
vand q6, q6, q15
vclt.u8 q5, q5, q11 @ < beta
@ q4 / q5 = masks selecting where p1 / q1 are modified; q12 = tc0 where the
@ edge is filtered at all, else 0
vand q4, q4, q6
vand q5, q5, q6
vand q12, q12, q6
vrhadd.u8 q14, q8, q0
@ q4/q5 are all-ones (-1) where set, so subtracting them adds 1 to the
@ clipping bound in q6 for each of the two conditions that holds
vsub.i8 q6, q12, q4
vqadd.u8 q7, q9, q12
vhadd.u8 q10, q10, q14
vsub.i8 q6, q6, q5
vhadd.u8 q14, q2, q14
vmin.u8 q7, q7, q10
vqsub.u8 q11, q9, q12
vqadd.u8 q2, q1, q12
vmax.u8 q7, q7, q11
vqsub.u8 q11, q1, q12
vmin.u8 q14, q2, q14
vmovl.u8 q2, d0
vmax.u8 q14, q14, q11
vmovl.u8 q10, d1
@ delta = (((q0 - p0) << 2) + p1 - q1 + 4) >> 3, computed in 16 bits
vsubw.u8 q2, q2, d16
vsubw.u8 q10, q10, d17
vshl.i16 q2, q2, #2
vshl.i16 q10, q10, #2
vaddw.u8 q2, q2, d18
vaddw.u8 q10, q10, d19
vsubw.u8 q2, q2, d2
vsubw.u8 q10, q10, d3
vrshrn.i16 d4, q2, #3
vrshrn.i16 d5, q10, #3
@ select the new p1 / q1 where their masks are set, else keep the old ones
vbsl q4, q7, q9
vbsl q5, q14, q1
@ clamp delta to [-bound, +bound] with the bound in q6
vneg.s8 q7, q6
vmovl.u8 q14, d16
vmin.s8 q2, q2, q6
vmovl.u8 q6, d17
vmax.s8 q2, q2, q7
vmovl.u8 q11, d0
vmovl.u8 q12, d1
@ p0' = p0 + delta, q0' = q0 - delta, narrowed with unsigned saturation
vaddw.s8 q14, q14, d4
vaddw.s8 q6, q6, d5
vsubw.s8 q11, q11, d4
vsubw.s8 q12, q12, d5
vqmovun.s16 d16, q14
vqmovun.s16 d17, q6
vqmovun.s16 d0, q11
vqmovun.s16 d1, q12
.endm
@ Luma filter across a horizontal edge (rows above/below the edge), 16
@ pixels wide.
@ r0 = pix (first row below the edge, 128-bit aligned), r1 = stride,
@ r2 = alpha, r3 = beta, first stack argument = pointer read by
@ h264_loop_filter_start.
@ NOTE(review): the C prototype is not in this excerpt; the argument roles
@ above are taken from how the registers are used here.
function deblock_v_luma_neon
h264_loop_filter_start
@ q0, q1, q2 = the three rows starting at pix
vld1.64 {d0, d1}, [r0,:128], r1
vld1.64 {d2, d3}, [r0,:128], r1
vld1.64 {d4, d5}, [r0,:128], r1
@ step back 6 rows in total, to pix - 3*stride
sub r0, r0, r1, lsl #2
sub r0, r0, r1, lsl #1
@ q10, q9, q8 = the three rows above the edge (p2, p1, p0)
vld1.64 {d20,d21}, [r0,:128], r1
vld1.64 {d18,d19}, [r0,:128], r1
vld1.64 {d16,d17}, [r0,:128], r1
align_push_regs
h264_loop_filter_luma
@ r0 is back at pix; write p1', p0', q0', q1' starting two rows above it
sub r0, r0, r1, lsl #1
vst1.64 {d8, d9}, [r0,:128], r1
vst1.64 {d16,d17}, [r0,:128], r1
vst1.64 {d0, d1}, [r0,:128], r1
vst1.64 {d10,d11}, [r0,:128]
align_pop_regs
bx lr
endfunc
@ Luma filter across a vertical edge (columns left/right of the edge), 16
@ rows tall.
@ r0 = pix (first column right of the edge), r1 = stride, r2 = alpha,
@ r3 = beta, first stack argument = pointer read by h264_loop_filter_start.
@ Loads 8 bytes from each of 16 rows starting 4 columns left of the edge,
@ transposes so that every q register holds one column, runs the same
@ filter macro as the vertical version, transposes the four modified
@ columns back and stores 4 bytes per row.
function deblock_h_luma_neon
h264_loop_filter_start
sub r0, r0, #4
@ rows 0-7 go to the low halves, rows 8-15 to the high halves
vld1.64 {d6}, [r0], r1
vld1.64 {d20}, [r0], r1
vld1.64 {d18}, [r0], r1
vld1.64 {d16}, [r0], r1
vld1.64 {d0}, [r0], r1
vld1.64 {d2}, [r0], r1
vld1.64 {d4}, [r0], r1
vld1.64 {d26}, [r0], r1
vld1.64 {d7}, [r0], r1
vld1.64 {d21}, [r0], r1
vld1.64 {d19}, [r0], r1
vld1.64 {d17}, [r0], r1
vld1.64 {d1}, [r0], r1
vld1.64 {d3}, [r0], r1
vld1.64 {d5}, [r0], r1
vld1.64 {d27}, [r0], r1
@ after this, q10/q9/q8 and q0/q1/q2 hold the columns the filter macro
@ expects as p2/p1/p0 and q0/q1/q2; q3 and q13 hold the outermost columns
TRANSPOSE8x8 q3, q10, q9, q8, q0, q1, q2, q13
align_push_regs
h264_loop_filter_luma
@ back to row order for the four modified columns p1', p0', q0', q1'
TRANSPOSE4x4 q4, q8, q0, q5
@ rewind 16 rows; r0 was at pix - 4, so +2 gives pix - 2
sub r0, r0, r1, lsl #4
add r0, r0, #2
vst1.32 {d8[0]}, [r0], r1
vst1.32 {d16[0]}, [r0], r1
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d10[0]}, [r0], r1
vst1.32 {d8[1]}, [r0], r1
vst1.32 {d16[1]}, [r0], r1
vst1.32 {d0[1]}, [r0], r1
vst1.32 {d10[1]}, [r0], r1
vst1.32 {d9[0]}, [r0], r1
vst1.32 {d17[0]}, [r0], r1
vst1.32 {d1[0]}, [r0], r1
vst1.32 {d11[0]}, [r0], r1
vst1.32 {d9[1]}, [r0], r1
vst1.32 {d17[1]}, [r0], r1
vst1.32 {d1[1]}, [r0], r1
vst1.32 {d11[1]}, [r0], r1
align_pop_regs
bx lr
endfunc
@ Intra (strong) luma edge filter for 16 pixel positions at once.
@ Inputs (names follow the inline comments):
@   q11 = p3, q10 = p2, q9 = p1, q8 = p0
@   q0 = q0, q1 = q1, q2 = q2, q3 = q3
@   r2 = alpha, r3 = beta
@ Outputs, updated in place: q10 = p2', q9 = p1', q8 = p0',
@   q0 = q0', q1 = q1', q2 = q2'
@ Requirements on the expanding function:
@   - it must define a local label 9: after the macro; the macro branches
@     there when no position passes the if_1 test
@   - sp must be 16-byte aligned (align_push_regs provides this): 32 bytes
@     are pushed temporarily to hold the if_1/if_2 masks
@   - lr and r2 are overwritten, so lr must have been saved beforehand
@ Overwrites q4-q7 and q12-q15 as scratch.
.macro h264_loop_filter_luma_intra
vdup.8 q14, r2 @ alpha
vabd.u8 q4, q8, q0 @ abs(p0 - q0)
vabd.u8 q5, q9, q8 @ abs(p1 - p0)
vabd.u8 q6, q1, q0 @ abs(q1 - q0)
vdup.8 q15, r3 @ beta
vmov.u8 q13, #2
vclt.u8 q7, q4, q14 @ < alpha
vshr.u8 q14, q14, #2 @ alpha >> 2
vclt.u8 q5, q5, q15 @ < beta
vadd.u8 q14, q14, q13 @ (alpha >> 2) + 2
vand q7, q7, q5
vclt.u8 q6, q6, q15 @ < beta
vclt.u8 q13, q4, q14 @ < (alpha >> 2) + 2 if_2
vand q12, q7, q6 @ if_1
@ narrow the if_1 mask to 64 bits and skip everything when it is all zero
vshrn.u16 d28, q12, #4
vmov r2, lr, d28
orrs r2, r2, lr
beq 9f
@ spill if_1 (q12) and if_2 (q13); they are reloaded once per side below
sub sp, sp, #32
vst1.8 {q12-q13}, [sp,:128]
@ ---- p side ----
vshll.u8 q4, d18, #1 @ 2*p1
vshll.u8 q5, d19, #1
vaddw.u8 q4, q4, d16 @ 2*p1 + p0
vaddw.u8 q5, q5, d17
vaddw.u8 q4, q4, d2 @ 2*p1 + p0 + q1
vaddw.u8 q5, q5, d3
@ q12 = (2*p1 + p0 + q1 + 2) >> 2, the p0 value for the weaker case
vrshrn.u16 d24, q4, #2
vrshrn.u16 d25, q5, #2
vaddl.u8 q6, d20, d16 @ p2 + p0
vaddl.u8 q7, d21, d17
vaddw.u8 q6, q6, d0 @ p2 + p0 + q0
vaddw.u8 q7, q7, d1
vadd.u16 q4, q4, q6 @ p2 + 2*p1 + 2*p0 + q0 + q1
vadd.u16 q5, q5, q7
vaddw.u8 q4, q4, d0 @ p2 + 2*p1 + 2*p0 + 2*q0 + q1
vaddw.u8 q5, q5, d1
vrshrn.u16 d26, q4, #3 @ p0'_2
vrshrn.u16 d27, q5, #3
vaddw.u8 q6, q6, d18 @ p2 + p1 + p0 + q0
vaddw.u8 q7, q7, d19
vrshrn.u16 d28, q6, #2 @ p1'_2
vrshrn.u16 d29, q7, #2
vaddl.u8 q4, d22, d20 @ p3 + p2
vaddl.u8 q5, d23, d21
vshl.u16 q4, q4, #1 @ 2*p3 + 2*p2
vshl.u16 q5, q5, #1
vadd.u16 q4, q4, q6 @ 2*p3 + 3*p2 + p1 + p0 + q0
vadd.u16 q5, q5, q7
vrshrn.u16 d30, q4, #3 @ p2'_2
vrshrn.u16 d31, q5, #3
vdup.8 q4, r3 @ beta
vabd.u8 q5, q10, q8 @ abs(p2 - p0)
vld1.8 {q6-q7}, [sp,:128] @ if_1, if_2
vclt.u8 q5, q5, q4 @ < beta if_3
vand q7, q7, q5 @ if_2 && if_3
vmvn q4, q7
vand q7, q7, q6 @ if_1 && if_2 && if_3
vand q6, q4, q6 @ if_1 && !(if_2 && if_3)
@ copy p0 to q15 so it can be clobbered
vbit q10, q15, q7
vmov q15, q8
vbit q8, q12, q6
@ wait for q9 to clobber
vshll.u8 q4, d2, #1 @ 2*q1
vshll.u8 q5, d3, #1
@ NOTE(review): same operands as the vbit three instructions above, and
@ none of q8/q12/q6 changed in between, so this looks redundant -- confirm
@ before removing
vbit q8, q12, q6
vaddw.u8 q4, q4, d0 @ 2*q1 + q0
vaddw.u8 q5, q5, d1
vbit q8, q13, q7
vaddw.u8 q4, q4, d18 @ 2*q1 + q0 + p1
vaddw.u8 q5, q5, d19
vbit q9, q14, q7
@ ---- q side (mirror of the p side; q15 holds the original p0) ----
vrshrn.u16 d24, q4, #2
vrshrn.u16 d25, q5, #2
vaddl.u8 q6, d4, d0 @ q2 + q0
vaddl.u8 q7, d5, d1
vaddw.u8 q6, q6, d30 @ q2 + q0 + p0
vaddw.u8 q7, q7, d31
vadd.u16 q4, q4, q6 @ q2 + 2*q1 + 2*q0 + p0 + p1
vadd.u16 q5, q5, q7
vaddw.u8 q4, q4, d30 @ q2 + 2*q1 + 2*q0 + 2*p0 + p1
vaddw.u8 q5, q5, d31
vrshrn.u16 d26, q4, #3 @ q0'_2
vrshrn.u16 d27, q5, #3
vaddw.u8 q6, q6, d2 @ q2 + q1 + q0 + p0
vaddw.u8 q7, q7, d3
vrshrn.u16 d28, q6, #2 @ q1'_2
vrshrn.u16 d29, q7, #2
vaddl.u8 q4, d6, d4 @ q3 + q2
vaddl.u8 q5, d7, d5
vshl.u16 q4, q4, #1 @ 2*q3 + 2*q2
vshl.u16 q5, q5, #1
vadd.u16 q4, q4, q6 @ 2*q3 + 3*q2 + q1 + q0 + p0
vadd.u16 q5, q5, q7
vrshrn.u16 d30, q4, #3 @ q2'_2
vrshrn.u16 d31, q5, #3
vdup.8 q4, r3 @ beta
vabd.u8 q5, q2, q0 @ abs(q2 - q0)
@ this reload also pops the 32-byte spill area (writeback)
vld1.8 {q6-q7}, [sp,:128]! @ if_1, if_2
vclt.u8 q5, q5, q4 @ < beta if_4
vand q7, q7, q5 @ if_2 && if_4
vmvn q4, q7
vand q7, q6, q7 @ if_1 && if_2 && if_4
vand q6, q6, q4 @ if_1 && !(if_2 && if_4)
vbit q0, q12, q6
vbit q1, q14, q7
vbit q0, q13, q7
vbit q2, q15, q7
.endm
@ void deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta )
@ Intra luma filter across a horizontal edge, 16 pixels wide.
@ r0 = pix (first row below the edge), r1 = stride, r2 = alpha, r3 = beta.
@ lr is saved because h264_loop_filter_luma_intra uses it as a scratch register.
function deblock_v_luma_intra_neon
push {lr}
@ rows pix + 0..3 * stride -> q0..q3 (q0, q1, q2, q3 samples)
vld1.64 {d0, d1}, [r0,:128], r1
vld1.64 {d2, d3}, [r0,:128], r1
vld1.64 {d4, d5}, [r0,:128], r1
vld1.64 {d6, d7}, [r0,:128], r1
@ r0 advanced by 4 rows above; step back 8 rows to pix - 4 * stride
sub r0, r0, r1, lsl #3
@ rows pix - 4..1 * stride -> q11..q8 (p3, p2, p1, p0); the last load has no
@ writeback, so r0 stays on the p0 row
vld1.64 {d22,d23}, [r0,:128], r1
vld1.64 {d20,d21}, [r0,:128], r1
vld1.64 {d18,d19}, [r0,:128], r1
vld1.64 {d16,d17}, [r0,:128]
@ align_push_regs / align_pop_regs are defined outside this view; presumably
@ they save the callee-saved NEON registers used by the filter and align sp
@ for the :128 stack accesses inside the macro - TODO confirm
align_push_regs
h264_loop_filter_luma_intra
@ back up from the p0 row to the p2 row and write p2, p1, p0, q0, q1, q2
sub r0, r0, r1, lsl #1
vst1.64 {d20,d21}, [r0,:128], r1
vst1.64 {d18,d19}, [r0,:128], r1
vst1.64 {d16,d17}, [r0,:128], r1
vst1.64 {d0, d1}, [r0,:128], r1
vst1.64 {d2, d3}, [r0,:128], r1
vst1.64 {d4, d5}, [r0,:128]
@ early-out target of the filter macro: nothing to filter, skip the stores
9:
align_pop_regs
pop {pc}
endfunc
@ void deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta )
@ Intra luma filter across a vertical edge, 16 rows high.
@ r0 = pix (first column right of the edge), r1 = stride, r2 = alpha, r3 = beta.
@ lr is saved because h264_loop_filter_luma_intra uses it as a scratch register.
function deblock_h_luma_intra_neon
push {lr}
@ start 4 bytes left of the edge so each 8-byte row holds p3..p0 q0..q3
sub r0, r0, #4
@ rows 0-7 go to the low halves, rows 8-15 to the high halves of
@ q11, q10, q9, q8, q0, q1, q2, q3
vld1.64 {d22}, [r0], r1
vld1.64 {d20}, [r0], r1
vld1.64 {d18}, [r0], r1
vld1.64 {d16}, [r0], r1
vld1.64 {d0}, [r0], r1
vld1.64 {d2}, [r0], r1
vld1.64 {d4}, [r0], r1
vld1.64 {d6}, [r0], r1
vld1.64 {d23}, [r0], r1
vld1.64 {d21}, [r0], r1
vld1.64 {d19}, [r0], r1
vld1.64 {d17}, [r0], r1
vld1.64 {d1}, [r0], r1
vld1.64 {d3}, [r0], r1
vld1.64 {d5}, [r0], r1
vld1.64 {d7}, [r0], r1
@ TRANSPOSE8x8 is defined outside this view; presumably it turns the rows
@ into one register per column (q11 = p3 ... q3 = q3), which is the layout
@ the filter macro expects - TODO confirm
TRANSPOSE8x8 q11, q10, q9, q8, q0, q1, q2, q3
align_push_regs
h264_loop_filter_luma_intra
@ back to row layout, rewind the 16 rows consumed by the loads, store all
@ 16 rows of 8 bytes
TRANSPOSE8x8 q11, q10, q9, q8, q0, q1, q2, q3
sub r0, r0, r1, lsl #4
vst1.64 {d22}, [r0], r1
vst1.64 {d20}, [r0], r1
vst1.64 {d18}, [r0], r1
vst1.64 {d16}, [r0], r1
vst1.64 {d0}, [r0], r1
vst1.64 {d2}, [r0], r1
vst1.64 {d4}, [r0], r1
vst1.64 {d6}, [r0], r1
vst1.64 {d23}, [r0], r1
vst1.64 {d21}, [r0], r1
vst1.64 {d19}, [r0], r1
vst1.64 {d17}, [r0], r1
vst1.64 {d1}, [r0], r1
vst1.64 {d3}, [r0], r1
vst1.64 {d5}, [r0], r1
vst1.64 {d7}, [r0], r1
@ early-out target of the filter macro: skips the transpose and the stores
9:
align_pop_regs
pop {pc}
endfunc
@ Normal (tc0 based) chroma deblocking filter core, 16 bytes wide.
@   in:  q9 = p1, q8 = p0, q0 = q0, q1 = q1, r2 = alpha, r3 = beta,
@        d24 = tc values in its low four bytes (deblock_h_chroma_422_neon
@        reloads it with four tc0 bytes; the initial value comes from
@        h264_loop_filter_start, which is outside this view)
@   out: q8 = filtered p0, q0 = filtered q0
@ Clobbers q2, q3 and q10-q15.
@ delta = clip( ((q0 - p0) * 4 + p1 - q1 + 4) >> 3, -tc, tc ), forced to 0
@ where |p0-q0| >= alpha, |p1-p0| >= beta, |q1-q0| >= beta or tc < 0;
@ then p0 += delta and q0 -= delta with unsigned saturation.
.macro h264_loop_filter_chroma
vdup.8 q11, r2 // alpha
@ the two vmovl.u8 / vsli.16 #8 pairs double every byte twice, so each of
@ the four tc bytes ends up replicated over four consecutive bytes of q12
vmovl.u8 q12, d24
vabd.u8 q13, q8, q0 // abs(p0 - q0)
vabd.u8 q14, q9, q8 // abs(p1 - p0)
vsubl.u8 q2, d0, d16
vsubl.u8 q3, d1, d17
vsli.16 q12, q12, #8
vshl.i16 q2, q2, #2
vshl.i16 q3, q3, #2
vabd.u8 q15, q1, q0 // abs(q1 - q0)
vmovl.u8 q12, d24
vaddw.u8 q2, q2, d18
vaddw.u8 q3, q3, d19
vclt.u8 q13, q13, q11 // < alpha
vsubw.u8 q2, q2, d2
vsubw.u8 q3, q3, d3
vsli.16 q12, q12, #8
vdup.8 q11, r3 // beta
@ q10 = mask of negative tc entries, removed from the filter mask by vbic
vclt.s8 q10, q12, #0
@ rounding narrow: q2 = ((q0 - p0) * 4 + p1 - q1 + 4) >> 3 as signed bytes
vrshrn.i16 d4, q2, #3
vrshrn.i16 d5, q3, #3
vclt.u8 q14, q14, q11 // < beta
vbic q13, q13, q10
vclt.u8 q15, q15, q11 // < beta
vand q13, q13, q14
@ q10 is reused as -tc for the lower clamp
vneg.s8 q10, q12
vand q13, q13, q15
vmin.s8 q2, q2, q12
vmovl.u8 q14, d16
vand q2, q2, q13
vmovl.u8 q15, d17
vmax.s8 q2, q2, q10
vmovl.u8 q11, d0
vmovl.u8 q12, d1
@ apply delta in 16-bit precision, then saturate back to unsigned bytes
vaddw.s8 q14, q14, d4
vaddw.s8 q15, q15, d5
vsubw.s8 q11, q11, d4
vsubw.s8 q12, q12, d5
vqmovun.s16 d16, q14
vqmovun.s16 d17, q15
vqmovun.s16 d0, q11
vqmovun.s16 d1, q12
.endm
@ void deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@ Chroma filter across a horizontal edge, 16 bytes wide.
@ r0 = pix (first row below the edge), r1 = stride, r2 = alpha, r3 = beta.
function deblock_v_chroma_neon
@ h264_loop_filter_start is defined outside this view; presumably it loads
@ tc0 into d24 and returns early when nothing has to be filtered - TODO confirm
h264_loop_filter_start
@ rows pix - 2 * stride .. pix + stride -> q9 (p1), q8 (p0), q0 (q0), q1 (q1)
sub r0, r0, r1, lsl #1
vld1.8 {d18,d19}, [r0,:128], r1
vld1.8 {d16,d17}, [r0,:128], r1
vld1.8 {d0, d1}, [r0,:128], r1
vld1.8 {d2, d3}, [r0,:128]
h264_loop_filter_chroma
@ r0 is on the q1 row; step back two rows and write the new p0 and q0 rows
sub r0, r0, r1, lsl #1
vst1.8 {d16,d17}, [r0,:128], r1
vst1.8 {d0, d1}, [r0,:128], r1
bx lr
endfunc
@ void deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@ Chroma filter across a vertical edge, 8 rows high.
@ r0 = pix, r1 = stride, r2 = alpha, r3 = beta.
function deblock_h_chroma_neon
@ h264_loop_filter_start is defined outside this view; presumably it loads
@ tc0 into d24 and returns early when nothing has to be filtered - TODO confirm
h264_loop_filter_start
sub r0, r0, #4
@ Secondary entry point used by deblock_h_chroma_422_neon. Expects
@ r0 = pix - 4, r1 = row step, r2 = alpha, r3 = beta and tc values in d24.
deblock_h_chroma:
@ 8 rows of 8 bytes: rows 0-3 in the low halves, rows 4-7 in the high halves
vld1.8 {d18}, [r0], r1
vld1.8 {d16}, [r0], r1
vld1.8 {d0}, [r0], r1
vld1.8 {d2}, [r0], r1
vld1.8 {d19}, [r0], r1
vld1.8 {d17}, [r0], r1
vld1.8 {d1}, [r0], r1
vld1.8 {d3}, [r0], r1
@ TRANSPOSE4x4_16 is defined outside this view; presumably it transposes in
@ 16-bit units (two-byte samples, likely interleaved U/V) so that
@ q9/q8/q0/q1 become the p1/p0/q0/q1 columns - TODO confirm
TRANSPOSE4x4_16 q9, q8, q0, q1
h264_loop_filter_chroma
@ only p0 and q0 changed; pair them up again in 16-bit units for the stores
vtrn.16 q8, q0
@ rewind the 8 rows, move 2 bytes right (pix - 2) and store 4 bytes per row
sub r0, r0, r1, lsl #3
add r0, r0, #2
vst1.32 {d16[0]}, [r0], r1
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d16[1]}, [r0], r1
vst1.32 {d0[1]}, [r0], r1
vst1.32 {d17[0]}, [r0], r1
vst1.32 {d1[0]}, [r0], r1
vst1.32 {d17[1]}, [r0], r1
vst1.32 {d1[1]}, [r0], r1
bx lr
endfunc
@ void deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@ Vertical-edge chroma filter over 16 rows, done as two passes of the 8-row
@ deblock_h_chroma body with a doubled row step: even rows first, then odd rows.
function deblock_h_chroma_422_neon
@ defined outside this view; presumably loads tc0 into d24 and may return
@ early - TODO confirm
h264_loop_filter_start
push {lr}
sub r0, r0, #4
@ r1 = 2 * stride, so the first pass touches rows 0, 2, ..., 14
add r1, r1, r1
bl deblock_h_chroma
@ The filter macro overwrote q12, so the tc values are fetched again.
@ [sp, #4] is the word above the saved lr, i.e. the first stack argument
@ (the tc0 pointer per the C prototype), assuming h264_loop_filter_start
@ left sp unchanged. Four tc0 bytes are loaded and duplicated into d24.
ldr ip, [sp, #4]
ldr ip, [ip]
vdup.32 d24, ip
@ r0 is pix - 2 plus 8 doubled rows here: rewind those rows, go down one
@ original row (r1 >> 1) and back to the pix - 4 load column
sub r0, r0, r1, lsl #3
add r0, r0, r1, lsr #1
sub r0, r0, #2
@ restore lr and tail-call the second pass (rows 1, 3, ..., 15)
pop {lr}
b deblock_h_chroma
endfunc
@ 8-byte (64-bit register) variant of the tc0 based chroma filter core.
@   in:  d18 = p1, d16 = p0, d0 = q0, d2 = q1, r2 = alpha, r3 = beta,
@        d24 = tc values (set up before the macro, outside this view)
@   out: d16 = filtered p0, d0 = filtered q0
@ Clobbers q2 (d4 holds delta), d20, q11-q15.
@ delta = clip( ((q0 - p0) * 4 + p1 - q1 + 4) >> 3, -tc, tc ), forced to 0
@ where any of the alpha/beta tests fails or tc < 0.
.macro h264_loop_filter_chroma8
vdup.8 d22, r2 @ alpha
@ vmovl.u8 + vsli.16 #8 doubles every byte once: each of the first four
@ tc bytes ends up in two consecutive bytes of d24
vmovl.u8 q12, d24
vabd.u8 d26, d16, d0 @ abs(p0 - q0)
vabd.u8 d28, d18, d16 @ abs(p1 - p0)
vsubl.u8 q2, d0, d16
vsli.16 d24, d24, #8
vshl.i16 q2, q2, #2
vabd.u8 d30, d2, d0 @ abs(q1 - q0)
vaddw.u8 q2, q2, d18
vclt.u8 d26, d26, d22 @ < alpha
vsubw.u8 q2, q2, d2
vdup.8 d22, r3 @ beta
@ d20 = mask of negative tc entries, removed from the filter mask by vbic
vclt.s8 d20, d24, #0
vrshrn.i16 d4, q2, #3
vclt.u8 d28, d28, d22 @ < beta
vbic d26, d26, d20
vclt.u8 d30, d30, d22 @ < beta
vand d26, d26, d28
@ d20 is reused as -tc for the lower clamp
vneg.s8 d20, d24
vand d26, d26, d30
vmin.s8 d4, d4, d24
vmovl.u8 q14, d16
vand d4, d4, d26
vmax.s8 d4, d4, d20
vmovl.u8 q11, d0
@ apply delta in 16-bit precision, then saturate back to unsigned bytes
vaddw.s8 q14, q14, d4
vsubw.s8 q11, q11, d4
vqmovun.s16 d16, q14
vqmovun.s16 d0, q11
.endm
@ void deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@ Vertical-edge chroma filter over 4 rows only, using the 64-bit filter core.
@ r0 = pix, r1 = stride, r2 = alpha, r3 = beta.
function deblock_h_chroma_mbaff_neon
@ defined outside this view; presumably loads tc0 into d24 and may return
@ early - TODO confirm
h264_loop_filter_start
sub r0, r0, #4
@ 4 rows of 8 bytes starting 4 bytes left of the edge
vld1.8 {d18}, [r0], r1
vld1.8 {d16}, [r0], r1
vld1.8 {d0}, [r0], r1
vld1.8 {d2}, [r0], r1
@ external macro; presumably transposes in 16-bit units so d18/d16/d0/d2
@ become the p1/p0/q0/q1 columns - TODO confirm
TRANSPOSE4x4_16 d18, d16, d0, d2
h264_loop_filter_chroma8
@ pair up the changed p0 and q0 columns again in 16-bit units
vtrn.16 d16, d0
@ rewind the 4 rows, move 2 bytes right (pix - 2) and store 4 bytes per row
sub r0, r0, r1, lsl #2
add r0, r0, #2
vst1.32 {d16[0]}, [r0], r1
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d16[1]}, [r0], r1
vst1.32 {d0[1]}, [r0]
bx lr
endfunc
@ Intra chroma deblocking filter core.
@   in:  q9 = p1, q8 = p0, q0 = q0, q1 = q1, r2 = alpha, r3 = beta
@   out: q8 = filtered p0, q0 = filtered q0
@ Where |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta:
@   p0 = (2*p1 + p0 + q1 + 2) >> 2
@   q0 = (2*q1 + q0 + p1 + 2) >> 2
@ width=16 (default) also computes the high halves (d17/d19/d1/d3);
@ any other value, e.g. width=8, only computes the low 8 bytes.
@ Clobbers q1, q2 and q11-q15, plus q3, q10 and q12 for width=16.
.macro h264_loop_filter_chroma_intra, width=16
vdup.8 q11, r2 @ alpha
vabd.u8 q13, q8, q0 @ abs(p0 - q0)
vabd.u8 q14, q9, q8 @ abs(p1 - p0)
vabd.u8 q15, q1, q0 @ abs(q1 - q0)
vclt.u8 q13, q13, q11 @ < alpha
vdup.8 q11, r3 @ beta
vclt.u8 q14, q14, q11 @ < beta
vclt.u8 q15, q15, q11 @ < beta
vand q13, q13, q14
vand q13, q13, q15
@ q13 is now the filter mask; q14/q15 and q2/q3 accumulate the 16-bit sums
vshll.u8 q14, d18, #1
vshll.u8 q2, d2, #1
.ifc \width, 16
vshll.u8 q15, d19, #1
vshll.u8 q3, d3, #1
vaddl.u8 q12, d17, d3
vaddl.u8 q10, d1, d19
.endif
vaddl.u8 q11, d16, d2
vaddl.u8 q1, d18, d0 @ or vaddw q2, to not clobber q1
vadd.u16 q14, q14, q11
vadd.u16 q2, q2, q1
.ifc \width, 16
vadd.u16 q15, q15, q12
vadd.u16 q3, q3, q10
.endif
@ rounding, saturating narrow back to bytes: (sum + 2) >> 2
vqrshrn.u16 d28, q14, #2
vqrshrn.u16 d4, q2, #2
.ifc \width, 16
vqrshrn.u16 d29, q15, #2
vqrshrn.u16 d5, q3, #2
.endif
@ replace p0 / q0 only where the mask is set
vbit q8, q14, q13
vbit q0, q2, q13
.endm
@ void deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta )
@ Intra chroma filter across a horizontal edge, 16 bytes wide.
@ r0 = pix (first row below the edge), r1 = stride, r2 = alpha, r3 = beta.
function deblock_v_chroma_intra_neon
@ rows pix - 2 * stride .. pix + stride -> q9 (p1), q8 (p0), q0 (q0), q1 (q1).
@ vld2.8 splits each row into even bytes (first register) and odd bytes
@ (second register); vst2.8 below interleaves them again.
sub r0, r0, r1, lsl #1
vld2.8 {d18,d19}, [r0,:128], r1
vld2.8 {d16,d17}, [r0,:128], r1
vld2.8 {d0, d1}, [r0,:128], r1
vld2.8 {d2, d3}, [r0,:128]
h264_loop_filter_chroma_intra
@ r0 is on the q1 row; step back two rows and write the new p0 and q0 rows
sub r0, r0, r1, lsl #1
vst2.8 {d16,d17}, [r0,:128], r1
vst2.8 {d0, d1}, [r0,:128], r1
bx lr
endfunc
@ void deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta )
@ Intra chroma filter across a vertical edge, 8 rows high.
@ r0 = pix, r1 = stride, r2 = alpha, r3 = beta.
@ On return r0 = pix - 2 + 8 * stride and r1-r3 are unchanged, which
@ deblock_h_chroma_422_intra_neon relies on when calling this twice.
function deblock_h_chroma_intra_neon
sub r0, r0, #4
@ 8 rows of 8 bytes: rows 0-3 in the low halves, rows 4-7 in the high halves
vld1.8 {d18}, [r0], r1
vld1.8 {d16}, [r0], r1
vld1.8 {d0}, [r0], r1
vld1.8 {d2}, [r0], r1
vld1.8 {d19}, [r0], r1
vld1.8 {d17}, [r0], r1
vld1.8 {d1}, [r0], r1
vld1.8 {d3}, [r0], r1
@ external macro; presumably transposes in 16-bit units so q9/q8/q0/q1
@ become the p1/p0/q0/q1 columns - TODO confirm
TRANSPOSE4x4_16 q9, q8, q0, q1
h264_loop_filter_chroma_intra
@ pair up the changed p0 and q0 columns again in 16-bit units
vtrn.16 q8, q0
@ rewind the 8 rows, move 2 bytes right (pix - 2) and store 4 bytes per row
sub r0, r0, r1, lsl #3
add r0, r0, #2
vst1.32 {d16[0]}, [r0], r1
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d16[1]}, [r0], r1
vst1.32 {d0[1]}, [r0], r1
vst1.32 {d17[0]}, [r0], r1
vst1.32 {d1[0]}, [r0], r1
vst1.32 {d17[1]}, [r0], r1
vst1.32 {d1[1]}, [r0], r1
bx lr
endfunc
@ void deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta )
@ Intra vertical-edge chroma filter over 16 rows: runs the 8-row
@ deblock_h_chroma_intra_neon on the top half and then on the bottom half.
function deblock_h_chroma_422_intra_neon
push {lr}
bl X(deblock_h_chroma_intra_neon)
@ the callee returns with r0 = pix - 2 + 8 * stride; adding 2 yields the
@ pix value for rows 8-15 (r1-r3 are left untouched by the callee)
add r0, r0, #2
@ restore lr and tail-call for the second half
pop {lr}
b X(deblock_h_chroma_intra_neon)
endfunc
@ void deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta )
@ Intra vertical-edge chroma filter over 4 rows only.
@ r0 = pix, r1 = stride, r2 = alpha, r3 = beta.
function deblock_h_chroma_intra_mbaff_neon
sub r0, r0, #4
@ 4 rows of 8 bytes starting 4 bytes left of the edge
vld1.8 {d18}, [r0], r1
vld1.8 {d16}, [r0], r1
vld1.8 {d0}, [r0], r1
vld1.8 {d2}, [r0], r1
@ external macro; presumably transposes in 16-bit units so d18/d16/d0/d2
@ become the p1/p0/q0/q1 columns - TODO confirm
TRANSPOSE4x4_16 d18, d16, d0, d2
@ width=8: only the low 64 bits of each register carry pixel data
h264_loop_filter_chroma_intra width=8
@ pair up the changed p0 and q0 columns again in 16-bit units
vtrn.16 d16, d0
@ rewind the 4 rows, move 2 bytes right (pix - 2) and store 4 bytes per row
sub r0, r0, r1, lsl #2
add r0, r0, #2
vst1.32 {d16[0]}, [r0], r1
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d16[1]}, [r0], r1
vst1.32 {d0[1]}, [r0]
bx lr
endfunc
@ void deblock_strength_neon( uint8_t nnz[], int8_t ref[2][], int16_t mv[2][][2],
@                             uint8_t bs[2][8][4], int mvy_limit, int bframe )
@ r0 = nnz, r1 = ref, r2 = mv, r3 = bs, [sp] = mvy_limit, [sp, #4] = bframe.
@ Computes the boundary strengths for both edge directions:
@   bs = 2 where either neighbouring block has non-zero coefficients,
@   else 1 where the references differ or a motion vector component
@   difference exceeds its limit, else 0.
@ q8 accumulates the mismatch flags stored to bs[0], q9 those stored to bs[1].
function deblock_strength_neon
ldr ip, [sp]
vmov.i8 q8, #0
@ ip = (mvy_limit << 8) - 253 = ((mvy_limit - 1) << 8) + 3, so after the
@ vdup.16 the bytes of q10 alternate 3, mvy_limit - 1. A saturating byte
@ subtract of q10 is then non-zero only for differences above those values
@ (presumably 3 applies to the x and mvy_limit - 1 to the y component - confirm
@ against the mv layout).
lsl ip, ip, #8
@ r3 now points at bs[1] (bs[0] is 8 * 4 = 32 bytes)
add r3, r3, #32
sub ip, ip, #(1<<8)-3
vmov.i8 q9, #0
vdup.16 q10, ip
@ ip = bframe, used as the loop control for the list loop below
ldr ip, [sp, #4]
lists:
@ load bytes ref
vld1.8 {d31}, [r1]!
@ skip the first 16 bytes of this list's mv data
add r2, r2, #16
vld1.8 {q1}, [r1]!
vmov.i8 q0, #0
vld1.8 {q2}, [r1]!
@ shuffle the ref bytes so each block lines up with its neighbour in both
@ directions, then xor: non-zero means the references differ
vext.8 q3, q0, q1, #15
vext.8 q0, q0, q2, #15
vuzp.32 q1, q2
vuzp.32 q3, q0
vext.8 q1, q15, q2, #12
veor q0, q0, q2
veor q1, q1, q2
vorr q8, q8, q0
vorr q9, q9, q1
vld1.16 {q11}, [r2,:128]! @ mv + 0x10
vld1.16 {q3}, [r2,:128]! @ mv + 0x20
vld1.16 {q12}, [r2,:128]! @ mv + 0x30
vld1.16 {q2}, [r2,:128]! @ mv + 0x40
vld1.16 {q13}, [r2,:128]! @ mv + 0x50
@ first direction: vext #12 builds the vector of mvs one entry (4 bytes)
@ before each mv in q12..q15; the absolute differences are narrowed to bytes
vext.8 q3, q3, q12, #12
vext.8 q2, q2, q13, #12
vabd.s16 q0, q12, q3
vld1.16 {q3}, [r2,:128]! @ mv + 0x60
vabd.s16 q1, q13, q2
vld1.16 {q14}, [r2,:128]! @ mv + 0x70
vqmovn.u16 d0, q0
vld1.16 {q2}, [r2,:128]! @ mv + 0x80
vld1.16 {q15}, [r2,:128]! @ mv + 0x90
vqmovn.u16 d1, q1
vext.8 q3, q3, q14, #12
vext.8 q2, q2, q15, #12
vabd.s16 q3, q14, q3
vabd.s16 q2, q15, q2
vqmovn.u16 d2, q3
vqmovn.u16 d3, q2
@ subtract the limits with saturation, then fold each byte pair of one mv
@ into a single byte that is non-zero if either component is over its limit
vqsub.u8 q0, q0, q10
vqsub.u8 q1, q1, q10
vqmovn.u16 d0, q0
vqmovn.u16 d1, q1
vabd.s16 q1, q12, q13
vorr q8, q8, q0
@ second direction: differences between consecutive mv rows q11..q15
vabd.s16 q0, q11, q12
vabd.s16 q2, q13, q14
vabd.s16 q3, q14, q15
vqmovn.u16 d0, q0
vqmovn.u16 d1, q1
vqmovn.u16 d2, q2
vqmovn.u16 d3, q3
vqsub.u8 q0, q0, q10
vqsub.u8 q1, q1, q10
vqmovn.u16 d0, q0
vqmovn.u16 d1, q1
@ ip - 1 is zero only for bframe == 1: in that case run the loop once more;
@ r1 and r2 have advanced past the data consumed above
subs ip, ip, #1
vorr q9, q9, q0
beq lists
@ ip becomes the post-increment for the bs[1] store: steps r3 back to bs[0]
mov ip, #-32
@ load bytes nnz
vld1.8 {d31}, [r0]!
vld1.8 {q1}, [r0]!
vmov.i8 q0, #0
vld1.8 {q2}, [r0]
@ same neighbour shuffle as for ref, but combined with or: non-zero means
@ at least one of the two blocks has coefficients
vext.8 q3, q0, q1, #15
vext.8 q0, q0, q2, #15
vuzp.32 q1, q2
vuzp.32 q3, q0
vext.8 q1, q15, q2, #12
vorr q0, q0, q2
vorr q1, q1, q2
vmov.u8 q10, #1
vmin.u8 q0, q0, q10
vmin.u8 q1, q1, q10
vmin.u8 q8, q8, q10 @ mv ? 1 : 0
vmin.u8 q9, q9, q10
vadd.u8 q0, q0, q0 @ nnz ? 2 : 0
vadd.u8 q1, q1, q1
vmax.u8 q8, q8, q0
vmax.u8 q9, q9, q1
@ q8 is reordered before the store (vzip.16, vtrn.8, vtrn.32); presumably
@ this transposes the 4x4 strengths into the bs[0] edge order - TODO confirm
vzip.16 d16, d17
vst1.8 {q9}, [r3,:128], ip @ bs[1]
vtrn.8 d16, d17
vtrn.32 d16, d17
vst1.8 {q8}, [r3,:128] @ bs[0]
bx lr
endfunc

58
common/arm/deblock.h Normal file
View File

@@ -0,0 +1,58 @@
/*****************************************************************************
* deblock.h: arm deblocking
*****************************************************************************
* Copyright (C) 2017-2025 x264 project
*
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_ARM_DEBLOCK_H
#define X264_ARM_DEBLOCK_H
/* Prototypes of the ARM NEON deblocking routines.
 * Every name is routed through x264_template(), which is defined elsewhere
 * (presumably it adds a per-bit-depth symbol prefix - confirm in the common
 * headers).
 * Naming: _v_ filters across a horizontal edge (rows above/below pix),
 * _h_ across a vertical edge (columns left/right of pix); _intra variants
 * take no tc0 table; _422 variants cover 16 rows and _mbaff variants 4 rows
 * for the horizontal chroma filters. */

/* Normal (tc0 based) luma filters. */
#define x264_deblock_v_luma_neon x264_template(deblock_v_luma_neon)
void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_luma_neon x264_template(deblock_h_luma_neon)
void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
/* Normal (tc0 based) chroma filters. */
#define x264_deblock_v_chroma_neon x264_template(deblock_v_chroma_neon)
void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_neon x264_template(deblock_h_chroma_neon)
void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
/* Boundary-strength computation: fills bs[0] and bs[1] from the
 * non-zero-coefficient flags, reference indices and motion vectors. */
#define x264_deblock_strength_neon x264_template(deblock_strength_neon)
void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
#define x264_deblock_h_chroma_422_neon x264_template(deblock_h_chroma_422_neon)
void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_mbaff_neon x264_template(deblock_h_chroma_mbaff_neon)
void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
/* Intra filters: no tc0 table, only the alpha/beta thresholds. */
#define x264_deblock_h_chroma_intra_mbaff_neon x264_template(deblock_h_chroma_intra_mbaff_neon)
void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_chroma_intra_neon x264_template(deblock_h_chroma_intra_neon)
void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_chroma_422_intra_neon x264_template(deblock_h_chroma_422_intra_neon)
void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_v_chroma_intra_neon x264_template(deblock_v_chroma_intra_neon)
void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_luma_intra_neon x264_template(deblock_h_luma_intra_neon)
void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_v_luma_intra_neon x264_template(deblock_v_luma_intra_neon)
void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#endif

1938
common/arm/mc-a.S Normal file

File diff suppressed because it is too large Load Diff

366
common/arm/mc-c.c Normal file
View File

@@ -0,0 +1,366 @@
/*****************************************************************************
* mc-c.c: arm motion compensation
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "mc.h"
#define x264_prefetch_ref_arm x264_template(prefetch_ref_arm)
void x264_prefetch_ref_arm( uint8_t *, intptr_t, int );
#define x264_prefetch_fenc_arm x264_template(prefetch_fenc_arm)
void x264_prefetch_fenc_arm( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_memcpy_aligned_neon x264_template(memcpy_aligned_neon)
void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
#define x264_memzero_aligned_neon x264_template(memzero_aligned_neon)
void x264_memzero_aligned_neon( void *dst, size_t n );
#define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
void x264_pixel_avg_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
void x264_pixel_avg_8x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
void x264_pixel_avg_4x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
void x264_pixel_avg_4x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
void x264_pixel_avg_4x2_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon)
void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_plane_copy_core_neon x264_template(plane_copy_core_neon)
void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_neon x264_template(plane_copy_deinterleave_neon)
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_rgb_neon x264_template(plane_copy_deinterleave_rgb_neon)
void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc,
pixel *src, intptr_t i_src, int pw, int w, int h );
#define x264_plane_copy_interleave_core_neon x264_template(plane_copy_interleave_core_neon)
void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
#define x264_plane_copy_swap_core_neon x264_template(plane_copy_swap_core_neon)
void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
#define x264_store_interleave_chroma_neon x264_template(store_interleave_chroma_neon)
void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
#define x264_load_deinterleave_chroma_fdec_neon x264_template(load_deinterleave_chroma_fdec_neon)
void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fenc_neon x264_template(load_deinterleave_chroma_fenc_neon)
void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
#define x264_mc_weight_w16_neon x264_template(mc_weight_w16_neon)
#define x264_mc_weight_w16_nodenom_neon x264_template(mc_weight_w16_nodenom_neon)
#define x264_mc_weight_w16_offsetadd_neon x264_template(mc_weight_w16_offsetadd_neon)
#define x264_mc_weight_w16_offsetsub_neon x264_template(mc_weight_w16_offsetsub_neon)
#define x264_mc_weight_w20_neon x264_template(mc_weight_w20_neon)
#define x264_mc_weight_w20_nodenom_neon x264_template(mc_weight_w20_nodenom_neon)
#define x264_mc_weight_w20_offsetadd_neon x264_template(mc_weight_w20_offsetadd_neon)
#define x264_mc_weight_w20_offsetsub_neon x264_template(mc_weight_w20_offsetsub_neon)
#define x264_mc_weight_w4_neon x264_template(mc_weight_w4_neon)
#define x264_mc_weight_w4_nodenom_neon x264_template(mc_weight_w4_nodenom_neon)
#define x264_mc_weight_w4_offsetadd_neon x264_template(mc_weight_w4_offsetadd_neon)
#define x264_mc_weight_w4_offsetsub_neon x264_template(mc_weight_w4_offsetsub_neon)
#define x264_mc_weight_w8_neon x264_template(mc_weight_w8_neon)
#define x264_mc_weight_w8_nodenom_neon x264_template(mc_weight_w8_nodenom_neon)
#define x264_mc_weight_w8_offsetadd_neon x264_template(mc_weight_w8_offsetadd_neon)
#define x264_mc_weight_w8_offsetsub_neon x264_template(mc_weight_w8_offsetsub_neon)
#if !HIGH_BIT_DEPTH
/* MC_WEIGHT(func) declares the four NEON weight routines of one flavour
 * (widths 4, 8, 16 and 20) and builds the six-entry table
 * mc<func>_wtab_neon from them.  The table is indexed by width>>2 (see the
 * weightfn[i_width>>2] calls in mc_luma_neon / get_ref_neon): slots 0 and 1
 * use the w4 routine, slot 2 w8, slots 3 and 4 w16, slot 5 w20.
 * It is instantiated for the plain, _nodenom, _offsetadd and _offsetsub
 * flavours; weight_cache_neon picks one of the resulting tables. */
#define MC_WEIGHT(func)\
void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
\
static weight_fn_t mc##func##_wtab_neon[6] =\
{\
x264_mc_weight_w4##func##_neon,\
x264_mc_weight_w4##func##_neon,\
x264_mc_weight_w8##func##_neon,\
x264_mc_weight_w16##func##_neon,\
x264_mc_weight_w16##func##_neon,\
x264_mc_weight_w20##func##_neon,\
};
MC_WEIGHT()
MC_WEIGHT(_nodenom)
MC_WEIGHT(_offsetadd)
MC_WEIGHT(_offsetsub)
#endif
#define x264_mc_copy_w4_neon x264_template(mc_copy_w4_neon)
void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_copy_w8_neon x264_template(mc_copy_w8_neon)
void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_copy_w16_neon x264_template(mc_copy_w16_neon)
void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_copy_w16_aligned_neon x264_template(mc_copy_w16_aligned_neon)
void x264_mc_copy_w16_aligned_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_chroma_neon x264_template(mc_chroma_neon)
void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
#define x264_frame_init_lowres_core_neon x264_template(frame_init_lowres_core_neon)
void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
#define x264_hpel_filter_v_neon x264_template(hpel_filter_v_neon)
void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int );
#define x264_hpel_filter_c_neon x264_template(hpel_filter_c_neon)
void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
#define x264_hpel_filter_h_neon x264_template(hpel_filter_h_neon)
void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
#define x264_integral_init4h_neon x264_template(integral_init4h_neon)
void x264_integral_init4h_neon( uint16_t *, uint8_t *, intptr_t );
#define x264_integral_init4v_neon x264_template(integral_init4v_neon)
void x264_integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
#define x264_integral_init8h_neon x264_template(integral_init8h_neon)
void x264_integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
#define x264_integral_init8v_neon x264_template(integral_init8v_neon)
void x264_integral_init8v_neon( uint16_t *, intptr_t );
#define x264_mbtree_propagate_cost_neon x264_template(mbtree_propagate_cost_neon)
void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
#define x264_mbtree_fix8_pack_neon x264_template(mbtree_fix8_pack_neon)
void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
#define x264_mbtree_fix8_unpack_neon x264_template(mbtree_fix8_unpack_neon)
void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
#if !HIGH_BIT_DEPTH
/* Pick the NEON weight-function table matching the weight described by w.
 *
 * A scale of exactly 1<<denom means the weight is a pure offset: the
 * offsetadd or offsetsub table is selected by the sign of the offset and
 * its magnitude is stored in cachea[0].  Otherwise the nodenom table is
 * used for a zero denominator and the general table for everything else.
 * h is not used. */
static void weight_cache_neon( x264_t *h, x264_weight_t *w )
{
    const int offset_only = w->i_scale == 1<<w->i_denom;

    if( !offset_only )
    {
        w->weightfn = w->i_denom ? mc_wtab_neon : mc_nodenom_wtab_neon;
        return;
    }

    if( w->i_offset >= 0 )
    {
        w->weightfn = mc_offsetadd_wtab_neon;
        w->cachea[0] = w->i_offset;
    }
    else
    {
        w->weightfn = mc_offsetsub_wtab_neon;
        w->cachea[0] = -w->i_offset;
    }
}
/* Two-source averaging routines indexed by width>>2.
 * Slot 0 is intentionally left NULL. */
static void (* const pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
{
    [1] = x264_pixel_avg2_w4_neon,
    [2] = x264_pixel_avg2_w8_neon,
    [3] = x264_pixel_avg2_w16_neon, /* width 12: w16 is no slower, so no separate function */
    [4] = x264_pixel_avg2_w16_neon,
    [5] = x264_pixel_avg2_w20_neon,
};
/* Block copy routines indexed by width>>2.
 * Slots 0 and 3 have no routine and stay NULL. */
static void (* const mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) =
{
    [1] = x264_mc_copy_w4_neon,
    [2] = x264_mc_copy_w8_neon,
    [4] = x264_mc_copy_w16_neon,
};
/* Luma motion compensation into dst.
 *
 * src[] holds the four planes selected through x264_hpel_ref0/ref1 and
 * (mvx, mvy) is in quarter-pel units.  Quarter-pel positions average two
 * planes into dst and then weight dst in place when a weight function is
 * set; other positions are weighted straight from the source plane, or
 * simply copied when there is no weight function. */
static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride,
                          uint8_t *src[4], intptr_t i_src_stride,
                          int mvx, int mvy,
                          int i_width, int i_height, const x264_weight_t *weight )
{
    const int frac_x   = mvx & 3;
    const int frac_y   = mvy & 3;
    const int qpel_idx = (frac_y << 2) + frac_x;
    const int tab_idx  = i_width >> 2;
    const intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);

    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( frac_y == 3 ) /* kept as a separate if() so the add stays conditional */
        src1 += i_src_stride;

    if( !(qpel_idx & 5) )
    {
        /* No quarter-pel interpolation: weight from the source or copy it. */
        if( weight->weightfn )
            weight->weightfn[tab_idx]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
        else
            mc_copy_wtab_neon[tab_idx]( dst, i_dst_stride, src1, i_src_stride, i_height );
        return;
    }

    /* Quarter-pel: average the two planes, then weight the result in place. */
    uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
    pixel_avg_wtab_neon[tab_idx]( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
    if( weight->weightfn )
        weight->weightfn[tab_idx]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
}
/* Return a pointer to the motion-compensated luma reference block.
 *
 * When neither quarter-pel interpolation nor weighting is needed, nothing
 * is written: the function returns a pointer into the source plane and
 * sets *i_dst_stride to the source stride.  Otherwise the block is built
 * in dst (averaged and/or weighted) with the caller's *i_dst_stride and
 * dst is returned. */
static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride,
                              uint8_t *src[4], intptr_t i_src_stride,
                              int mvx, int mvy,
                              int i_width, int i_height, const x264_weight_t *weight )
{
    const int frac_x   = mvx & 3;
    const int frac_y   = mvy & 3;
    const int qpel_idx = (frac_y << 2) + frac_x;
    const int tab_idx  = i_width >> 2;
    const intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);

    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( frac_y == 3 ) /* kept as a separate if() so the add stays conditional */
        src1 += i_src_stride;

    if( qpel_idx & 5 )
    {
        /* Quarter-pel: average the two planes, then weight dst in place. */
        uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
        pixel_avg_wtab_neon[tab_idx]( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
        if( weight->weightfn )
            weight->weightfn[tab_idx]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
        return dst;
    }

    if( !weight->weightfn )
    {
        /* Nothing to compute: hand back the source plane itself. */
        *i_dst_stride = i_src_stride;
        return src1;
    }

    weight->weightfn[tab_idx]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
    return dst;
}
/* Produce the horizontal (dsth), vertical (dstv) and centre (dstc)
 * half-pel planes from src, one row at a time.
 *
 * src is first moved back to a 16-byte boundary; the three destination
 * pointers are moved back by the same amount and width grows accordingly.
 * For every row the vertical filter writes its intermediate 16-bit output
 * to buf+8, which the centre filter then consumes. */
static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
                              intptr_t stride, int width, int height, int16_t *buf )
{
    const intptr_t misalign = (intptr_t)src & 15;
    int16_t *const vbuf = buf + 8;

    dsth  -= misalign;
    dstv  -= misalign;
    dstc  -= misalign;
    src   -= misalign;
    width += misalign;

    for( ; height != 0; height-- )
    {
        x264_hpel_filter_v_neon( dstv, src, vbuf, stride, width );
        x264_hpel_filter_c_neon( dstc, vbuf, width );
        x264_hpel_filter_h_neon( dsth, src, width );

        src  += stride;
        dstc += stride;
        dstv += stride;
        dsth += stride;
    }
}
/* Generator macros defined outside this file.  Presumably they emit the
 * static wrappers plane_copy_neon, plane_copy_swap_neon,
 * plane_copy_interleave_neon and mbtree_propagate_list_neon that
 * x264_mc_init_arm installs below - TODO confirm against the macro
 * definitions. */
PLANE_COPY(16, neon)
PLANE_COPY_SWAP(16, neon)
PLANE_INTERLEAVE(neon)
PROPAGATE_LIST(neon)
#endif // !HIGH_BIT_DEPTH
/* Install the ARM implementations of the motion-compensation function
 * table pf according to the capability flags in cpu.
 *
 * Nothing is changed without X264_CPU_ARMV6.  With ARMv6 only the prefetch
 * helpers are set (8-bit builds); everything else needs X264_CPU_NEON.
 * Most entries exist only for 8-bit builds (!HIGH_BIT_DEPTH); the aligned
 * memcpy/memzero helpers are installed regardless of bit depth. */
void x264_mc_init_arm( uint32_t cpu, x264_mc_functions_t *pf )
{
    const int have_armv6 = (cpu & X264_CPU_ARMV6) != 0;
    const int have_neon  = (cpu & X264_CPU_NEON) != 0;

    if( !have_armv6 )
        return;

#if !HIGH_BIT_DEPTH
    /* ARMv6: prefetch helpers only. */
    pf->prefetch_ref      = x264_prefetch_ref_arm;
    pf->prefetch_fenc_420 = x264_prefetch_fenc_arm;
    pf->prefetch_fenc_422 = x264_prefetch_fenc_arm; /* FIXME */
#endif // !HIGH_BIT_DEPTH

    if( !have_neon )
        return;

#if !HIGH_BIT_DEPTH
    /* Block copies. */
    pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_neon;
    pf->copy[PIXEL_8x8]      = x264_mc_copy_w8_neon;
    pf->copy[PIXEL_16x16]    = x264_mc_copy_w16_aligned_neon;
    pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;

    /* Whole-plane copies and (de)interleaving. */
    pf->plane_copy                  = plane_copy_neon;
    pf->plane_copy_swap             = plane_copy_swap_neon;
    pf->plane_copy_interleave       = plane_copy_interleave_neon;
    pf->plane_copy_deinterleave     = x264_plane_copy_deinterleave_neon;
    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;

    /* Chroma load/store helpers. */
    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
    pf->store_interleave_chroma       = x264_store_interleave_chroma_neon;

    /* Weighted averaging, one routine per block size. */
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;

    /* Explicit weighted prediction. */
    pf->weight_cache = weight_cache_neon;
    pf->weight       = mc_wtab_neon;
    pf->offsetadd    = mc_offsetadd_wtab_neon;
    pf->offsetsub    = mc_offsetsub_wtab_neon;

    /* Interpolation and reference fetching. */
    pf->mc_luma                = mc_luma_neon;
    pf->mc_chroma              = x264_mc_chroma_neon;
    pf->get_ref                = get_ref_neon;
    pf->hpel_filter            = hpel_filter_neon;
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;

    /* Integral image setup. */
    pf->integral_init4h = x264_integral_init4h_neon;
    pf->integral_init4v = x264_integral_init4v_neon;
    pf->integral_init8h = x264_integral_init8h_neon;
    pf->integral_init8v = x264_integral_init8v_neon;

    /* Macroblock-tree helpers. */
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
    pf->mbtree_propagate_list = mbtree_propagate_list_neon;
    pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_neon;
    pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_neon;
#endif // !HIGH_BIT_DEPTH

    pf->memzero_aligned = x264_memzero_aligned_neon;
    /* The aligned memcpy is left out on Mac OS X: Apple's gcc cannot align
     * stack variables, and ALIGNED_ARRAY can't work on structs. */
#ifndef SYS_MACOSX
    pf->memcpy_aligned  = x264_memcpy_aligned_neon;
#endif
}

32
common/arm/mc.h Normal file
View File

@@ -0,0 +1,32 @@
/*****************************************************************************
* mc.h: arm motion compensation
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_ARM_MC_H
#define X264_ARM_MC_H
/* Entry point of the ARM motion-compensation code: fills pf with the
 * ARMv6/NEON routines that the capability flags in cpu allow.
 * The name is routed through x264_template(), which is defined elsewhere. */
#define x264_mc_init_arm x264_template(mc_init_arm)
void x264_mc_init_arm( uint32_t cpu, x264_mc_functions_t *pf );
#endif

1535
common/arm/pixel-a.S Normal file

File diff suppressed because it is too large Load Diff

160
common/arm/pixel.h Normal file
View File

@@ -0,0 +1,160 @@
/*****************************************************************************
* pixel.h: arm pixel metrics
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_ARM_PIXEL_H
#define X264_ARM_PIXEL_H
/* Symbol aliases: every ARM pixel routine name is passed through
 * x264_template() (defined elsewhere) before it is declared further down, so
 * the declarations and all users agree on the final symbol name.
 * Groups below: avg2/avg, sad (plain, aligned, aligned dual-issue, x3, x4),
 * satd and ssd, each for the supported block sizes. */
#define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon)
#define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
#define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
#define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
#define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
#define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
#define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
#define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
#define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
#define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
#define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
#define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
#define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
#define x264_pixel_sad_16x16_neon x264_template(pixel_sad_16x16_neon)
#define x264_pixel_sad_16x8_neon x264_template(pixel_sad_16x8_neon)
#define x264_pixel_sad_4x4_armv6 x264_template(pixel_sad_4x4_armv6)
#define x264_pixel_sad_4x4_neon x264_template(pixel_sad_4x4_neon)
#define x264_pixel_sad_4x8_armv6 x264_template(pixel_sad_4x8_armv6)
#define x264_pixel_sad_4x8_neon x264_template(pixel_sad_4x8_neon)
#define x264_pixel_sad_8x16_neon x264_template(pixel_sad_8x16_neon)
#define x264_pixel_sad_8x4_neon x264_template(pixel_sad_8x4_neon)
#define x264_pixel_sad_8x8_neon x264_template(pixel_sad_8x8_neon)
#define x264_pixel_sad_aligned_16x16_neon x264_template(pixel_sad_aligned_16x16_neon)
#define x264_pixel_sad_aligned_16x16_neon_dual x264_template(pixel_sad_aligned_16x16_neon_dual)
#define x264_pixel_sad_aligned_16x8_neon x264_template(pixel_sad_aligned_16x8_neon)
#define x264_pixel_sad_aligned_16x8_neon_dual x264_template(pixel_sad_aligned_16x8_neon_dual)
#define x264_pixel_sad_aligned_4x4_neon x264_template(pixel_sad_aligned_4x4_neon)
#define x264_pixel_sad_aligned_4x8_neon x264_template(pixel_sad_aligned_4x8_neon)
#define x264_pixel_sad_aligned_8x16_neon x264_template(pixel_sad_aligned_8x16_neon)
#define x264_pixel_sad_aligned_8x16_neon_dual x264_template(pixel_sad_aligned_8x16_neon_dual)
#define x264_pixel_sad_aligned_8x4_neon x264_template(pixel_sad_aligned_8x4_neon)
#define x264_pixel_sad_aligned_8x4_neon_dual x264_template(pixel_sad_aligned_8x4_neon_dual)
#define x264_pixel_sad_aligned_8x8_neon x264_template(pixel_sad_aligned_8x8_neon)
#define x264_pixel_sad_aligned_8x8_neon_dual x264_template(pixel_sad_aligned_8x8_neon_dual)
#define x264_pixel_sad_x3_16x16_neon x264_template(pixel_sad_x3_16x16_neon)
#define x264_pixel_sad_x3_16x8_neon x264_template(pixel_sad_x3_16x8_neon)
#define x264_pixel_sad_x3_4x4_neon x264_template(pixel_sad_x3_4x4_neon)
#define x264_pixel_sad_x3_4x8_neon x264_template(pixel_sad_x3_4x8_neon)
#define x264_pixel_sad_x3_8x16_neon x264_template(pixel_sad_x3_8x16_neon)
#define x264_pixel_sad_x3_8x4_neon x264_template(pixel_sad_x3_8x4_neon)
#define x264_pixel_sad_x3_8x8_neon x264_template(pixel_sad_x3_8x8_neon)
#define x264_pixel_sad_x4_16x16_neon x264_template(pixel_sad_x4_16x16_neon)
#define x264_pixel_sad_x4_16x8_neon x264_template(pixel_sad_x4_16x8_neon)
#define x264_pixel_sad_x4_4x4_neon x264_template(pixel_sad_x4_4x4_neon)
#define x264_pixel_sad_x4_4x8_neon x264_template(pixel_sad_x4_4x8_neon)
#define x264_pixel_sad_x4_8x16_neon x264_template(pixel_sad_x4_8x16_neon)
#define x264_pixel_sad_x4_8x4_neon x264_template(pixel_sad_x4_8x4_neon)
#define x264_pixel_sad_x4_8x8_neon x264_template(pixel_sad_x4_8x8_neon)
#define x264_pixel_satd_16x16_neon x264_template(pixel_satd_16x16_neon)
#define x264_pixel_satd_16x8_neon x264_template(pixel_satd_16x8_neon)
#define x264_pixel_satd_4x4_neon x264_template(pixel_satd_4x4_neon)
#define x264_pixel_satd_4x8_neon x264_template(pixel_satd_4x8_neon)
#define x264_pixel_satd_8x16_neon x264_template(pixel_satd_8x16_neon)
#define x264_pixel_satd_8x4_neon x264_template(pixel_satd_8x4_neon)
#define x264_pixel_satd_8x8_neon x264_template(pixel_satd_8x8_neon)
#define x264_pixel_ssd_16x16_neon x264_template(pixel_ssd_16x16_neon)
#define x264_pixel_ssd_16x8_neon x264_template(pixel_ssd_16x8_neon)
#define x264_pixel_ssd_4x4_neon x264_template(pixel_ssd_4x4_neon)
#define x264_pixel_ssd_4x8_neon x264_template(pixel_ssd_4x8_neon)
#define x264_pixel_ssd_8x16_neon x264_template(pixel_ssd_8x16_neon)
#define x264_pixel_ssd_8x4_neon x264_template(pixel_ssd_8x4_neon)
#define x264_pixel_ssd_8x8_neon x264_template(pixel_ssd_8x8_neon)
/* Declare the seven block-size variants (16x16, 16x8, 8x16, 8x8, 8x4, 4x8,
 * 4x4) of x264_pixel_<name>_<size>_<suffix>. `ret` is the return type and
 * `args` the parenthesised parameter list shared by all seven prototypes.
 * The last line deliberately carries no line-continuation backslash, so the
 * macro body can never swallow whatever directive follows it. */
#define DECL_PIXELS( ret, name, suffix, args ) \
    ret x264_pixel_##name##_16x16_##suffix args;\
    ret x264_pixel_##name##_16x8_##suffix args;\
    ret x264_pixel_##name##_8x16_##suffix args;\
    ret x264_pixel_##name##_8x8_##suffix args;\
    ret x264_pixel_##name##_8x4_##suffix args;\
    ret x264_pixel_##name##_4x8_##suffix args;\
    ret x264_pixel_##name##_4x4_##suffix args;

/* Two-block comparison metric returning int. The two integer parameters are
 * intptr_t, matching the hand-written prototypes in this header (for example
 * x264_pixel_sad_4x4_armv6); presumably they are the strides of the two
 * pixel pointers (confirm against the pixel function-pointer typedefs). */
#define DECL_X1( name, suffix ) \
    DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) )

/* Multi-candidate variants: <name>_x3 takes four pixel pointers and
 * <name>_x4 takes five, followed by one intptr_t and an int * output. */
#define DECL_X4( name, suffix ) \
    DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
    DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
/* ARMv6 SAD routines exist only for the two 4-pixel-wide sizes, so they are
 * declared by hand rather than through DECL_X1. */
int x264_pixel_sad_4x4_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t );
int x264_pixel_sad_4x8_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t );
/* Each line below expands to one prototype per block size (see DECL_PIXELS);
 * DECL_X4 additionally emits both the _x3 and _x4 families. */
DECL_X1( sad, neon )
DECL_X1( sad_aligned, neon )
DECL_X1( sad_aligned, neon_dual )
DECL_X4( sad, neon )
DECL_X1( satd, neon )
DECL_X1( ssd, neon )
/* Remaining NEON pixel metrics. Each symbol is aliased through
 * x264_template() immediately before its prototype. */

/* SSD over interleaved data; writes two uint64_t results through the last
 * two pointers (presumably one per chroma plane - confirm with the caller). */
#define x264_pixel_ssd_nv12_core_neon x264_template(pixel_ssd_nv12_core_neon)
void x264_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * );
#define x264_pixel_vsad_neon x264_template(pixel_vsad_neon)
int x264_pixel_vsad_neon( uint8_t *, intptr_t, int );
/* sa8d family: same (pointer, intptr_t, pointer, intptr_t) shape as the
 * DECL_X1 metrics; the combined sa8d_satd variant returns a uint64_t. */
#define x264_pixel_sa8d_8x8_neon x264_template(pixel_sa8d_8x8_neon)
int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
#define x264_pixel_sa8d_16x16_neon x264_template(pixel_sa8d_16x16_neon)
int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
#define x264_pixel_sa8d_satd_16x16_neon x264_template(pixel_sa8d_satd_16x16_neon)
uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
/* var family: single block in, uint64_t out. */
#define x264_pixel_var_8x8_neon x264_template(pixel_var_8x8_neon)
uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
#define x264_pixel_var_8x16_neon x264_template(pixel_var_8x16_neon)
uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
#define x264_pixel_var_16x16_neon x264_template(pixel_var_16x16_neon)
uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
/* var2 family: two pixel pointers and no stride argument (presumably fixed
 * strides - confirm in the assembly), plus an int * output. */
#define x264_pixel_var2_8x8_neon x264_template(pixel_var2_8x8_neon)
int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * );
#define x264_pixel_var2_8x16_neon x264_template(pixel_var2_8x16_neon)
int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );
/* hadamard_ac family: single block in, uint64_t out. */
#define x264_pixel_hadamard_ac_8x8_neon x264_template(pixel_hadamard_ac_8x8_neon)
uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t );
#define x264_pixel_hadamard_ac_8x16_neon x264_template(pixel_hadamard_ac_8x16_neon)
uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
#define x264_pixel_hadamard_ac_16x8_neon x264_template(pixel_hadamard_ac_16x8_neon)
uint64_t x264_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t );
#define x264_pixel_hadamard_ac_16x16_neon x264_template(pixel_hadamard_ac_16x16_neon)
uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t );
/* SSIM helpers: the core routine fills sums[2][4]; end4 reduces two such
 * sum arrays to a float for `width` columns. */
#define x264_pixel_ssim_4x4x2_core_neon x264_template(pixel_ssim_4x4x2_core_neon)
void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
const uint8_t *, intptr_t,
int sums[2][4] );
#define x264_pixel_ssim_end4_neon x264_template(pixel_ssim_end4_neon)
float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
#define x264_pixel_asd8_neon x264_template(pixel_asd8_neon)
int x264_pixel_asd8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#endif

808
common/arm/predict-a.S Normal file
View File

@@ -0,0 +1,808 @@
/*****************************************************************************
* predict.S: arm intra prediction
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Mans Rullgard <mans@mansr.com>
* Martin Storsjo <martin@martin.st>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
// Eight 16-bit weights 1..8. The plane (_p_) predictors below load this table
// and use it as per-lane multipliers for the gradient sums.
const p16weight, align=4
.short 1,2,3,4,5,6,7,8
endconst
.text
// ldcol.8: gather single bytes from a strided column into the lanes of the
// D register \rd.
//   \rs  source pointer, post-incremented by \rt after every load
//   \rt  register holding the stride
//   \n   8 (default) loads all eight lanes; any other value loads only four
//   \hi  when \n != 8: 0 fills lanes 0-3, 1 fills lanes 4-7
.macro ldcol.8 rd, rs, rt, n=8, hi=0
.if \n == 8 || \hi == 0
vld1.8 {\rd[0]}, [\rs], \rt
vld1.8 {\rd[1]}, [\rs], \rt
vld1.8 {\rd[2]}, [\rs], \rt
vld1.8 {\rd[3]}, [\rs], \rt
.endif
.if \n == 8 || \hi == 1
vld1.8 {\rd[4]}, [\rs], \rt
vld1.8 {\rd[5]}, [\rs], \rt
vld1.8 {\rd[6]}, [\rs], \rt
vld1.8 {\rd[7]}, [\rs], \rt
.endif
.endm
// ldcol.16: gather a 16-byte strided column into two D registers.
//   \rd1 receives the bytes at \rs + 0..7 * \rt
//   \rd2 receives the bytes at \rs + 8..15 * \rt, read through \ru
//   \ru  scratch pointer, set to \rs + 8 * \rt and then advanced
// The two streams are interleaved; both \rs and \ru end up advanced by
// 8 * \rt.
.macro ldcol.16 rd1, rd2, rs, rt, ru
add \ru, \rs, \rt, lsl #3
vld1.8 {\rd1[0]}, [\rs], \rt
vld1.8 {\rd2[0]}, [\ru], \rt
vld1.8 {\rd1[1]}, [\rs], \rt
vld1.8 {\rd2[1]}, [\ru], \rt
vld1.8 {\rd1[2]}, [\rs], \rt
vld1.8 {\rd2[2]}, [\ru], \rt
vld1.8 {\rd1[3]}, [\rs], \rt
vld1.8 {\rd2[3]}, [\ru], \rt
vld1.8 {\rd1[4]}, [\rs], \rt
vld1.8 {\rd2[4]}, [\ru], \rt
vld1.8 {\rd1[5]}, [\rs], \rt
vld1.8 {\rd2[5]}, [\ru], \rt
vld1.8 {\rd1[6]}, [\rs], \rt
vld1.8 {\rd2[6]}, [\ru], \rt
vld1.8 {\rd1[7]}, [\rs], \rt
vld1.8 {\rd2[7]}, [\ru], \rt
.endm
// add16x8: horizontal sum of the sixteen bytes held in \rl and \rh.
// The bytes are widened and added pairwise into \dq, the two halves
// \dl and \dh are added, and two pairwise adds leave the total in every
// 16-bit lane of \dl. The callers in this file pass \dl/\dh as the low/high
// halves of \dq.
.macro add16x8 dq, dl, dh, rl, rh
vaddl.u8 \dq, \rl, \rh
vadd.u16 \dl, \dl, \dh
vpadd.u16 \dl, \dl, \dl
vpadd.u16 \dl, \dl, \dl
.endm
// 4x4 horizontal prediction. r0 = src (see the C prototype in predict.h).
// For each of the four rows the byte just left of the row (offset -1) is
// replicated into all four bytes of a word and stored over the row.
// because gcc doesn't believe in using the free shift in add
function predict_4x4_h_armv6
ldrb r1, [r0, #0*FDEC_STRIDE-1]
ldrb r2, [r0, #1*FDEC_STRIDE-1]
ldrb r3, [r0, #2*FDEC_STRIDE-1]
ldrb ip, [r0, #3*FDEC_STRIDE-1]
// x + (x << 8) duplicates the byte into the low halfword ...
add r1, r1, r1, lsl #8
add r2, r2, r2, lsl #8
add r3, r3, r3, lsl #8
add ip, ip, ip, lsl #8
// ... and adding the halfword shifted by 16 fills the whole word.
add r1, r1, r1, lsl #16
str r1, [r0, #0*FDEC_STRIDE]
add r2, r2, r2, lsl #16
str r2, [r0, #1*FDEC_STRIDE]
add r3, r3, r3, lsl #16
str r3, [r0, #2*FDEC_STRIDE]
add ip, ip, ip, lsl #16
str ip, [r0, #3*FDEC_STRIDE]
bx lr
endfunc
// 4x4 vertical prediction. r0 = src. The 32-bit word directly above the
// block (one FDEC_STRIDE up) is copied into each of the four rows.
function predict_4x4_v_armv6
ldr r1, [r0, #0 - 1 * FDEC_STRIDE]
str r1, [r0, #0 + 0 * FDEC_STRIDE]
str r1, [r0, #0 + 1 * FDEC_STRIDE]
str r1, [r0, #0 + 2 * FDEC_STRIDE]
str r1, [r0, #0 + 3 * FDEC_STRIDE]
bx lr
endfunc
// 4x4 DC prediction. r0 = src. Computes (sum of the 4 bytes above + sum of
// the 4 bytes to the left + 4) >> 3 and fills the block with that value.
function predict_4x4_dc_armv6
mov ip, #0
ldr r1, [r0, #-FDEC_STRIDE]
ldrb r2, [r0, #0*FDEC_STRIDE-1]
ldrb r3, [r0, #1*FDEC_STRIDE-1]
// usad8 against zero adds up the four bytes of the top row word.
usad8 r1, r1, ip
// +4 is the rounding term for the final shift by 3.
add r2, r2, #4
ldrb ip, [r0, #2*FDEC_STRIDE-1]
add r2, r2, r3
ldrb r3, [r0, #3*FDEC_STRIDE-1]
add r2, r2, ip
add r2, r2, r3
add r1, r1, r2
lsr r1, r1, #3
// Replicate the DC byte into all four bytes of the word.
add r1, r1, r1, lsl #8
add r1, r1, r1, lsl #16
str r1, [r0, #0*FDEC_STRIDE]
str r1, [r0, #1*FDEC_STRIDE]
str r1, [r0, #2*FDEC_STRIDE]
str r1, [r0, #3*FDEC_STRIDE]
bx lr
endfunc
// 4x4 DC prediction from the top row only. r0 = src.
// Fills the block with (sum of the 4 bytes above + 2) >> 2.
function predict_4x4_dc_top_neon
mov r12, #FDEC_STRIDE
sub r1, r0, #FDEC_STRIDE
// Load the 4 top bytes into both 32-bit lanes of d1.
vld1.32 d1[], [r1,:32]
// Two pairwise adds leave the 4-byte sum in every 16-bit lane.
vpaddl.u8 d1, d1
vpadd.u16 d1, d1, d1
// Rounding shift: (sum + 2) >> 2.
vrshr.u16 d1, d1, #2
vdup.8 d1, d1[0]
vst1.32 d1[0], [r0,:32], r12
vst1.32 d1[0], [r0,:32], r12
vst1.32 d1[0], [r0,:32], r12
vst1.32 d1[0], [r0,:32], r12
bx lr
endfunc
// Packed-byte 3-tap low-pass on two independent register triples, working on
// the four bytes of each general-purpose register at once.
// return a1 = (a1+2*b1+c1+2)>>2 a2 = (a2+2*b2+c2+2)>>2
// \pb_1 must hold 1 in every byte (the caller passes 0x01010101).
// \c1 and \c2 are clobbered.
.macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
// t = (a + c) >> 1 per byte
uhadd8 \a1, \a1, \c1
uhadd8 \a2, \a2, \c2
// c = (t + b) >> 1 per byte (truncating)
uhadd8 \c1, \a1, \b1
uhadd8 \c2, \a2, \b2
// (t ^ b) & 1 is the bit lost by the truncating halving add above;
// adding it back turns that step into a rounding average.
eor \a1, \a1, \b1
eor \a2, \a2, \b2
and \a1, \a1, \pb_1
and \a2, \a2, \pb_1
uadd8 \a1, \a1, \c1
uadd8 \a2, \a2, \c2
.endm
// 4x4 diagonal-down-right prediction. r0 = src.
// Builds words that each hold four consecutive edge bytes (the left column,
// the top-left corner and the top row), each shifted by one byte relative to
// the previous one, low-pass filters them with PRED4x4_LOWPASS and writes
// the rows.
function predict_4x4_ddr_armv6
// r1 = the four bytes above the block, r2 = top-left corner byte
ldr r1, [r0, # -FDEC_STRIDE]
ldrb r2, [r0, # -FDEC_STRIDE-1]
ldrb r3, [r0, #0*FDEC_STRIDE-1]
push {r4-r6,lr}
// Each add shifts the previous word up one byte and inserts the next
// edge byte (corner, then left-column rows 0..3) in the low byte.
add r2, r2, r1, lsl #8
ldrb r4, [r0, #1*FDEC_STRIDE-1]
add r3, r3, r2, lsl #8
ldrb r5, [r0, #2*FDEC_STRIDE-1]
ldrb r6, [r0, #3*FDEC_STRIDE-1]
add r4, r4, r3, lsl #8
add r5, r5, r4, lsl #8
add r6, r6, r5, lsl #8
ldr ip, =0x01010101
PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip
// Row 0 is the filtered word r1. Every later row is the row above shifted
// up by one byte with one more filtered byte from r4 inserted in the low
// byte; the bit ranges never overlap, so add acts as a bitwise or here.
str r1, [r0, #0*FDEC_STRIDE]
lsl r2, r1, #8
lsl r3, r1, #16
lsl r4, r4, #8
lsl r5, r1, #24
add r2, r2, r4, lsr #24
str r2, [r0, #1*FDEC_STRIDE]
add r3, r3, r4, lsr #16
str r3, [r0, #2*FDEC_STRIDE]
add r5, r5, r4, lsr #8
str r5, [r0, #3*FDEC_STRIDE]
pop {r4-r6,pc}
endfunc
// 4x4 diagonal-down-left prediction. r0 = src.
// Loads the 8 bytes above the block (starting directly above column 0),
// applies a 3-tap low-pass with the last byte repeated as padding, and
// stores four rows, each starting one filtered byte later than the previous.
function predict_4x4_ddl_neon
sub r0, #FDEC_STRIDE
mov ip, #FDEC_STRIDE
// The post-increment leaves r0 pointing at row 0 again.
vld1.64 {d0}, [r0], ip
vdup.8 d3, d0[7]
// d1 = x[i+1], d2 = x[i+2] (padded with the replicated last byte)
vext.8 d1, d0, d0, #1
vext.8 d2, d0, d3, #2
// ((x[i] + x[i+2]) >> 1 + x[i+1] + 1) >> 1
vhadd.u8 d0, d0, d2
vrhadd.u8 d0, d0, d1
vst1.32 {d0[0]}, [r0,:32], ip
vext.8 d1, d0, d0, #1
vext.8 d2, d0, d0, #2
vst1.32 {d1[0]}, [r0,:32], ip
vext.8 d3, d0, d0, #3
vst1.32 {d2[0]}, [r0,:32], ip
vst1.32 {d3[0]}, [r0,:32], ip
bx lr
endfunc
// 8x8 DC prediction. r0 = src, r1 = edge (see the C prototype in predict.h).
// Fills the block with (sum(edge[7..14]) + sum(edge[16..23]) + 8) >> 4.
// The sums are formed with the scalar usad8/usada8 instructions; only the
// final fill uses NEON. The byte selection below relies on little-endian
// word loads.
function predict_8x8_dc_neon
mov ip, #0
// r2:r3 = edge[8..15], r4:r5 = edge[16..23]
ldrd r2, r3, [r1, #8]
push {r4-r5,lr}
ldrd r4, r5, [r1, #16]
// Shift edge[15] out of r3 so it does not enter the sum.
lsl r3, r3, #8
// edge[7] is added separately, together with the rounding term 8.
ldrb lr, [r1, #7]
usad8 r2, r2, ip
usad8 r3, r3, ip
usada8 r2, r4, ip, r2
add lr, lr, #8
usada8 r3, r5, ip, r3
add r2, r2, lr
mov ip, #FDEC_STRIDE
add r2, r2, r3
lsr r2, r2, #4
vdup.8 d0, r2
.rept 8
vst1.64 {d0}, [r0,:64], ip
.endr
pop {r4-r5,pc}
endfunc
// 8x8 horizontal prediction. r0 = src, r1 = edge.
// Loads edge[7..14] and fills row y with the single byte edge[14 - y].
// The broadcasts and stores are interleaved.
function predict_8x8_h_neon
add r1, r1, #7
mov ip, #FDEC_STRIDE
vld1.64 {d16}, [r1]
vdup.8 d0, d16[7]
vdup.8 d1, d16[6]
vst1.64 {d0}, [r0,:64], ip
vdup.8 d2, d16[5]
vst1.64 {d1}, [r0,:64], ip
vdup.8 d3, d16[4]
vst1.64 {d2}, [r0,:64], ip
vdup.8 d4, d16[3]
vst1.64 {d3}, [r0,:64], ip
vdup.8 d5, d16[2]
vst1.64 {d4}, [r0,:64], ip
vdup.8 d6, d16[1]
vst1.64 {d5}, [r0,:64], ip
vdup.8 d7, d16[0]
vst1.64 {d6}, [r0,:64], ip
vst1.64 {d7}, [r0,:64], ip
bx lr
endfunc
// 8x8 vertical prediction. r0 = src, r1 = edge.
// Copies the 8 bytes edge[16..23] into each of the eight rows.
function predict_8x8_v_neon
add r1, r1, #16
mov r12, #FDEC_STRIDE
vld1.8 {d0}, [r1,:64]
.rept 8
vst1.8 {d0}, [r0,:64], r12
.endr
bx lr
endfunc
// 8x8 diagonal-down-left prediction. r0 = src, r1 = edge.
// Low-pass filters the 16 bytes edge[16..31] with the last byte repeated as
// right padding; row y is then the 8 filtered bytes starting at index y + 1.
function predict_8x8_ddl_neon
add r1, #16
vld1.8 {d0, d1}, [r1,:128]
vmov.i8 q3, #0
// d2[0] = last edge byte, used as padding for the x[i+1] vector
vrev64.8 d2, d1
// q8 = x[i-1] (zero shifted into lane 0), q2 = x[i+1]
vext.8 q8, q3, q0, #15
vext.8 q2, q0, q1, #1
vhadd.u8 q8, q2
mov r12, #FDEC_STRIDE
// q0 = ((x[i-1] + x[i+1]) >> 1 + x[i] + 1) >> 1; lane 0 depends on the
// zero padding but is never stored (the first row starts at lane 1).
vrhadd.u8 q0, q8
vext.8 d2, d0, d1, #1
vext.8 d3, d0, d1, #2
vst1.8 d2, [r0,:64], r12
vext.8 d2, d0, d1, #3
vst1.8 d3, [r0,:64], r12
vext.8 d3, d0, d1, #4
vst1.8 d2, [r0,:64], r12
vext.8 d2, d0, d1, #5
vst1.8 d3, [r0,:64], r12
vext.8 d3, d0, d1, #6
vst1.8 d2, [r0,:64], r12
vext.8 d2, d0, d1, #7
vst1.8 d3, [r0,:64], r12
vst1.8 d2, [r0,:64], r12
vst1.8 d1, [r0,:64], r12
bx lr
endfunc
// 8x8 diagonal-down-right prediction. r0 = src, r1 = edge.
// Low-pass filters edge[8..23] using edge[7..24] as neighbours, then writes
// the rows bottom-up: row 7 receives the filtered bytes for indices 8..15
// and each row above starts one byte later.
function predict_8x8_ddr_neon
vld1.8 {d0-d3}, [r1,:128]
// q2 = edge[i-1], q3 = edge[i+1] for i = 8..23
vext.8 q2, q0, q1, #7
vext.8 q3, q0, q1, #9
vhadd.u8 q2, q2, q3
// d0/d1 = filtered values for i = 8..15 and 16..23
vrhadd.u8 d0, d1, d4
vrhadd.u8 d1, d2, d5
// Start at the last row and walk upwards with a negative stride.
add r0, #7*FDEC_STRIDE
mov r12, #-1*FDEC_STRIDE
vext.8 d2, d0, d1, #1
vst1.8 {d0}, [r0,:64], r12
vext.8 d4, d0, d1, #2
vst1.8 {d2}, [r0,:64], r12
vext.8 d5, d0, d1, #3
vst1.8 {d4}, [r0,:64], r12
vext.8 d4, d0, d1, #4
vst1.8 {d5}, [r0,:64], r12
vext.8 d5, d0, d1, #5
vst1.8 {d4}, [r0,:64], r12
vext.8 d4, d0, d1, #6
vst1.8 {d5}, [r0,:64], r12
vext.8 d5, d0, d1, #7
vst1.8 {d4}, [r0,:64], r12
vst1.8 {d5}, [r0,:64], r12
bx lr
endfunc
// 8x8 vertical-left prediction. r0 = src, r1 = edge.
// From edge[16..31] builds a 2-tap rounded average (q3) and a 3-tap low-pass
// (q0). Even rows are taken from the 2-tap vector and odd rows from the
// 3-tap vector, advancing by one byte every two rows.
function predict_8x8_vl_neon
add r1, #16
mov r12, #FDEC_STRIDE
vld1.8 {d0, d1}, [r1,:128]
// q1 = x[i-1] and q2 = x[i+1]. One byte of each comes from the previous
// (unset) contents of q1/q2; the lanes that depend on those bytes are
// never stored (only lanes 0..10 of q3 and 1..11 of q0 are used below).
vext.8 q1, q1, q0, #15
vext.8 q2, q0, q2, #1
vrhadd.u8 q3, q0, q2
vhadd.u8 q1, q1, q2
vrhadd.u8 q0, q0, q1
vext.8 d2, d0, d1, #1
vst1.8 {d6}, [r0,:64], r12
vext.8 d3, d6, d7, #1
vst1.8 {d2}, [r0,:64], r12
vext.8 d2, d0, d1, #2
vst1.8 {d3}, [r0,:64], r12
vext.8 d3, d6, d7, #2
vst1.8 {d2}, [r0,:64], r12
vext.8 d2, d0, d1, #3
vst1.8 {d3}, [r0,:64], r12
vext.8 d3, d6, d7, #3
vst1.8 {d2}, [r0,:64], r12
vext.8 d2, d0, d1, #4
vst1.8 {d3}, [r0,:64], r12
vst1.8 {d2}, [r0,:64], r12
bx lr
endfunc
// 8x8 vertical-right prediction. r0 = src, r1 = edge.
// Works on edge[8..23]: q2 becomes the 2-tap rounded average of x[i] and
// x[i-1], q0 the 3-tap low-pass centred on x[i-1]. Row 0 is the upper half
// of the 2-tap vector and row 1 the upper half of the 3-tap vector. Each
// later pair of rows is the pair above shifted right by one byte, with
// 3-tap values from the lower half (odd-indexed ones for even rows,
// even-indexed ones for odd rows) shifted in at the left.
function predict_8x8_vr_neon
add r1, #8
mov r12, #FDEC_STRIDE
vld1.8 {d4,d5}, [r1,:64]
// Rotations give x[i-2] (q1) and x[i-1] (q0); the wrapped-around lanes
// only reach lanes 0 and 1 of the results, which no row below reads.
vext.8 q1, q2, q2, #14
vext.8 q0, q2, q2, #15
vhadd.u8 q3, q2, q1
vrhadd.u8 q2, q2, q0
vrhadd.u8 q0, q0, q3
vmov d2, d0
vst1.8 {d5}, [r0,:64], r12
// De-interleave the lower 3-tap half: d2 = even lanes, d0 = odd lanes.
vuzp.8 d2, d0
vst1.8 {d1}, [r0,:64], r12
vext.8 d6, d0, d5, #7
vext.8 d3, d2, d1, #7
vst1.8 {d6}, [r0,:64], r12
vst1.8 {d3}, [r0,:64], r12
vext.8 d6, d0, d5, #6
vext.8 d3, d2, d1, #6
vst1.8 {d6}, [r0,:64], r12
vst1.8 {d3}, [r0,:64], r12
vext.8 d6, d0, d5, #5
vext.8 d3, d2, d1, #5
vst1.8 {d6}, [r0,:64], r12
vst1.8 {d3}, [r0,:64], r12
bx lr
endfunc
// 8x8 horizontal-down prediction. r0 = src, r1 = edge.
// Works on the 16 bytes starting at edge[7]: q8 is the 2-tap rounded average
// of x[i] and x[i+1], q0 the 3-tap low-pass centred on x[i+1]. The lower
// halves are interleaved (vzip) so that d16:d0:d1 forms one 24-byte sequence;
// row y is the 8-byte window starting at byte 2 * (7 - y) of that sequence.
function predict_8x8_hd_neon
mov r12, #FDEC_STRIDE
add r1, #7
vld1.8 {d2,d3}, [r1]
// q3 = x[i+1], q2 = x[i+2] (rotations; wrapped lanes only reach the top
// lanes of the results, which no row below reads)
vext.8 q3, q1, q1, #1
vext.8 q2, q1, q1, #2
vrhadd.u8 q8, q1, q3
vhadd.u8 q1, q2
vrhadd.u8 q0, q1, q3
vzip.8 d16, d0
vext.8 d2, d0, d1, #6
vext.8 d3, d0, d1, #4
vst1.8 {d2}, [r0,:64], r12
vext.8 d2, d0, d1, #2
vst1.8 {d3}, [r0,:64], r12
vst1.8 {d2}, [r0,:64], r12
vext.8 d2, d16, d0, #6
vst1.8 {d0}, [r0,:64], r12
vext.8 d3, d16, d0, #4
vst1.8 {d2}, [r0,:64], r12
vext.8 d2, d16, d0, #2
vst1.8 {d3}, [r0,:64], r12
vst1.8 {d2}, [r0,:64], r12
vst1.8 {d16}, [r0,:64], r12
bx lr
endfunc
// 8x8 horizontal-up prediction. r0 = src, r1 = edge.
// Loads edge[7..14], reverses it so lane 0 holds edge[14], and pads past the
// end with edge[7]. The 2-tap average and 3-tap low-pass of that sequence are
// interleaved into q0, q1 is filled with the final interleaved pair, and row
// y is the 8-byte window starting at byte 2 * y of the sequence q0:q1.
function predict_8x8_hu_neon
mov r12, #FDEC_STRIDE
add r1, #7
vld1.8 {d7}, [r1]
// d6 = padding byte (edge[7]) in every lane
vdup.8 d6, d7[0]
vrev64.8 d7, d7
// d4 = x[i+2], d2 = x[i+1], both padded with d6
vext.8 d4, d7, d6, #2
vext.8 d2, d7, d6, #1
vhadd.u8 d16, d7, d4
vrhadd.u8 d0, d2, d7
vrhadd.u8 d1, d16, d2
vzip.8 d0, d1
vdup.16 q1, d1[3]
vext.8 q2, q0, q1, #2
vext.8 q3, q0, q1, #4
vext.8 q8, q0, q1, #6
vst1.8 {d0}, [r0,:64], r12
vst1.8 {d4}, [r0,:64], r12
vst1.8 {d6}, [r0,:64], r12
vst1.8 {d16}, [r0,:64], r12
vst1.8 {d1}, [r0,:64], r12
vst1.8 {d5}, [r0,:64], r12
vst1.8 {d7}, [r0,:64], r12
vst1.8 {d17}, [r0,:64]
bx lr
endfunc
// 8x8 chroma DC prediction from the top row only. r0 = src.
// dc0 = rounded mean of top bytes 0..3, dc1 = rounded mean of top bytes 4..7.
// Every row becomes [dc0 x4, dc1 x4]. The stores are done by the shared tail
// pred8x8_dc_end inside predict_8x8c_dc_neon, which expects r1 = stride,
// d0 = pattern for rows 0-3 and d1 = pattern for rows 4-7.
function predict_8x8c_dc_top_neon
sub r2, r0, #FDEC_STRIDE
mov r1, #FDEC_STRIDE
vld1.8 {d0}, [r2,:64]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0, d0
// Narrowing rounding shift: bytes 0 and 1 of d0 become dc0 and dc1.
vrshrn.u16 d0, q0, #2
vdup.8 d1, d0[1]
vdup.8 d0, d0[0]
// Swap 32-bit halves so that both d0 and d1 hold [dc0 x4, dc1 x4].
vtrn.32 d0, d1
b pred8x8_dc_end
endfunc
// 8x8 chroma DC prediction from the left column only. r0 = src.
// Rows 0-3 are filled with the rounded mean of left bytes 0..3 and rows 4-7
// with the rounded mean of left bytes 4..7. Stores are done by the shared
// tail pred8x8_dc_end (r1 = stride, d0 = rows 0-3, d1 = rows 4-7).
function predict_8x8c_dc_left_neon
mov r1, #FDEC_STRIDE
sub r2, r0, #1
ldcol.8 d0, r2, r1
vpaddl.u8 d0, d0
vpadd.u16 d0, d0, d0
vrshrn.u16 d0, q0, #2
vdup.8 d1, d0[1]
vdup.8 d0, d0[0]
b pred8x8_dc_end
endfunc
// 8x8 chroma DC prediction using both edges. r0 = src.
// With T0/T1 = sums of top bytes 0..3 / 4..7 and L0/L1 = sums of left bytes
// 0..3 / 4..7, the four 4x4 quadrants are filled with:
//   top-left     (T0 + L0 + 4) >> 3     top-right    (T1 + 2) >> 2
//   bottom-left  (L1 + 2) >> 2          bottom-right (T1 + L1 + 4) >> 3
// Also hosts the shared store tail pred8x8_dc_end used by the dc_top and
// dc_left variants.
function predict_8x8c_dc_neon
sub r2, r0, #FDEC_STRIDE
mov r1, #FDEC_STRIDE
vld1.8 {d0}, [r2,:64]
sub r2, r0, #1
ldcol.8 d1, r2, r1
// d0 = [top 0..3, left 0..3], d1 = [top 4..7, left 4..7]
vtrn.32 d0, d1
vpaddl.u8 q0, q0
// d0 = [T0, L0, T1, L1]
vpadd.u16 d0, d0, d1
// d1 = [T0+L0, T1+L1, T0+L0, T1+L1]
vpadd.u16 d1, d0, d0
vrshrn.u16 d2, q0, #3
vrshrn.u16 d3, q0, #2
vdup.8 d0, d2[4]
vdup.8 d1, d3[3]
vdup.8 d4, d3[2]
vdup.8 d5, d2[5]
// Combine: d0 = [top-left x4, top-right x4],
//          d1 = [bottom-left x4, bottom-right x4]
vtrn.32 q0, q2
pred8x8_dc_end:
// Two store pointers: r0 walks rows 0-3 (d0), r2 walks rows 4-7 (d1).
add r2, r0, r1, lsl #2
.rept 4
vst1.8 {d0}, [r0,:64], r1
vst1.8 {d1}, [r2,:64], r1
.endr
bx lr
endfunc
// 8x8 chroma horizontal prediction. r0 = src.
// Each row is filled with the byte immediately to its left; two rows are
// handled per iteration of the 4x unrolled block.
function predict_8x8c_h_neon
sub r1, r0, #1
mov ip, #FDEC_STRIDE
.rept 4
vld1.8 {d0[]}, [r1], ip
vld1.8 {d2[]}, [r1], ip
vst1.64 {d0}, [r0,:64], ip
vst1.64 {d2}, [r0,:64], ip
.endr
bx lr
endfunc
// 8x8 chroma vertical prediction. r0 = src.
// Copies the 8 bytes directly above the block into each of the eight rows.
function predict_8x8c_v_neon
sub r0, r0, #FDEC_STRIDE
mov ip, #FDEC_STRIDE
// The post-increment moves r0 from the row above back to row 0.
vld1.64 {d0}, [r0,:64], ip
.rept 8
vst1.64 {d0}, [r0,:64], ip
.endr
bx lr
endfunc
// 8x8 chroma plane prediction. r0 = src.
// With top row t[], left column l[], and the corner byte standing in for
// index -1 of both:
//   H = sum_{k=1..4} k * (t[3+k] - t[3-k])
//   V = sum_{k=1..4} k * (l[3+k] - l[3-k])
//   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5,  a = 16 * (t[7] + l[7])
//   i00 = a + 16 - 3*(b + c)
// Pixel (x, y) = saturate_u8((i00 + b*x + c*y) >> 5).
function predict_8x8c_p_neon
// r3 -> corner byte (above-left), r2 -> top[4]
sub r3, r0, #FDEC_STRIDE
mov r1, #FDEC_STRIDE
add r2, r3, #4
sub r3, r3, #1
vld1.32 {d0[0]}, [r3]
vld1.32 {d2[0]}, [r2,:32], r1
// d0 lanes 4-7 = corner, l[0..2]; d3 lanes 0-3 = l[4..7]
ldcol.8 d0, r3, r1, 4, hi=1
add r3, r3, r1
ldcol.8 d3, r3, r1, 4
// d16 lanes 0-3 = t[4+k] + l[4+k]; lane 3 is needed for a
vaddl.u8 q8, d2, d3
vrev32.8 d0, d0
vtrn.32 d2, d3
// d4 = top differences, d5 = left differences
vsubl.u8 q2, d2, d0
movrel r3, p16weight
vld1.16 {q0}, [r3,:128]
vmul.s16 d4, d4, d0
vmul.s16 d5, d5, d0
vpadd.i16 d4, d4, d5
// d4 = [H, V] as 32-bit lanes
vpaddl.s16 d4, d4
vshl.i32 d5, d4, #4
vadd.s32 d4, d4, d5
// d4[0] = b, d4[1] = c (16-bit lanes)
vrshrn.s32 d4, q2, #5
mov r3, #0
// after the transpose: d4[0] = b, d5[0] = c
vtrn.16 d4, d5
vadd.i16 d2, d4, d5
vshl.i16 d3, d2, #2
vrev64.16 d16, d16
// d3[0] = 3 * (b + c)
vsub.i16 d3, d3, d2
// adding weight lane 0 (== 1) supplies the +16 after the shift below
vadd.i16 d16, d16, d0
vshl.i16 d2, d16, #4
// d2[0] = i00
vsub.i16 d2, d2, d3
// turn the weights 1..8 into column indices 0..7, then scale by b
vext.16 q0, q0, q0, #7
vmov.16 d0[0], r3
vmul.i16 q0, q0, d4[0]
vdup.16 q1, d2[0]
vdup.16 q3, d5[0]
vadd.i16 q1, q1, q0
mov r3, #8
1:
// One row per iteration: narrow with saturation, then step by c.
vqshrun.s16 d0, q1, #5
vadd.i16 q1, q1, q3
vst1.8 {d0}, [r0,:64], r1
subs r3, r3, #1
bne 1b
bx lr
endfunc
// 8x16 chroma DC prediction from the top row only. r0 = src.
// dc0 = rounded mean of top bytes 0..3, dc1 = rounded mean of top bytes 4..7.
// All sixteen rows are filled with [dc0 x4, dc1 x4].
function predict_8x16c_dc_top_neon
sub r2, r0, #FDEC_STRIDE
mov r1, #FDEC_STRIDE
vld1.8 {d0}, [r2,:64]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0, d0
vrshrn.u16 d0, q0, #2
vdup.8 d1, d0[1]
vdup.8 d0, d0[0]
// After the transpose d0 and d1 hold the same [dc0 x4, dc1 x4] pattern.
vtrn.32 d0, d1
// r0 writes rows 0-3 while r2 writes rows 4-7 ...
add r2, r0, r1, lsl #2
.rept 4
vst1.8 {d0}, [r0,:64], r1
vst1.8 {d1}, [r2,:64], r1
.endr
// ... then both pointers skip 4 rows: r0 writes rows 8-11, r2 rows 12-15.
add r2, r2, r1, lsl #2
add r0, r0, r1, lsl #2
.rept 4
vst1.8 {d0}, [r0,:64], r1
vst1.8 {d1}, [r2,:64], r1
.endr
bx lr
endfunc
// 8x16 chroma horizontal prediction. r0 = src.
// Each of the sixteen rows is filled with the byte immediately to its left;
// two rows are handled per iteration of the 8x unrolled block.
function predict_8x16c_h_neon
sub r1, r0, #1
mov ip, #FDEC_STRIDE
.rept 8
vld1.8 {d0[]}, [r1], ip
vld1.8 {d2[]}, [r1], ip
vst1.64 {d0}, [r0,:64], ip
vst1.64 {d2}, [r0,:64], ip
.endr
bx lr
endfunc
// 8x16 chroma plane prediction. r0 = src.
// With top row t[], left column l[], and the corner byte standing in for
// index -1 of both:
//   H = sum_{k=1..4} k * (t[3+k] - t[3-k])
//   V = sum_{k=1..8} k * (l[7+k] - l[7-k])
//   b = (17*H + 16) >> 5,  c = (5*V + 32) >> 6,  a = 16 * (t[7] + l[15])
//   i00 = a + 16 - 3*b - 7*c
// Pixel (x, y) = saturate_u8((i00 + b*x + c*y) >> 5).
function predict_8x16c_p_neon
// r3 -> corner byte (above-left), r2 -> top[4]
sub r3, r0, #FDEC_STRIDE
mov r1, #FDEC_STRIDE
add r2, r3, #4
sub r3, r3, #1
vld1.32 {d0[0]}, [r3]
vld1.32 {d2[0]}, [r2,:32], r1
// d1 = corner, l[0..6]; d3 = l[8..15] (row 7 is skipped by the add)
ldcol.8 d1, r3, r1
add r3, r3, r1
ldcol.8 d3, r3, r1
// d16 lanes 0-3 = t[4+k] + l[12+k]; lane 3 is needed for a
vrev64.32 d16, d3
vaddl.u8 q8, d2, d16
vrev32.8 d0, d0
vsubl.u8 q2, d2, d0
vrev64.8 d1, d1
vsubl.u8 q3, d3, d1
movrel r3, p16weight
vld1.16 {q0}, [r3,:128]
vmul.s16 d4, d4, d0
vmul.s16 q3, q3, q0
vpadd.i16 d4, d4, d5
vpadd.i16 d6, d6, d7
vpaddl.s16 d4, d4 @ d4[0] = H
vpaddl.s16 d6, d6
vpadd.s32 d6, d6 @ d6[0] = V
vshl.i32 d5, d4, #4
vadd.s32 d4, d4, d5 @ d4[0] = 17*H
vshl.i32 d7, d6, #2
vrshrn.s32 d4, q2, #5 @ d4[0] = b
vadd.s32 d6, d6, d7 @ d6[0] = 5*V
vrshrn.s32 d6, q3, #6 @ d6[0] = c
mov r3, #0
vshl.i16 d3, d4, #2
vsub.i16 d3, d3, d4 @ d3[0] = 3 * b
vshl.i16 d2, d6, #3
vadd.i16 d3, d3, d2 @ d3[0] = 3 * b + 8 * c
vsub.i16 d3, d3, d6 @ d3[0] = 3 * b + 7 * c
vrev64.16 d16, d16
vadd.i16 d16, d16, d0 @ d16[0] = src[]+src[] + 1
vshl.i16 d2, d16, #4 @ d2[0] = a + 16
vsub.i16 d2, d2, d3 @ i00
// turn the weights 1..8 into column indices 0..7, then scale by b
vext.16 q0, q0, q0, #7
vmov.16 d0[0], r3
vmul.i16 q0, q0, d4[0]
vdup.16 q1, d2[0]
vdup.16 q3, d6[0]
vadd.i16 q1, q1, q0
mov r3, #16
1:
// One row per iteration: narrow with saturation, then step by c.
vqshrun.s16 d0, q1, #5
vadd.i16 q1, q1, q3
vst1.8 {d0}, [r0,:64], r1
subs r3, r3, #1
bne 1b
bx lr
endfunc
// 16x16 DC prediction from the top row only. r0 = src.
// Fills the block with (sum of the 16 bytes above + 8) >> 4. Stores are done
// by the shared tail pred16x16_dc_end inside predict_16x16_dc_neon, which
// expects r1 = stride and q0 = fill pattern.
function predict_16x16_dc_top_neon
sub r2, r0, #FDEC_STRIDE
mov r1, #FDEC_STRIDE
vld1.8 {q0}, [r2,:128]
add16x8 q0, d0, d1, d0, d1
vrshrn.u16 d0, q0, #4
vdup.8 q0, d0[0]
b pred16x16_dc_end
endfunc
// 16x16 DC prediction from the left column only. r0 = src.
// Fills the block with (sum of the 16 left bytes + 8) >> 4. Stores are done
// by the shared tail pred16x16_dc_end (r1 = stride, q0 = fill pattern).
function predict_16x16_dc_left_neon
mov r1, #FDEC_STRIDE
sub r2, r0, #1
// Two 8-row gathers: d0 = rows 0-7, d1 = rows 8-15 (r2 keeps advancing).
ldcol.8 d0, r2, r1
ldcol.8 d1, r2, r1
add16x8 q0, d0, d1, d0, d1
vrshrn.u16 d0, q0, #4
vdup.8 q0, d0[0]
b pred16x16_dc_end
endfunc
// 16x16 DC prediction using both edges. r0 = src.
// Fills the block with (sum of 16 top bytes + sum of 16 left bytes + 16) >> 5.
// The top row is summed with NEON while the left column is summed with
// interleaved scalar byte loads. Also hosts the shared store tail
// pred16x16_dc_end used by the dc_top and dc_left variants.
function predict_16x16_dc_neon
sub r3, r0, #FDEC_STRIDE
// r0 temporarily points at the left neighbour column.
sub r0, r0, #1
vld1.64 {d0-d1}, [r3,:128]
ldrb ip, [r0], #FDEC_STRIDE
vaddl.u8 q0, d0, d1
ldrb r1, [r0], #FDEC_STRIDE
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0, d0
vpadd.u16 d0, d0, d0
// 2 loads above + 4 * 3 here + 2 below = 16 left bytes accumulated in ip.
.rept 4
ldrb r2, [r0], #FDEC_STRIDE
add ip, ip, r1
ldrb r3, [r0], #FDEC_STRIDE
add ip, ip, r2
ldrb r1, [r0], #FDEC_STRIDE
add ip, ip, r3
.endr
ldrb r2, [r0], #FDEC_STRIDE
add ip, ip, r1
ldrb r3, [r0], #FDEC_STRIDE
add ip, ip, r2
// Undo the 16 post-increments; the +1 further down restores r0 = src.
sub r0, r0, #FDEC_STRIDE*16
add ip, ip, r3
vdup.16 d1, ip
vadd.u16 d0, d0, d1
mov r1, #FDEC_STRIDE
add r0, r0, #1
vrshr.u16 d0, d0, #5
vdup.8 q0, d0[0]
pred16x16_dc_end:
.rept 16
vst1.64 {d0-d1}, [r0,:128], r1
.endr
bx lr
endfunc
// 16x16 horizontal prediction. r0 = src.
// Each row is filled with the byte immediately to its left. The byte is
// broadcast into one D register and copied to its pair so a full 16-byte
// store can be issued; two rows are handled per unrolled iteration.
function predict_16x16_h_neon
sub r1, r0, #1
mov ip, #FDEC_STRIDE
.rept 8
vld1.8 {d0[]}, [r1], ip
vmov d1, d0
vld1.8 {d2[]}, [r1], ip
vmov d3, d2
vst1.64 {d0-d1}, [r0,:128], ip
vst1.64 {d2-d3}, [r0,:128], ip
.endr
bx lr
endfunc
// 16x16 vertical prediction. r0 = src.
// Copies the 16 bytes directly above the block into each of the 16 rows.
function predict_16x16_v_neon
sub r0, r0, #FDEC_STRIDE
mov ip, #FDEC_STRIDE
// The post-increment moves r0 from the row above back to row 0.
vld1.64 {d0-d1}, [r0,:128], ip
.rept 16
vst1.64 {d0-d1}, [r0,:128], ip
.endr
bx lr
endfunc
// 16x16 plane prediction. r0 = src.
// With top row t[], left column l[], and the corner byte standing in for
// index -1 of both:
//   H = sum_{k=1..8} k * (t[7+k] - t[7-k])
//   V = sum_{k=1..8} k * (l[7+k] - l[7-k])
//   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6,  a = 16 * (t[15] + l[15])
//   i00 = a + 16 - 7*(b + c)
// Pixel (x, y) = saturate_u8((i00 + b*x + c*y) >> 5).
function predict_16x16_p_neon
// r3 -> corner byte (above-left), r2 -> top[8]
sub r3, r0, #FDEC_STRIDE
mov r1, #FDEC_STRIDE
add r2, r3, #8
sub r3, r3, #1
// d0 = corner, t[0..6]; d2 = t[8..15]
vld1.8 {d0}, [r3]
vld1.8 {d2}, [r2,:64], r1
// d1 = corner, l[0..6]; d3 = l[8..15] (row 7 is skipped by the add)
ldcol.8 d1, r3, r1
add r3, r3, r1
ldcol.8 d3, r3, r1
vrev64.8 q0, q0
// q8 lane 7 = t[15] + l[15], needed for a
vaddl.u8 q8, d2, d3
// q2 = top differences, q3 = left differences
vsubl.u8 q2, d2, d0
vsubl.u8 q3, d3, d1
movrel r3, p16weight
vld1.8 {q0}, [r3,:128]
vmul.s16 q2, q2, q0
vmul.s16 q3, q3, q0
vadd.i16 d4, d4, d5
vadd.i16 d5, d6, d7
vpadd.i16 d4, d4, d5
// d4 = [H, V, H, V]
vpadd.i16 d4, d4, d4
// 4*x + x = 5*x in 32-bit lanes, then rounding shift by 6
vshll.s16 q3, d4, #2
vaddw.s16 q2, q3, d4
vrshrn.s32 d4, q2, #6
mov r3, #0
// after the transpose: d4[0] = b, d5[0] = c
vtrn.16 d4, d5
vadd.i16 d2, d4, d5
vshl.i16 d3, d2, #3
vrev64.16 d16, d17
// d3[0] = 7 * (b + c)
vsub.i16 d3, d3, d2
// adding weight lane 0 (== 1) supplies the +16 after the shift below
vadd.i16 d16, d16, d0
vshl.i16 d2, d16, #4
// d2[0] = i00
vsub.i16 d2, d2, d3
vshl.i16 d3, d4, #4
// turn the weights 1..8 into column indices 0..7
vext.16 q0, q0, q0, #7
// d6[0] = c - 16*b
vsub.i16 d6, d5, d3
vmov.16 d0[0], r3
vmul.i16 q0, q0, d4[0]
vdup.16 q1, d2[0]
vdup.16 q2, d4[0]
vdup.16 q3, d6[0]
// q2 = 8*b: step from the left 8 columns to the right 8 columns
vshl.i16 q2, q2, #3
vadd.i16 q1, q1, q0
// q3 = c - 8*b: step from the right half back to the next row's left half
vadd.i16 q3, q3, q2
mov r3, #16
1:
// One row per iteration, produced as two 8-pixel halves.
vqshrun.s16 d0, q1, #5
vadd.i16 q1, q1, q2
vqshrun.s16 d1, q1, #5
vadd.i16 q1, q1, q3
vst1.8 {q0}, [r0,:128], r1
subs r3, r3, #1
bne 1b
bx lr
endfunc

108
common/arm/predict-c.c Normal file
View File

@@ -0,0 +1,108 @@
/*****************************************************************************
* predict.c: arm intra prediction
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "predict.h"
#include "pixel.h"
/* Install the ARM 4x4 intra predictors into pf[].
 * Nothing is touched unless `cpu` reports ARMv6, and only 8-bit-depth builds
 * (!HIGH_BIT_DEPTH) have assembly versions. The NEON entries are added on
 * top of the ARMv6 ones when `cpu` also reports NEON. */
void x264_predict_4x4_init_arm( uint32_t cpu, x264_predict_t pf[12] )
{
    if( cpu&X264_CPU_ARMV6 )
    {
#if !HIGH_BIT_DEPTH
        /* Baseline ARMv6 routines. */
        pf[I_PRED_4x4_H]   = x264_predict_4x4_h_armv6;
        pf[I_PRED_4x4_V]   = x264_predict_4x4_v_armv6;
        pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_armv6;
        pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;

        /* Extra modes that exist only as NEON code. */
        if( cpu&X264_CPU_NEON )
        {
            pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon;
            pf[I_PRED_4x4_DDL]    = x264_predict_4x4_ddl_neon;
        }
#endif // !HIGH_BIT_DEPTH
    }
}
/* Install the NEON 8x8 chroma intra predictors into pf[].
 * A no-op when `cpu` lacks NEON or when building for high bit depth. */
void x264_predict_8x8c_init_arm( uint32_t cpu, x264_predict_t pf[7] )
{
    if( cpu&X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
        pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
        pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
        pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_neon;
        pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_neon;
        pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
/* Install the NEON 8x16 chroma intra predictors into pf[].
 * A no-op when `cpu` lacks NEON or when building for high bit depth. Only
 * three modes are overridden; the remaining entries of pf[] are left as the
 * caller set them. */
void x264_predict_8x16c_init_arm( uint32_t cpu, x264_predict_t pf[7] )
{
    if( cpu&X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        /* The other functions weren't faster than C (gcc 4.7.3) on Cortex A8 and A9. */
        pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_neon;
        pf[I_PRED_CHROMA_H]      = x264_predict_8x16c_h_neon;
        pf[I_PRED_CHROMA_P]      = x264_predict_8x16c_p_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
/* Install the NEON 8x8 luma intra predictors into pf[].
 * A no-op when `cpu` lacks NEON or when building for high bit depth.
 * `predict_filter` is accepted for interface compatibility and is not
 * modified here. */
void x264_predict_8x8_init_arm( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
{
    if( cpu&X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon;
        pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
        pf[I_PRED_8x8_VL]  = x264_predict_8x8_vl_neon;
        pf[I_PRED_8x8_VR]  = x264_predict_8x8_vr_neon;
        pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
        pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
        pf[I_PRED_8x8_HD]  = x264_predict_8x8_hd_neon;
        pf[I_PRED_8x8_HU]  = x264_predict_8x8_hu_neon;
        pf[I_PRED_8x8_V]   = x264_predict_8x8_v_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
/* Install the NEON 16x16 luma intra predictors into pf[].
 * A no-op when `cpu` lacks NEON or when building for high bit depth. */
void x264_predict_16x16_init_arm( uint32_t cpu, x264_predict_t pf[7] )
{
    if( cpu&X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_neon;
        pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_neon;
        pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_neon;
        pf[I_PRED_16x16_H]       = x264_predict_16x16_h_neon;
        pf[I_PRED_16x16_V]       = x264_predict_16x16_v_neon;
        pf[I_PRED_16x16_P]       = x264_predict_16x16_p_neon;
#endif // !HIGH_BIT_DEPTH
    }
}

105
common/arm/predict.h Normal file
View File

@@ -0,0 +1,105 @@
/*****************************************************************************
* predict.h: arm intra prediction
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_ARM_PREDICT_H
#define X264_ARM_PREDICT_H
#define x264_predict_4x4_dc_armv6 x264_template(predict_4x4_dc_armv6)
void x264_predict_4x4_dc_armv6( uint8_t *src );
#define x264_predict_4x4_dc_top_neon x264_template(predict_4x4_dc_top_neon)
void x264_predict_4x4_dc_top_neon( uint8_t *src );
#define x264_predict_4x4_v_armv6 x264_template(predict_4x4_v_armv6)
void x264_predict_4x4_v_armv6( uint8_t *src );
#define x264_predict_4x4_h_armv6 x264_template(predict_4x4_h_armv6)
void x264_predict_4x4_h_armv6( uint8_t *src );
#define x264_predict_4x4_ddr_armv6 x264_template(predict_4x4_ddr_armv6)
void x264_predict_4x4_ddr_armv6( uint8_t *src );
#define x264_predict_4x4_ddl_neon x264_template(predict_4x4_ddl_neon)
void x264_predict_4x4_ddl_neon( uint8_t *src );
/* Prototypes for the NEON 8x8c and 8x16c intra prediction routines
 * (NOTE(review): the trailing "c" presumably marks the chroma block sizes --
 * confirm). Every routine takes a single uint8_t *src argument and returns
 * nothing. Symbol names are routed through x264_template(), which is defined
 * outside this header. */
#define x264_predict_8x8c_dc_neon x264_template(predict_8x8c_dc_neon)
void x264_predict_8x8c_dc_neon( uint8_t *src );
#define x264_predict_8x8c_dc_top_neon x264_template(predict_8x8c_dc_top_neon)
void x264_predict_8x8c_dc_top_neon( uint8_t *src );
#define x264_predict_8x8c_dc_left_neon x264_template(predict_8x8c_dc_left_neon)
void x264_predict_8x8c_dc_left_neon( uint8_t *src );
#define x264_predict_8x8c_h_neon x264_template(predict_8x8c_h_neon)
void x264_predict_8x8c_h_neon( uint8_t *src );
#define x264_predict_8x8c_v_neon x264_template(predict_8x8c_v_neon)
void x264_predict_8x8c_v_neon( uint8_t *src );
#define x264_predict_8x8c_p_neon x264_template(predict_8x8c_p_neon)
void x264_predict_8x8c_p_neon( uint8_t *src );
#define x264_predict_8x16c_h_neon x264_template(predict_8x16c_h_neon)
void x264_predict_8x16c_h_neon( uint8_t *src );
#define x264_predict_8x16c_dc_top_neon x264_template(predict_8x16c_dc_top_neon)
void x264_predict_8x16c_dc_top_neon( uint8_t *src );
#define x264_predict_8x16c_p_neon x264_template(predict_8x16c_p_neon)
void x264_predict_8x16c_p_neon( uint8_t *src );
/* Prototypes for the NEON 8x8 intra prediction routines. Unlike the other
 * block sizes in this header, each routine also receives a 36-byte edge[]
 * buffer next to the src pointer (NOTE(review): presumably the pre-filtered
 * neighbouring pixels -- confirm against the code that fills it).
 * Symbol names are routed through x264_template(), which is defined outside
 * this header. */
#define x264_predict_8x8_dc_neon x264_template(predict_8x8_dc_neon)
void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_ddl_neon x264_template(predict_8x8_ddl_neon)
void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_ddr_neon x264_template(predict_8x8_ddr_neon)
void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_vl_neon x264_template(predict_8x8_vl_neon)
void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_vr_neon x264_template(predict_8x8_vr_neon)
void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_v_neon x264_template(predict_8x8_v_neon)
void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_h_neon x264_template(predict_8x8_h_neon)
void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_hd_neon x264_template(predict_8x8_hd_neon)
void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_hu_neon x264_template(predict_8x8_hu_neon)
void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
/* Prototypes for the NEON 16x16 intra prediction routines. Every routine
 * takes a single uint8_t *src argument and returns nothing. Symbol names are
 * routed through x264_template(), which is defined outside this header. */
#define x264_predict_16x16_dc_neon x264_template(predict_16x16_dc_neon)
void x264_predict_16x16_dc_neon( uint8_t *src );
#define x264_predict_16x16_dc_top_neon x264_template(predict_16x16_dc_top_neon)
void x264_predict_16x16_dc_top_neon( uint8_t *src );
#define x264_predict_16x16_dc_left_neon x264_template(predict_16x16_dc_left_neon)
void x264_predict_16x16_dc_left_neon( uint8_t *src );
#define x264_predict_16x16_h_neon x264_template(predict_16x16_h_neon)
void x264_predict_16x16_h_neon( uint8_t *src );
#define x264_predict_16x16_v_neon x264_template(predict_16x16_v_neon)
void x264_predict_16x16_v_neon( uint8_t *src );
#define x264_predict_16x16_p_neon x264_template(predict_16x16_p_neon)
void x264_predict_16x16_p_neon( uint8_t *src );
/* Per-block-size init entry points. Each one receives the detected CPU flag
 * word and the predictor function table for that block size (12 entries for
 * 4x4 and 8x8, 7 entries for 8x8c, 8x16c and 16x16). The 8x8 variant also
 * receives a pointer to the edge filter function pointer.
 * NOTE(review): the bodies are not in this header; presumably they install
 * the ARM routines declared above into pf[] according to cpu -- confirm. */
#define x264_predict_4x4_init_arm x264_template(predict_4x4_init_arm)
void x264_predict_4x4_init_arm( uint32_t cpu, x264_predict_t pf[12] );
#define x264_predict_8x8_init_arm x264_template(predict_8x8_init_arm)
void x264_predict_8x8_init_arm( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
#define x264_predict_8x8c_init_arm x264_template(predict_8x8c_init_arm)
void x264_predict_8x8c_init_arm( uint32_t cpu, x264_predict_t pf[7] );
#define x264_predict_8x16c_init_arm x264_template(predict_8x16c_init_arm)
void x264_predict_8x16c_init_arm( uint32_t cpu, x264_predict_t pf[7] );
#define x264_predict_16x16_init_arm x264_template(predict_16x16_init_arm)
void x264_predict_16x16_init_arm( uint32_t cpu, x264_predict_t pf[7] );
#endif

574
common/arm/quant-a.S Normal file
View File

@@ -0,0 +1,574 @@
/****************************************************************************
* quant-a.S: arm quantization and level-run
*****************************************************************************
* Copyright (C) 2009-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "asm.S"
// Byte-lane masks used to collapse NEON compare results into scalar bitmasks.
// Each table is 16 bytes so it fills one q register.
// pmovmskb_byte: bit j in byte lane j of each 8-byte half (loaded by
// coeff_last64_neon).
const pmovmskb_byte, align=4
.byte 1,2,4,8,16,32,64,128
.byte 1,2,4,8,16,32,64,128
endconst
// mask_2bit: two adjacent bits per byte lane, pattern repeating every four
// lanes (loaded by the decimate_score15/16 routines).
const mask_2bit, align=4
.byte 3,12,48,192,3,12,48,192
.byte 3,12,48,192,3,12,48,192
endconst
// mask_1bit: one bit per byte lane, highest bit first, repeated for both
// 8-byte halves (loaded by decimate_score64_neon).
const mask_1bit, align=4
.byte 128,64,32,16,8,4,2,1
.byte 128,64,32,16,8,4,2,1
endconst
.text
// QUANT_TWO: quantize 16 coefficients (two q registers) and store them.
// Expected on entry (set up by every caller in this file):
//   q8, q9   = absolute values of the coefficients
//   q14, q15 = the original signed coefficients
//   r0       = destination pointer, advanced by 32 bytes by the final store
// Arguments:
//   bias0, bias1 = q registers added to q8 / q9 (16-bit lanes)
//   mf0..mf3     = d registers holding the 16-bit multipliers
//   mask         = q register receiving the OR of both result vectors
//   load_mf      = when "yes", mf0..mf3 are loaded from [r1] (post-increment)
//                  after the bias adds, so a bias register may overlap them
// Per lane: result = sign(coef) * (((abs(coef) + bias) * mf) >> 16), where the
// bias add is a 16-bit add and the product is 32-bit.
// Clobbers q8-q15.
.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
vadd.u16 q8, q8, \bias0
vadd.u16 q9, q9, \bias1
.ifc \load_mf, yes
vld1.64 {\mf0-\mf3}, [r1,:128]!
.endif
vmull.u16 q10, d16, \mf0
vmull.u16 q11, d17, \mf1
vmull.u16 q12, d18, \mf2
vmull.u16 q13, d19, \mf3
// turn the original coefficients into sign masks (all ones where negative)
vshr.s16 q14, q14, #15
vshr.s16 q15, q15, #15
// keep the upper 16 bits of each 32-bit product
vshrn.u32 d16, q10, #16
vshrn.u32 d17, q11, #16
vshrn.u32 d18, q12, #16
vshrn.u32 d19, q13, #16
// (x ^ sign) - sign negates the lanes whose input was negative
veor q8, q8, q14
veor q9, q9, q15
vsub.s16 q8, q8, q14
vsub.s16 q9, q9, q15
vorr \mask, q8, q9
vst1.64 {d16-d19}, [r0,:128]!
.endm
// QUANT_END d: common epilogue of the quant routines. Moves the 64-bit
// register d into r2:r3, sets r0 to 1 if any of its bits is set and to 0
// otherwise, then returns to the caller.
.macro QUANT_END d
vmov r2, r3, \d
orrs r0, r2, r3
movne r0, #1
bx lr
.endm
// quant_2x2_dc( int16_t dct[4], int mf, int bias )
// Quantizes the four coefficients at r0 in place using one multiplier (r1)
// and one bias (r2) for all lanes:
//   dct[i] = sign(dct[i]) * (((abs(dct[i]) + bias) * mf) >> 16)
// with a 16-bit bias add and a 32-bit product.
// Returns 1 in r0 if any quantized coefficient is nonzero, otherwise 0.
function quant_2x2_dc_neon
vld1.64 {d0}, [r0,:64]
vabs.s16 d3, d0
vdup.16 d2, r2 // broadcast bias
vdup.16 d1, r1 // broadcast mf
vadd.u16 d3, d3, d2
vmull.u16 q3, d3, d1
vshr.s16 d0, d0, #15 // sign mask: all ones where the input was negative
vshrn.u32 d3, q3, #16 // upper 16 bits of each product
veor d3, d3, d0 // (x ^ sign) - sign restores the sign
vsub.s16 d3, d3, d0
vst1.64 {d3}, [r0,:64]
QUANT_END d3
endfunc
// quant_4x4_dc( int16_t dct[16], int mf, int bias )
// Quantizes 16 coefficients at r0 in place with a single multiplier (r1) and
// a single bias (r2) broadcast to every lane. See QUANT_TWO for the per-lane
// arithmetic. Returns 1 in r0 if any result is nonzero, otherwise 0.
function quant_4x4_dc_neon
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
vdup.16 q0, r2 // bias in every lane
vdup.16 q2, r1 // mf in every lane (d4 and d5 are reused for both halves)
// q0 is both the bias input and the nonzero mask output
QUANT_TWO q0, q0, d4, d5, d4, d5, q0
vorr d0, d0, d1
QUANT_END d0
endfunc
// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
// Quantizes 16 coefficients at r0 in place with per-coefficient multipliers
// (r1) and biases (r2). See QUANT_TWO for the per-lane arithmetic.
// Returns 1 in r0 if any result is nonzero, otherwise 0.
function quant_4x4_neon
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
vld1.64 {d0-d3}, [r2,:128] // bias -> q0, q1
vld1.64 {d4-d7}, [r1,:128] // mf -> d4-d7
// q0 is overwritten with the nonzero mask
QUANT_TWO q0, q1, d4, d5, d6, d7, q0
vorr d0, d0, d1
QUANT_END d0
endfunc
// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
// Quantizes four consecutive 16-coefficient blocks at r0 in place, reusing
// the same mf (r1) and bias (r2) tables for each block.
// Returns a 4-bit mask in r0: bit n is set if block n has a nonzero result.
// q4-q7 collect the per-block masks, so d8-d15 are saved and restored.
function quant_4x4x4_neon
vpush {d8-d15}
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
vld1.64 {d0-d3}, [r2,:128] // bias -> q0, q1 (kept for all four blocks)
vld1.64 {d4-d7}, [r1,:128] // mf -> d4-d7 (kept for all four blocks)
// block 0, mask in q4. The store inside QUANT_TWO advances r0 to the next block.
QUANT_TWO q0, q1, d4, d5, d6, d7, q4
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
// block 1, mask in q5
QUANT_TWO q0, q1, d4, d5, d6, d7, q5
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
// block 2, mask in q6
QUANT_TWO q0, q1, d4, d5, d6, d7, q6
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
// block 3, mask in q7
QUANT_TWO q0, q1, d4, d5, d6, d7, q7
// fold each 128-bit mask to 64 bits
vorr d8, d8, d9
vorr d10, d10, d11
vorr d12, d12, d13
vorr d14, d14, d15
vmov r0, r1, d8
vmov r2, r3, d10
orrs r0, r1
movne r0, #1 // bit 0: block 0 (r0 is already 0 when the test fails)
orrs r2, r3
orrne r0, #2 // bit 1: block 1
vmov r1, r2, d12
vmov r3, ip, d14
orrs r1, r2
orrne r0, #4 // bit 2: block 2
orrs r3, ip
orrne r0, #8 // bit 3: block 3
vpop {d8-d15}
bx lr
endfunc
// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
// Quantizes 64 coefficients at r0 in place, 16 at a time, with
// per-coefficient multipliers (r1) and biases (r2). See QUANT_TWO for the
// per-lane arithmetic. Returns 1 in r0 if any result is nonzero, otherwise 0.
function quant_8x8_neon
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
vld1.64 {d0-d3}, [r2,:128]!
vld1.64 {d4-d7}, [r1,:128]!
// first 16 coefficients, q0 becomes the running nonzero mask
QUANT_TWO q0, q1, d4, d5, d6, d7, q0
.rept 3
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
// bias goes to q1, q2 here because q0 holds the running mask. q2 overlaps
// d4-d5, which is why QUANT_TWO reloads mf only after the bias has been added.
vld1.64 {d2-d5}, [r2,:128]!
QUANT_TWO q1, q2, d4, d5, d6, d7, q1, yes
vorr q0, q0, q1
.endr
vorr d0, d0, d1
QUANT_END d0
endfunc
// DEQUANT_START mf_size offset dc: shared prologue of the dequant routines.
// In:  r1 = dequant_mf table, r2 = i_qp
// Out: r3 = i_qp / 6 - offset, with the flags set by that subtraction
//      r2 = i_qp % 6
//      r1 = address of row i_qp % 6 of the table when dc is "no", otherwise
//           the first 32-bit entry of that row
// mf_size is log2 of the row size in bytes. ip is clobbered.
.macro DEQUANT_START mf_size offset dc=no
// multiply by 0x2b and shift right by 8 to approximate a division by 6
mov r3, #0x2b
mul r3, r3, r2
lsr r3, r3, #8 // i_qbits = i_qp / 6
add ip, r3, r3, lsl #1 // ip = 3 * i_qbits
sub r2, r2, ip, lsl #1 // i_mf = i_qp % 6
.ifc \dc,no
add r1, r1, r2, lsl #\mf_size // dequant_mf[i_mf]
.else
ldr r1, [r1, r2, lsl #\mf_size] // dequant_mf[i_mf][0][0]
.endif
subs r3, r3, #\offset // 6 for 8x8
.endm
// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// DEQUANT size bits: emits the dequant routine for the given block size.
// r0 = dct (updated in place), r1 = dequant_mf, r2 = i_qp.
// After DEQUANT_START, r1 points at the dequant_mf row for i_qp % 6 and
// r3 = i_qp / 6 - bits, with the flags reflecting that subtraction.
//   r3 >= 0: dct[i] = (dct[i] * mf[i]) << r3, computed in 16-bit lanes
//   r3 <  0: dct[i] = (dct[i] * mf[i] + (1 << (-r3 - 1))) >> -r3, computed in
//            32-bit lanes and narrowed back to 16 bits
// The 32-bit mf entries are narrowed to 16 bits before multiplying.
// The 8x8 variant makes four passes of 16 coefficients, the 4x4 variant one.
.macro DEQUANT size bits
function dequant_\size\()_neon
DEQUANT_START \bits+2, \bits
.ifc \size, 8x8
mov r2, #4 // pass counter, mov leaves the flags from DEQUANT_START intact
.endif
blt dequant_\size\()_rshift
vdup.16 q15, r3 // left shift count in every 16-bit lane
dequant_\size\()_lshift_loop:
.ifc \size, 8x8
subs r2, r2, #1
.endif
// load 16 mf entries (32-bit each) interleaved with narrowing them to 16 bits
vld1.32 {d16-d17}, [r1,:128]!
vld1.32 {d18-d19}, [r1,:128]!
vmovn.s32 d4, q8
vld1.32 {d20-d21}, [r1,:128]!
vmovn.s32 d5, q9
vld1.32 {d22-d23}, [r1,:128]!
vmovn.s32 d6, q10
vld1.16 {d0-d3}, [r0,:128]
vmovn.s32 d7, q11
vmul.s16 q0, q0, q2
vmul.s16 q1, q1, q3
vshl.s16 q0, q0, q15
vshl.s16 q1, q1, q15
vst1.16 {d0-d3}, [r0,:128]!
.ifc \size, 8x8
bgt dequant_\size\()_lshift_loop
.endif
bx lr
dequant_\size\()_rshift:
// r3 is negative here. vshl by a negative per-lane count shifts right.
vdup.32 q15, r3
// ip = 1 << (-r3 - 1), the rounding term added before the right shift
rsb r3, r3, #0
mov ip, #1
sub r3, r3, #1
lsl ip, ip, r3
.ifc \size, 8x8
dequant_\size\()_rshift_loop:
subs r2, r2, #1
.endif
// q10-q13 start out holding the rounding term and accumulate the products
vdup.32 q10, ip
vld1.32 {d16-d17}, [r1,:128]!
vdup.32 q11, ip
vld1.32 {d18-d19}, [r1,:128]!
vmovn.s32 d4, q8
vld1.32 {d16-d17}, [r1,:128]!
vmovn.s32 d5, q9
vld1.32 {d18-d19}, [r1,:128]!
vmovn.s32 d6, q8
vld1.16 {d0-d3}, [r0,:128]
vmovn.s32 d7, q9
vdup.32 q12, ip
vdup.32 q13, ip
vmlal.s16 q10, d0, d4
vmlal.s16 q11, d1, d5
vmlal.s16 q12, d2, d6
vmlal.s16 q13, d3, d7
vshl.s32 q10, q10, q15
vshl.s32 q11, q11, q15
vshl.s32 q12, q12, q15
vshl.s32 q13, q13, q15
vmovn.s32 d0, q10
vmovn.s32 d1, q11
vmovn.s32 d2, q12
vmovn.s32 d3, q13
vst1.16 {d0-d3}, [r0,:128]!
.ifc \size, 8x8
bgt dequant_\size\()_rshift_loop
.endif
bx lr
endfunc
.endm
// bits is 4 for 4x4 (64-byte rows of 16 ints) and 6 for 8x8 (256-byte rows of 64 ints)
DEQUANT 4x4, 4
DEQUANT 8x8, 6
// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// Dequantizes 16 DC coefficients at r0 in place with one scalar multiplier,
// the first entry of the dequant_mf row selected by i_qp % 6.
// DEQUANT_START leaves that multiplier in r1 and r3 = i_qp / 6 - 6 with the
// flags set from the subtraction.
//   r3 >= 0: dct[i] = dct[i] * (mf << r3), computed in 16-bit lanes
//   r3 <  0: dct[i] = (dct[i] * mf + (1 << (-r3 - 1))) >> -r3, computed in
//            32-bit lanes and narrowed back to 16 bits
function dequant_4x4_dc_neon
    DEQUANT_START 6, 6, yes
    blt         dequant_4x4_dc_rshift

    // The shift is folded into the scalar multiplier, so no vector shift
    // (and no shift-count register) is needed on this path.
    lsl         r1,  r1,  r3
    vdup.16     q2,  r1
    vld1.16     {d0-d3},   [r0,:128]

    vmul.s16    q0,  q0,  q2
    vmul.s16    q1,  q1,  q2
    vst1.16     {d0-d3},   [r0,:128]
    bx          lr

dequant_4x4_dc_rshift:
    vdup.16     d4,  r1                 // multiplier in every 16-bit lane
    // r3 is negative here. vshl by a negative per-lane count shifts right.
    vdup.32     q15, r3
    // ip = 1 << (-r3 - 1), the rounding term added before the right shift
    rsb         r3,  r3,  #0
    mov         ip,  #1
    sub         r3,  r3,  #1
    lsl         ip,  ip,  r3

    // q10-q13 start out holding the rounding term and accumulate the products
    vdup.32     q10, ip
    vdup.32     q11, ip
    vld1.16     {d0-d3},   [r0,:128]
    vdup.32     q12, ip
    vdup.32     q13, ip

    vmlal.s16   q10, d0,  d4
    vmlal.s16   q11, d1,  d4
    vmlal.s16   q12, d2,  d4
    vmlal.s16   q13, d3,  d4
    vshl.s32    q10, q10, q15
    vshl.s32    q11, q11, q15
    vshl.s32    q12, q12, q15
    vshl.s32    q13, q13, q15

    vmovn.s32   d0,  q10
    vmovn.s32   d1,  q11
    vmovn.s32   d2,  q12
    vmovn.s32   d3,  q13
    vst1.16     {d0-d3},   [r0,:128]
    bx          lr
endfunc
// decimate_score_1x size: emits decimate_score15_neon / decimate_score16_neon.
// r0 = pointer to 16 int16 coefficients (all 16 are loaded in both variants).
// Returns in r0:
//   9 if any loaded coefficient has a magnitude above 1
//   0 if every coefficient is zero
//   otherwise the sum of decimate_table4[run] over the nonzero coefficients,
//   where run is the number of zero coefficients directly below it in index
// The size 15 variant drops the bit pair of coefficient 0 before scoring.
.macro decimate_score_1x size
function decimate_score\size\()_neon
vld1.16 {q0, q1}, [r0, :128]
movrel r3, mask_2bit
vmov.s8 q3, #0x01
// saturating narrow to 8 bits, q0 then holds all 16 coefficients
vqmovn.s16 d0, q0
vqmovn.s16 d1, q1
vqabs.s8 q2, q0
vld1.8 {q8}, [r3, :128]
vceq.s8 q1, q0, #0 // all ones where the coefficient is zero
vcgt.s8 q2, q2, q3 // all ones where the magnitude is above 1
vand.u8 q1, q1, q8 // keep two mask bits per zero coefficient
// reduce both masks so they fit a core register
vshrn.u16 d4, q2, #4
vpadd.u8 d2, d2, d3
vpadd.u8 d4, d4, d4
vpadd.u8 d2, d2, d2
vmov.32 r2, d4[0] // nonzero if any magnitude is above 1
vmov.32 r1, d2[0] // bits 2i and 2i+1 are set when coefficient i is zero
cmp r2, #0
beq 0f
mov r0, #9
bx lr
0:
mvns r1, r1 // invert: bit pairs now mark the nonzero coefficients
mov r0, #0
bxeq lr
.ifc \size, 15
lsr r1, r1, #2 // discard the bit pair of coefficient 0
.endif
rbit r1, r1 // lowest remaining index moves to the top bits
movrelx r3, X264(decimate_table4), r2
1:
clz r2, r1 // twice the number of zero coefficients before the next nonzero one
lsl r1, r1, r2
lsr r12, r2, #1 // run length
ldrb r2, [r3, r12]
lsls r1, r1, #2 // drop the coefficient just scored, Z is set when none remain
add r0, r0, r2
bne 1b
bx lr
endfunc
.endm
decimate_score_1x 15
decimate_score_1x 16
// decimate_score64_neon: r0 = pointer to 64 int16 coefficients.
// Returns in r0:
//   9 if any coefficient has a magnitude above 1
//   otherwise the sum of decimate_table8[run] over the nonzero coefficients,
//   where run is the number of zero coefficients directly below it in index
//   (0 when every coefficient is zero)
// The zero test is packed into two 32-bit words, one bit per coefficient with
// the lowest index in the most significant bit: r1 covers coefficients 0-31
// and r12 covers coefficients 32-63. lr is used as scratch and is therefore
// pushed on entry, and every exit pops it straight into pc.
function decimate_score64_neon
push {lr}
vld1.16 {q8, q9}, [r0, :128]!
vld1.16 {q10, q11}, [r0, :128]!
vld1.16 {q12, q13}, [r0, :128]!
vld1.16 {q14, q15}, [r0, :128]
movrel r3, mask_1bit
vmov.s8 q3, #0x01
// saturating narrow to 8 bits. The two halves of each q register are written
// in swapped order, which together with the descending mask_1bit produces the
// bit order described above after the pairwise adds.
vqmovn.s16 d17, q8
vqmovn.s16 d16, q9
vqmovn.s16 d19, q10
vqmovn.s16 d18, q11
vqmovn.s16 d21, q12
vqmovn.s16 d20, q13
vqmovn.s16 d23, q14
vqmovn.s16 d22, q15
vqabs.s8 q12, q8
vqabs.s8 q13, q9
vqabs.s8 q14, q10
vqabs.s8 q15, q11
vld1.8 {q2}, [r3, :128]
// all ones where the coefficient is zero
vceq.s8 q8, q8, #0
vceq.s8 q9, q9, #0
vceq.s8 q10, q10, #0
vceq.s8 q11, q11, #0
// lane-wise maximum magnitude over the four vectors ends up in q12
vmax.s8 q12, q12, q13
vmax.s8 q14, q14, q15
vand.u8 q8, q8, q2
vand.u8 q9, q9, q2
vand.u8 q10, q10, q2
vand.u8 q11, q11, q2
vmax.s8 q12, q12, q14
vpadd.u8 d18, d18, d19
vpadd.u8 d19, d16, d17
vcgt.s8 q12, q12, q3 // all ones where a magnitude is above 1
vpadd.u8 d22, d22, d23
vpadd.u8 d23, d20, d21
vshrn.u16 d24, q12, #4
vpadd.u8 d16, d22, d23
vpadd.u8 d17, d18, d19
vpadd.u8 d24, d24, d24
vpadd.u8 d16, d16, d17
vmov.32 r2, d24[0] // nonzero if any magnitude is above 1
vmov r12, r1, d16 // zero masks: r1 = coefficients 0-31, r12 = 32-63
cmp r2, #0
beq 0f
mov r0, #9
pop {pc}
0:
mvns r1, r1 // invert: set bits now mark nonzero coefficients
mvn r12, r12
mov r0, #0
mov lr, #32 // bits of r1 not yet consumed
// NOTE(review): the beq below relies on the Z flag from mvns surviving the
// movrelx expansion (defined in asm.S) -- confirm.
movrelx r3, X264(decimate_table8), r2
beq 2f
// score coefficients 0-31
1:
clz r2, r1 // zero run before the next nonzero coefficient
lsl r1, r1, r2
sub lr, lr, r2
ldrb r2, [r3, r2]
lsls r1, r1, #1 // drop the coefficient just scored
sub lr, lr, #1
add r0, r0, r2
bne 1b
// lr now holds the number of zero coefficients at the end of the first half
2:
cmp r12, #0
popeq {pc}
// first nonzero coefficient of the second half: its run continues across
// the boundary, so the leftover count from the first half is added
clz r2, r12
lsl r1, r12, r2
add r2, r2, lr
ldrb r2, [r3, r2]
lsls r1, r1, #1
add r0, r0, r2
popeq {pc}
// remaining coefficients of the second half
3:
clz r2, r1
lsl r1, r1, r2
ldrb r2, [r3, r2]
lsls r1, r1, #1
add r0, r0, r2
bne 3b
pop {pc}
endfunc
// int coeff_last( int16_t *l )
// coeff_last4_arm: returns in r0 the index of the last nonzero coefficient
// among the four int16 values at r0, or 0 when all of them are zero.
// The four values are read as two 32-bit words. The odd-indexed coefficient
// is taken from the upper halfword of each word, i.e. little-endian layout
// is assumed.
function coeff_last4_arm
ldrd r2, r3, [r0] // r2 = coefficients 0 and 1, r3 = coefficients 2 and 3
subs r0, r3, #0 // r0 = r3, flags tell whether the upper pair is nonzero
movne r0, #2 // upper pair nonzero: the answer is 2 or 3
movne r2, r3 // ... and the upper pair is the word left to examine
lsrs r2, r2, #16 // is the odd-indexed coefficient of that word nonzero
addne r0, r0, #1
bx lr
endfunc
// coeff_last8_arm: returns in r0 the index of the last nonzero coefficient
// among the eight int16 values at r0, or 0 when all of them are zero.
// Coefficients 4-7 are examined first. Only when they are all zero are
// coefficients 0-3 loaded instead. The odd-indexed coefficient is taken from
// the upper halfword of each word, i.e. little-endian layout is assumed.
function coeff_last8_arm
ldrd r2, r3, [r0, #8] // coefficients 4-7
orrs ip, r2, r3
movne r0, #4 // upper half has a nonzero value: base index 4
ldrdeq r2, r3, [r0] // otherwise fall back to coefficients 0-3 (r0 is still the pointer)
moveq r0, #0
tst r3, r3 // upper pair of the selected half
addne r0, #2
movne r2, r3
lsrs r2, r2, #16 // odd-indexed coefficient of the selected pair
addne r0, r0, #1
bx lr
endfunc
// COEFF_LAST_1x size: emits coeff_last15_neon / coeff_last16_neon.
// r0 = pointer to the coefficients. Returns in r0 the index of the last
// nonzero coefficient, or 0 when none is found.
// The size 15 variant steps the pointer back by one int16 and still loads 16
// values, so the value in front of the array is part of the scan and every
// index is reduced by one (NOTE(review): presumably done to keep the
// 128-bit aligned load -- confirm against the callers).
.macro COEFF_LAST_1x size
function coeff_last\size\()_neon
.if \size == 15
sub r0, r0, #2
.endif
vld1.64 {d0-d3}, [r0,:128]
vtst.16 q0, q0 // all ones in every nonzero 16-bit lane
vtst.16 q1, q1
// narrow twice so that each coefficient is represented by 4 bits of d0
vshrn.u16 d0, q0, #8
vshrn.u16 d1, q1, #8
vshrn.u16 d0, q0, #4
vclz.i32 d0, d0 // leading zero count of each 32-bit half (8 coefficients each)
mov ip, #7
mov r3, #\size - 9
vmov r0, r1, d0 // r0 = count for the lower 8 values, r1 = count for the upper 8
subs r1, ip, r1, lsr #2 // position inside the upper half, negative if it is all zero
addge r0, r1, #\size - 8
subslt r0, r3, r0, lsr #2 // otherwise use the lower half
movlt r0, #0 // nothing found at all
bx lr
endfunc
.endm
COEFF_LAST_1x 15
COEFF_LAST_1x 16
// coeff_last64_neon: r0 = pointer to 64 int16 coefficients.
// Returns in r0 the index of the last nonzero coefficient, or 0 when all of
// them are zero.
// Builds a 64-bit mask in d0 with bit i set when coefficient i is nonzero,
// then uses a leading zero count on each 32-bit half.
function coeff_last64_neon
// Load and narrow to bytes. The narrowing is an unsigned saturating one, so
// a nonzero 16-bit pattern always yields a nonzero byte.
vld1.64 {d16-d19}, [r0,:128]!
vqmovn.u16 d16, q8
vqmovn.u16 d17, q9
vld1.64 {d20-d23}, [r0,:128]!
vqmovn.u16 d18, q10
vqmovn.u16 d19, q11
vld1.64 {d24-d27}, [r0,:128]!
vqmovn.u16 d20, q12
vqmovn.u16 d21, q13
vld1.64 {d28-d31}, [r0,:128]!
vqmovn.u16 d22, q14
vqmovn.u16 d23, q15
movrel r1, pmovmskb_byte
vld1.64 {d0-d1}, [r1,:128]
// all ones in every nonzero byte lane
vtst.8 q8, q8
vtst.8 q9, q9
vtst.8 q10, q10
vtst.8 q11, q11
// keep one distinct bit per lane, then add neighbours until 64 bits remain
vand q8, q8, q0
vand q9, q9, q0
vand q10, q10, q0
vand q11, q11, q0
vpadd.u8 d0, d16, d17
vpadd.u8 d1, d18, d19
vpadd.u8 d2, d20, d21
vpadd.u8 d3, d22, d23
vpadd.u8 d0, d0, d1
vpadd.u8 d1, d2, d3
vpadd.u8 d0, d0, d1
vclz.i32 d0, d0
mov ip, #31
vmov r0, r1, d0 // r0 = count for coefficients 0-31, r1 = count for 32-63
subs r1, ip, r1 // position inside the upper half, negative if it is all zero
addge r0, r1, #32
subslt r0, ip, r0 // otherwise use the lower half
movlt r0, #0 // nothing found at all
bx lr
endfunc
// denoise_dct_neon: r0 = coefficients (16-bit), r1 = running sums (32-bit),
// r2 = offsets (16-bit), r3 = number of coefficients.
// For every coefficient, 16 per iteration:
//   sum[i] += abs(dct[i])
//   dct[i]  = sign(dct[i]) * max(abs(dct[i]) - offset[i], 0)
// The loop body always runs at least once and repeats while the remaining
// count is positive.
function denoise_dct_neon
1: subs r3, r3, #16
vld1.16 {q0, q1}, [r0]
vld1.32 {q12, q13}, [r1]!
vld1.32 {q14, q15}, [r1]
sub r1, #32 // rewind to the start of this group of sums
vabs.s16 q8, q0
vabs.s16 q9, q1
vld1.16 {q2, q3}, [r2]!
vclt.s16 q10, q0, #0 // all ones where the coefficient is negative
vclt.s16 q11, q1, #0
vaddw.u16 q12, q12, d16 // widen the magnitudes and add them to the sums
vaddw.u16 q13, q13, d17
vqsub.u16 q0, q8, q2 // saturating subtract clamps at zero
vqsub.u16 q1, q9, q3
vaddw.u16 q14, q14, d18
vaddw.u16 q15, q15, d19
vneg.s16 q8, q0
vneg.s16 q9, q1
vbsl q10, q8, q0 // pick the negated value where the input was negative
vbsl q11, q9, q1
vst1.32 {q12, q13}, [r1]!
vst1.32 {q14, q15}, [r1]!
vst1.16 {q10, q11}, [r0]!
bgt 1b
bx lr
endfunc

71
common/arm/quant.h Normal file
View File

@@ -0,0 +1,71 @@
/*****************************************************************************
* quant.h: arm quantization and level-run
*****************************************************************************
* Copyright (C) 2005-2025 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_ARM_QUANT_H
#define X264_ARM_QUANT_H
/* Prototypes for the ARM quantization routines. The dct array is quantized
 * in place; the int return value reports whether nonzero coefficients remain
 * (for the 4x4x4 variant, one bit per 4x4 block).
 * The _dc variants take a scalar mf and bias, the others take per-coefficient
 * mf[] and bias[] tables.
 * Symbol names are routed through x264_template(), which is defined outside
 * this header. */
#define x264_quant_2x2_dc_armv6 x264_template(quant_2x2_dc_armv6)
int x264_quant_2x2_dc_armv6( int16_t dct[4], int mf, int bias );
#define x264_quant_2x2_dc_neon x264_template(quant_2x2_dc_neon)
int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
#define x264_quant_4x4_dc_neon x264_template(quant_4x4_dc_neon)
int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
#define x264_quant_4x4_neon x264_template(quant_4x4_neon)
int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
#define x264_quant_4x4x4_neon x264_template(quant_4x4x4_neon)
int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
#define x264_quant_8x8_neon x264_template(quant_8x8_neon)
int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
/* Prototypes for the NEON dequantization routines. dct is updated in place,
 * dequant_mf holds six rows of 32-bit multipliers and i_qp is the quantizer
 * from which the row and the shift are derived.
 * Symbol names are routed through x264_template(), which is defined outside
 * this header. */
#define x264_dequant_4x4_dc_neon x264_template(dequant_4x4_dc_neon)
void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_4x4_neon x264_template(dequant_4x4_neon)
void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_8x8_neon x264_template(dequant_8x8_neon)
void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
/* Prototypes for the ARM coefficient analysis routines:
 *   decimate_score15/16/64: return a score computed from a coefficient block
 *   coeff_last4/8/15/16/64: return the index of the last nonzero coefficient
 *   denoise_dct:            takes a coefficient array, a uint32_t array, a
 *                           udctcoef array and an int (NOTE(review): per the
 *                           assembly these are the running sums, the offsets
 *                           and the coefficient count -- confirm).
 * Symbol names are routed through x264_template(), which is defined outside
 * this header. */
#define x264_decimate_score15_neon x264_template(decimate_score15_neon)
int x264_decimate_score15_neon( int16_t * );
#define x264_decimate_score16_neon x264_template(decimate_score16_neon)
int x264_decimate_score16_neon( int16_t * );
#define x264_decimate_score64_neon x264_template(decimate_score64_neon)
int x264_decimate_score64_neon( int16_t * );
#define x264_coeff_last4_arm x264_template(coeff_last4_arm)
int x264_coeff_last4_arm( int16_t * );
#define x264_coeff_last8_arm x264_template(coeff_last8_arm)
int x264_coeff_last8_arm( int16_t * );
#define x264_coeff_last15_neon x264_template(coeff_last15_neon)
int x264_coeff_last15_neon( int16_t * );
#define x264_coeff_last16_neon x264_template(coeff_last16_neon)
int x264_coeff_last16_neon( int16_t * );
#define x264_coeff_last64_neon x264_template(coeff_last64_neon)
int x264_coeff_last64_neon( int16_t * );
#define x264_denoise_dct_neon x264_template(denoise_dct_neon)
void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
#endif

1567
common/base.c Normal file

File diff suppressed because it is too large Load Diff

339
common/base.h Normal file
View File

@@ -0,0 +1,339 @@
/*****************************************************************************
* base.h: misc common functions (bit depth independent)
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_BASE_H
#define X264_BASE_H
/****************************************************************************
* Macros (can be used in osdep.h)
****************************************************************************/
/* Minimum / maximum of two, three or four values. These are plain macros:
 * every argument is parenthesised, but an argument may be evaluated more
 * than once, so do not pass expressions with side effects. */
#define X264_MIN(x,y) ( (x)<(y) ? (x) : (y) )
#define X264_MAX(x,y) ( (x)>(y) ? (x) : (y) )
#define X264_MIN3(x,y,z) X264_MIN((x),X264_MIN((y),(z)))
#define X264_MAX3(x,y,z) X264_MAX((x),X264_MAX((y),(z)))
#define X264_MIN4(w,x,y,z) X264_MIN((w),X264_MIN3((x),(y),(z)))
#define X264_MAX4(w,x,y,z) X264_MAX((w),X264_MAX3((x),(y),(z)))
/****************************************************************************
* System includes
****************************************************************************/
#include "osdep.h"
#include <stdarg.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <limits.h>
/****************************************************************************
* Macros
****************************************************************************/
/* XCHG: swap two lvalues of the given type through a temporary. The
 * temporary is named t, so neither operand may itself be spelled t. */
#define XCHG(type,a,b) do { type t = (a); (a) = (b); (b) = t; } while( 0 )
/* FIX8: convert a real value to 8.8 fixed point, rounding to nearest for
 * non-negative inputs. The argument is parenthesised so that compound
 * expressions such as FIX8(a+b) scale the whole expression. */
#define FIX8(f) ((int)((f)*(1<<8)+.5))
/* ARRAY_ELEMS: element count of a true array (not a pointer), as an int. */
#define ARRAY_ELEMS(a) ((int)((sizeof(a))/(sizeof((a)[0]))))
/* ALIGN: round x up to the next multiple of a; a must be a power of two. */
#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
/* IS_DISPOSABLE: true when the frame type is X264_TYPE_B. */
#define IS_DISPOSABLE(type) ( (type) == X264_TYPE_B )
/* Unions for type-punning.
 * Mn: load or store n bits, aligned, native-endian
 * CPn: copy n bits, aligned, native-endian
 * we don't use memcpy for CPn because memcpy's args aren't assumed to be aligned
 *
 * Each union is tagged MAY_ALIAS (defined outside this header) and exposes the
 * same storage as one integer (.i) and as arrays of narrower integers.
 * The 128-bit integer is emulated with a struct of two uint64_t, so M128()
 * yields a struct: it can be assigned and copied but not used in arithmetic.
 * The CPn macros expand to a bare assignment expression without enclosing
 * parentheses; use them as statements only. */
typedef union { uint16_t i; uint8_t b[2]; } MAY_ALIAS x264_union16_t;
typedef union { uint32_t i; uint16_t w[2]; uint8_t b[4]; } MAY_ALIAS x264_union32_t;
typedef union { uint64_t i; uint32_t d[2]; uint16_t w[4]; uint8_t b[8]; } MAY_ALIAS x264_union64_t;
typedef struct { uint64_t i[2]; } x264_uint128_t;
typedef union { x264_uint128_t i; uint64_t q[2]; uint32_t d[4]; uint16_t w[8]; uint8_t b[16]; } MAY_ALIAS x264_union128_t;
#define M16(src) (((x264_union16_t*)(src))->i)
#define M32(src) (((x264_union32_t*)(src))->i)
#define M64(src) (((x264_union64_t*)(src))->i)
#define M128(src) (((x264_union128_t*)(src))->i)
/* All-zero 128-bit value, written as a compound literal */
#define M128_ZERO ((x264_uint128_t){{0,0}})
#define CP16(dst,src) M16(dst) = M16(src)
#define CP32(dst,src) M32(dst) = M32(src)
#define CP64(dst,src) M64(dst) = M64(src)
#define CP128(dst,src) M128(dst) = M128(src)
/* Macros for memory constraints of inline asm.
 * MEM_FIX(x, t, s): treat the memory at pointer x as s elements of type t.
 * MEM_DYN(x, t):    same, for a region whose size is not known at compile time.
 * Both expand to an lvalue that names the whole region. */
#if defined(__GNUC__) && __GNUC__ >= 8 && !defined(__clang__) && !defined(__INTEL_COMPILER)
/* GCC 8 and later (excluding clang and the Intel compiler): cast to a pointer to array */
#define MEM_FIX(x, t, s) (*(t (*)[s])(x))
#define MEM_DYN(x, t) (*(t (*)[])(x))
#else
//older versions of gcc prefer casting to structure instead of array
#define MEM_FIX(x, t, s) (*(struct { t a[s]; } MAY_ALIAS (*))(x))
//let's set an arbitrary large constant size
#define MEM_DYN(x, t) MEM_FIX(x, t, 4096)
#endif
/****************************************************************************
* Constants
****************************************************************************/
/* Encoder profile identifiers.
 * NOTE(review): the numeric values look like the H.264 profile_idc codes --
 * confirm against the specification before relying on that. */
enum profile_e
{
PROFILE_BASELINE = 66,
PROFILE_MAIN = 77,
PROFILE_HIGH = 100,
PROFILE_HIGH10 = 110,
PROFILE_HIGH422 = 122,
PROFILE_HIGH444_PREDICTIVE = 244,
};
/* Chroma sampling formats, numbered 0 to 3 in the order 4:0:0 (no chroma),
 * 4:2:0, 4:2:2, 4:4:4 as spelled in the enumerator names. */
enum chroma_format_e
{
CHROMA_400 = 0,
CHROMA_420 = 1,
CHROMA_422 = 2,
CHROMA_444 = 3,
};
/* Slice types. The values double as indices into slice_type_to_char below,
 * so the two definitions must be kept in the same order. */
enum slice_type_e
{
SLICE_TYPE_P = 0,
SLICE_TYPE_B = 1,
SLICE_TYPE_I = 2,
};
/* One-letter name of each slice type, indexed by enum slice_type_e.
 * Being static in a header, every including translation unit gets its own copy. */
static const char slice_type_to_char[] = { 'P', 'B', 'I' };
/* SEI message payload type codes used by the encoder.
 * NOTE(review): the values look like the payloadType numbers assigned by the
 * H.264 specification -- confirm there before adding new entries. */
enum sei_payload_type_e
{
SEI_BUFFERING_PERIOD = 0,
SEI_PIC_TIMING = 1,
SEI_PAN_SCAN_RECT = 2,
SEI_FILLER = 3,
SEI_USER_DATA_REGISTERED = 4,
SEI_USER_DATA_UNREGISTERED = 5,
SEI_RECOVERY_POINT = 6,
SEI_DEC_REF_PIC_MARKING = 7,
SEI_FRAME_PACKING = 45,
SEI_MASTERING_DISPLAY = 137,
SEI_CONTENT_LIGHT_LEVEL = 144,
SEI_ALTERNATIVE_TRANSFER = 147,
};
/* Compile-time upper limits, named after what they bound: consecutive
 * B-frames, reference frames, threads, lookahead threads and lookahead depth. */
#define X264_BFRAME_MAX 16
#define X264_REF_MAX 16
#define X264_THREAD_MAX 128
#define X264_LOOKAHEAD_THREAD_MAX 16
#define X264_LOOKAHEAD_MAX 250
// number of pixels (per thread) in progress at any given time.
// 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
// (16 + 3 + 3 + 2 = 24)
#define X264_THREAD_HEIGHT 24
/* WEIGHTP_FAKE is set when mb_tree & psy are enabled, but normal weightp is disabled
 * (such as in baseline). It checks for fades in lookahead and adjusts qp accordingly
 * to increase quality. Defined as (-1) so that if(i_weighted_pred > 0) is true only when
 * real weights are being used. */
#define X264_WEIGHTP_FAKE (-1)
/* Scan8 geometry: each plane occupies 5 rows of 8 entries, three planes in
 * total. X264_SCAN8_0 is column 4 of row 1, the position of the first luma
 * block in the layout. */
#define X264_SCAN8_LUMA_SIZE (5*8)
#define X264_SCAN8_SIZE (X264_SCAN8_LUMA_SIZE*3)
#define X264_SCAN8_0 (4+1*8)
/* Scan8 organization:
* 0 1 2 3 4 5 6 7
* 0 DY y y y y y
* 1 y Y Y Y Y
* 2 y Y Y Y Y
* 3 y Y Y Y Y
* 4 y Y Y Y Y
* 5 DU u u u u u
* 6 u U U U U
* 7 u U U U U
* 8 u U U U U
* 9 u U U U U
* 10 DV v v v v v
* 11 v V V V V
* 12 v V V V V
* 13 v V V V V
* 14 v V V V V
* DY/DU/DV are for luma/chroma DC.
*
* Every table entry below is written as column + row*8 of this grid.
* Entries 0-15 are the Y blocks, 16-31 the U blocks and 32-47 the V blocks,
* each plane listed in groups of four that form a 2x2 square.
* Entries 48, 49 and 50 are the DY, DU and DV cells.
*/
/* Indices into x264_scan8 of the DC entries: LUMA_DC selects DY, CHROMA_DC
 * selects DU and CHROMA_DC+1 selects DV. */
#define LUMA_DC 48
#define CHROMA_DC 49
static const uint8_t x264_scan8[16*3 + 3] =
{
4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8,
6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8,
4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8,
6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8,
4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8,
6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8,
4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8,
6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8,
4+11*8, 5+11*8, 4+12*8, 5+12*8,
6+11*8, 7+11*8, 6+12*8, 7+12*8,
4+13*8, 5+13*8, 4+14*8, 5+14*8,
6+13*8, 7+13*8, 6+14*8, 7+14*8,
0+ 0*8, 0+ 5*8, 0+10*8
};
/****************************************************************************
* Includes
****************************************************************************/
#include "cpu.h"
#include "tables.h"
/****************************************************************************
* Inline functions
****************************************************************************/
/* Clamp v to the inclusive range [i_min, i_max]. The lower bound is checked
 * first, so when the bounds are inverted and v lies below i_min the result
 * is i_min. */
static ALWAYS_INLINE int x264_clip3( int v, int i_min, int i_max )
{
    if( v < i_min )
        return i_min;
    if( v > i_max )
        return i_max;
    return v;
}
/* Floating-point counterpart of x264_clip3: clamp v to [f_min, f_max], lower
 * bound checked first. A NaN input fails both comparisons and is returned
 * unchanged. */
static ALWAYS_INLINE double x264_clip3f( double v, double f_min, double f_max )
{
    if( v < f_min )
        return f_min;
    if( v > f_max )
        return f_max;
    return v;
}
/* Not a general-purpose function; multiplies input by -1/6 to convert
 * qp to qscale.
 * The scaled input becomes a fixed-point index with 6 fractional bits (biased
 * by 512 and rounded). The fractional part selects an x264_exp2_lut entry and
 * the integer part becomes a left shift. Indices below the range yield 0 and
 * indices above it yield 0xffff. */
static ALWAYS_INLINE int x264_exp2fix8( float x )
{
    int idx = x*(-64.f/6.f) + 512.5f;
    if( idx < 0 )
        return 0;
    if( idx > 1023 )
        return 0xffff;
    int frac = idx & 63;
    int shift = idx >> 6;
    return (x264_exp2_lut[frac]+256) << shift >> 8;
}
/* Table-driven log2 of a 32-bit unsigned value. The input is shifted left by
 * its leading-zero count; seven of the top bits of the result index
 * x264_log2_lut and the leading-zero count indexes x264_log2_lz_lut. The two
 * table values are added.
 * NOTE(review): x264_clz is defined elsewhere; whether x == 0 is a valid
 * input depends on it -- confirm. */
static ALWAYS_INLINE float x264_log2( uint32_t x )
{
    int lz = x264_clz( x );
    uint32_t normalized = x << lz;
    return x264_log2_lut[(normalized>>24)&0x7f] + x264_log2_lz_lut[lz];
}
/* Branch-free median of three ints.
 * The idiom d & (d >> 31) yields d when d is negative and 0 otherwise (it
 * relies on 32-bit int and an arithmetic right shift), so adding or
 * subtracting it performs a conditional min/max. */
static ALWAYS_INLINE int x264_median( int a, int b, int c )
{
    int diff_ab = a - b;
    int neg_ab = diff_ab & (diff_ab >> 31);
    a -= neg_ab;                        /* a = max(a, b) */
    b += neg_ab;                        /* b = min(a, b) */

    int diff_bc = b - c;
    b -= diff_bc & (diff_bc >> 31);     /* b = max(b, c) */

    int diff_hi = a - b;
    b += diff_hi & (diff_hi >> 31);     /* b = min(a, b) */
    return b;
}
/* Component-wise median of three two-element vectors: dst[k] receives the
 * median of a[k], b[k] and c[k] for k = 0, 1. */
static ALWAYS_INLINE void x264_median_mv( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
{
    for( int k = 0; k < 2; k++ )
        dst[k] = x264_median( a[k], b[k], c[k] );
}
/* Sum of absolute differences between each pair of consecutive entries in
 * mvc[0..i_mvc-1], taken over both components. Returns 0 when there are
 * fewer than two entries. */
static ALWAYS_INLINE int x264_predictor_difference( int16_t (*mvc)[2], intptr_t i_mvc )
{
    int total = 0;
    for( int next = 1; next < i_mvc; next++ )
    {
        const int16_t *cur = mvc[next-1];
        const int16_t *nxt = mvc[next];
        total += abs( cur[0] - nxt[0] )
               + abs( cur[1] - nxt[1] );
    }
    return total;
}
/* For each of the two components, add the left and top byte values and
 * classify the sum as 0 (at most 2), 1 (3 to 32) or 2 (above 32).
 * The class of component 0 is returned in the low byte and the class of
 * component 1 in the next byte. */
static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvdtop )
{
    int sum0 = mvdleft[0] + mvdtop[0];
    int sum1 = mvdleft[1] + mvdtop[1];
    int class0 = (sum0 > 2) + (sum0 > 32);
    int class1 = (sum1 > 2) + (sum1 > 32);
    return class0 + (class1<<8);
}
/****************************************************************************
* General functions
****************************************************************************/
/* x264_reduce_fraction / x264_reduce_fraction64: take a numerator and a
 * denominator by pointer and update both in place
 * (NOTE(review): presumably division by their greatest common divisor --
 * confirm in base.c). */
X264_API void x264_reduce_fraction( uint32_t *n, uint32_t *d );
X264_API void x264_reduce_fraction64( uint64_t *n, uint64_t *d );
/* x264_log_default: log callback taking a printf-style format and a va_list;
 * the first argument is unused, as its name says. */
X264_API void x264_log_default( void *p_unused, int i_level, const char *psz_fmt, va_list arg );
/* x264_log_internal: variadic logging entry point that takes no context
 * pointer. */
X264_API void x264_log_internal( int i_level, const char *psz_fmt, ... );
/* x264_malloc: will do or emulate a memalign
* you have to use x264_free for buffers allocated with x264_malloc */
X264_API void *x264_malloc( int64_t );
X264_API void x264_free( void * );
/* x264_slurp_file: malloc space for the whole file and read it */
X264_API char *x264_slurp_file( const char *filename );
/* x264_param_strdup: will do strdup and save returned pointer inside
* x264_param_t for later freeing during x264_param_cleanup.
* Unlike its neighbours this prototype carries no X264_API marker. */
char *x264_param_strdup( x264_param_t *param, const char *src );
/* x264_param2string: return a (malloced) string containing most of
* the encoding options */
X264_API char *x264_param2string( x264_param_t *p, int b_res );
/****************************************************************************
* Macros
****************************************************************************/
/* Allocation helpers with built-in error handling. All three jump to a label
 * named fail on error, so the enclosing function must define that label. */
/* CHECKED_MALLOC: var = x264_malloc( size ), or goto fail when that returns NULL. */
#define CHECKED_MALLOC( var, size )\
do {\
var = x264_malloc( size );\
if( !var )\
goto fail;\
} while( 0 )
/* CHECKED_MALLOCZERO: as CHECKED_MALLOC, then zero-fill the buffer.
 * The size argument is expanded twice, so it must be free of side effects. */
#define CHECKED_MALLOCZERO( var, size )\
do {\
CHECKED_MALLOC( var, size );\
memset( var, 0, size );\
} while( 0 )
/* CHECKED_PARAM_STRDUP: var = x264_param_strdup( param, src ), or goto fail
 * when that returns NULL. */
#define CHECKED_PARAM_STRDUP( var, param, src )\
do {\
var = x264_param_strdup( param, src );\
if( !var )\
goto fail;\
} while( 0 )
/* Macros for merging multiple allocations into a single large malloc, for improved
* use with huge pages.
*
* Usage, all within one function:
*   PREALLOC_INIT              declares the bookkeeping locals
*   PREALLOC( var, size )      once per buffer: stores the buffer's byte offset
*                              in var (disguised as a pointer), remembers the
*                              address of var and grows the total by size
*                              rounded up to NATIVE_ALIGN
*   PREALLOC_END( ptr )        allocates the total with CHECKED_MALLOC (so a
*                              fail label is required) and turns every recorded
*                              offset into a real pointer by adding ptr
* Until PREALLOC_END has run, the var pointers hold offsets and must not be
* dereferenced. PREALLOC does not check prealloc_idx against
* PREALLOC_BUF_SIZE; the caller must stay within that many buffers. */
/* Needs to be enough to contain any set of buffers that use combined allocations */
#define PREALLOC_BUF_SIZE 1024
#define PREALLOC_INIT\
int prealloc_idx = 0;\
int64_t prealloc_size = 0;\
uint8_t **preallocs[PREALLOC_BUF_SIZE];
#define PREALLOC( var, size )\
do {\
var = (void*)(intptr_t)prealloc_size;\
preallocs[prealloc_idx++] = (uint8_t**)&var;\
prealloc_size += ALIGN((int64_t)(size), NATIVE_ALIGN);\
} while( 0 )
#define PREALLOC_END( ptr )\
do {\
CHECKED_MALLOC( ptr, prealloc_size );\
while( prealloc_idx-- )\
*preallocs[prealloc_idx] = (uint8_t*)((intptr_t)(*preallocs[prealloc_idx]) + (intptr_t)ptr);\
} while( 0 )
#endif

166
common/bitstream.c Normal file
View File

@@ -0,0 +1,166 @@
/*****************************************************************************
* bitstream.c: bitstream writing
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Fiona Glaser <fiona@x264.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common.h"
/* Copy [src, end) to dst, inserting an 0x03 byte in front of any input byte
 * that is <= 0x03 and follows two zero bytes already written to the output.
 * The first two input bytes are always copied verbatim.
 * Returns the position one past the last byte written.
 * Note that the check looks at the output, so an inserted 0x03 breaks a run
 * of zeros. */
static uint8_t *nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
{
    /* the first two bytes can never need escaping */
    for( int copied = 0; copied < 2 && src < end; copied++ )
        *dst++ = *src++;

    for( ; src < end; src++ )
    {
        uint8_t cur = *src;
        if( cur <= 0x03 && !dst[-2] && !dst[-1] )
            *dst++ = 0x03;
        *dst++ = cur;
    }
    return dst;
}
#if HAVE_MMX
#include "x86/bitstream.h"
#endif
#if HAVE_ARMV6
#include "arm/bitstream.h"
#endif
#if HAVE_AARCH64
#include "aarch64/bitstream.h"
#endif
/****************************************************************************
 * x264_nal_encode:
 *   Serialise one NAL unit into dst: a start code (Annex B) or a 4-byte
 *   big-endian size field, the one-byte NAL header, then the payload passed
 *   through h->bsf.nal_escape. With an AVC-Intra class set, zero padding is
 *   appended. On return nal->p_payload points at dst and nal->i_payload is
 *   the total number of bytes written.
 ****************************************************************************/
void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal )
{
    uint8_t *const out_start = dst;
    uint8_t *payload = nal->p_payload;
    uint8_t *payload_end = payload + nal->i_payload;

    if( !h->param.b_annexb )
        dst += 4; /* save room for size later */
    else
    {
        /* 00 00 01 start code, with one extra leading 00 for the long form */
        int zeros = nal->b_long_startcode ? 3 : 2;
        while( zeros-- )
            *dst++ = 0x00;
        *dst++ = 0x01;
    }

    /* nal header */
    *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;

    dst = h->bsf.nal_escape( dst, payload, payload_end );
    int size = dst - out_start;

    /* Apply AVC-Intra padding */
    if( h->param.i_avcintra_class )
    {
        int padding = nal->i_payload + nal->i_padding + NALU_OVERHEAD - size;
        if( padding > 0 )
        {
            memset( dst, 0, padding );
            size += padding;
        }
        else
            padding = 0;
        nal->i_padding = padding;
    }

    /* Write the size header for mp4/etc */
    if( !h->param.b_annexb )
    {
        /* Size doesn't include the size of the header we're writing now. */
        int chunk_size = size - 4;
        for( int i = 0; i < 4; i++ )
            out_start[i] = (uint8_t)(chunk_size >> (24 - 8*i));
    }

    nal->i_payload = size;
    nal->p_payload = out_start;
    x264_emms();
}
/* Fill the bitstream function table pf for the given CPU flag word.
 * The table is zeroed first and nal_escape is pointed at the C
 * implementation. Architecture-specific routines then replace entries
 * according to the compile-time HAVE_x / ARCH_x switches and the runtime cpu
 * flags. Assignments are ordered so that a later, more specific match
 * overrides an earlier one; keep that order when adding new variants. */
void x264_bitstream_init( uint32_t cpu, x264_bitstream_function_t *pf )
{
memset( pf, 0, sizeof(*pf) );
pf->nal_escape = nal_escape_c;
#if HAVE_MMX
#if ARCH_X86_64
/* x86-64 baseline: the SSE2 residual coders are installed without a cpu flag test */
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2;
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2;
#endif
if( cpu&X264_CPU_MMX2 )
pf->nal_escape = x264_nal_escape_mmx2;
if( cpu&X264_CPU_SSE2 )
{
/* the SSE2 escape routine is only selected on CPUs flagged as fast at SSE2 */
if( cpu&X264_CPU_SSE2_IS_FAST )
pf->nal_escape = x264_nal_escape_sse2;
}
#if ARCH_X86_64
if( cpu&X264_CPU_LZCNT )
{
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_lzcnt;
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_lzcnt;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_lzcnt;
}
if( cpu&X264_CPU_SSSE3 )
{
/* only the two rd variants have SSSE3 versions; the plain residual coder is left as is */
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3;
if( cpu&X264_CPU_LZCNT )
{
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3_lzcnt;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt;
}
}
if( cpu&X264_CPU_AVX2 )
{
pf->nal_escape = x264_nal_escape_avx2;
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2;
}
if( cpu&X264_CPU_AVX512 )
{
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx512;
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_avx512;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_avx512;
}
#endif
#endif
#if HAVE_ARMV6
if( cpu&X264_CPU_NEON )
pf->nal_escape = x264_nal_escape_neon;
#endif
#if HAVE_AARCH64
if( cpu&X264_CPU_NEON )
pf->nal_escape = x264_nal_escape_neon;
#endif
}

309
common/bitstream.h Normal file
View File

@@ -0,0 +1,309 @@
/*****************************************************************************
* bitstream.h: bitstream writing
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
* Fiona Glaser <fiona@x264.com>
* Laurent Aimar <fenrir@via.ecp.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_BS_H
#define X264_BS_H
/* Element type of the x264_level_token table declared later in this header.
 * NOTE(review): i_bits/i_size look like a code word and its bit length --
 * confirm against the code that fills and reads the table. */
typedef struct
{
uint16_t i_bits;
uint8_t i_size;
/* Next level table to use */
uint8_t i_next;
} vlc_large_t;
/* Bit writer state used by the bs_* helpers in this header. */
typedef struct bs_s
{
uint8_t *p_start; /* word-aligned start of the output, set by bs_init */
uint8_t *p; /* position where the next word will be stored */
uint8_t *p_end; /* p_data + i_data as passed to bs_init */
uintptr_t cur_bits; /* bits accumulated but not yet stored at p */
int i_left; /* number of bits still available in cur_bits */
int i_bits_encoded; /* RD only */
} bs_t;
/* NOTE(review): presumably the run/level decomposition of a coefficient
 * block (index of the last nonzero coefficient, a nonzero bitmask and the
 * nonzero levels) -- confirm against the code that produces it. */
typedef struct
{
int32_t last;
int32_t mask;
ALIGNED_16( dctcoef level[18] );
} x264_run_level_t;
/* Table of bitstream function pointers; filled in by x264_bitstream_init()
 * according to the CPU capability flags it is given. */
typedef struct
{
/* Copies [src,end) to dst and returns the new end of dst. */
uint8_t *(*nal_escape)( uint8_t *dst, uint8_t *src, uint8_t *end );
void (*cabac_block_residual_internal)( dctcoef *l, int b_interlaced,
intptr_t ctx_block_cat, x264_cabac_t *cb );
void (*cabac_block_residual_rd_internal)( dctcoef *l, int b_interlaced,
intptr_t ctx_block_cat, x264_cabac_t *cb );
void (*cabac_block_residual_8x8_rd_internal)( dctcoef *l, int b_interlaced,
intptr_t ctx_block_cat, x264_cabac_t *cb );
} x264_bitstream_function_t;
/* Each public name below is first remapped through x264_template() and then
 * declared under that remapped name. */
#define x264_bitstream_init x264_template(bitstream_init)
void x264_bitstream_init( uint32_t cpu, x264_bitstream_function_t *pf );
/* A larger level table size theoretically could help a bit at extremely
 * high bitrates, but the cost in cache is usually too high for it to be
 * useful.
 * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
 * FIXME: Do further testing? */
#define LEVEL_TABLE_SIZE 128
#define x264_level_token x264_template(level_token)
extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
/* The longest possible set of zero run codes sums to 25 bits. This leaves
 * plenty of room for both the code (25 bits) and size (5 bits) in a uint32_t. */
#define x264_run_before x264_template(run_before)
extern uint32_t x264_run_before[1<<16];
/* Point the writer at the buffer p_data of i_data bytes. The write position
 * is rounded down to a 4-byte boundary; when p_data is not aligned, the bytes
 * that already sit before it in that word are loaded into cur_bits so they
 * are kept when the word is stored again. */
static inline void bs_init( bs_t *s, void *p_data, int i_data )
{
    uint8_t *data = (uint8_t*)p_data;
    int misalign = ((intptr_t)p_data & 3);

    s->p_start = data - misalign;
    s->p = s->p_start;
    s->p_end = data + i_data;
    s->i_left = (WORD_SIZE - misalign)*8;

    if( !misalign )
        s->cur_bits = 0;
    else
    {
        s->cur_bits = endian_fix32( M32(s->p) );
        s->cur_bits >>= (4-misalign)*8;
    }
}
/* Number of bits written so far, counted from s->p_start: the bits held in
 * cur_bits plus eight per byte already stored. */
static inline int bs_pos( bs_t *s )
{
    return( (WORD_SIZE*8) - s->i_left + 8 * (s->p - s->p_start) );
}
/* Store whatever is pending in cur_bits and advance p by the number of whole
 * bytes that were pending. Afterwards the stream is in general no longer
 * 32-bit aligned; see bs_realign. */
static inline void bs_flush( bs_t *s )
{
    int shift = s->i_left&31;
    M32( s->p ) = endian_fix32( s->cur_bits << shift );

    int free_bytes = s->i_left >> 3;
    s->p += WORD_SIZE - free_bytes;
    s->i_left = WORD_SIZE*8;
}
/* Undo the misalignment left by bs_flush so the stream can be written to
 * again: step p back to a 4-byte boundary and reload the bytes already
 * stored in that word into cur_bits. Does nothing if p is already aligned. */
static inline void bs_realign( bs_t *s )
{
    int misalign = ((intptr_t)s->p & 3);
    if( !misalign )
        return;

    s->p -= misalign;
    s->i_left = (WORD_SIZE - misalign)*8;
    s->cur_bits = endian_fix32( M32(s->p) );
    s->cur_bits >>= (4-misalign)*8;
}
/* Append the low i_count bits of i_bits to the stream. i_bits is OR-ed into
 * cur_bits without masking, so it must not have bits set that would disturb
 * bits already accumulated. Output is always stored 32 bits at a time. */
static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
{
if( WORD_SIZE == 8 )
{
/* 64-bit accumulator: always append, then store one 32-bit word once
 * 32 or fewer free bits remain. */
s->cur_bits = (s->cur_bits << i_count) | i_bits;
s->i_left -= i_count;
if( s->i_left <= 32 )
{
#if WORDS_BIGENDIAN
M32( s->p ) = s->cur_bits >> (32 - s->i_left);
#else
M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
#endif
s->i_left += 32;
s->p += 4;
}
}
else
{
if( i_count < s->i_left )
{
/* Fits in the current word with room to spare. */
s->cur_bits = (s->cur_bits << i_count) | i_bits;
s->i_left -= i_count;
}
else
{
/* Fill the current word with the top bits of the code, store it,
 * and keep the remaining i_count low bits for the next word. */
i_count -= s->i_left;
s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
M32( s->p ) = endian_fix( s->cur_bits );
s->p += 4;
s->cur_bits = i_bits;
s->i_left = 32 - i_count;
}
}
}
/* Write a full 32-bit value as two 16-bit halves, high half first. Kept
 * separate from bs_write to eliminate a branch there. */
/* Golomb never writes an even-size code, so this is only used in slice headers. */
static inline void bs_write32( bs_t *s, uint32_t i_bits )
{
    uint32_t high_half = i_bits >> 16;
    bs_write( s, 16, high_half );
    bs_write( s, 16, i_bits );
}
/* Append a single bit. A 32-bit word is stored as soon as i_left reaches
 * WORD_SIZE*8-32, after which i_left is reset to a full word. */
static inline void bs_write1( bs_t *s, uint32_t i_bit )
{
    s->cur_bits = (s->cur_bits << 1) | i_bit;
    s->i_left -= 1;
    if( s->i_left != WORD_SIZE*8-32 )
        return;

    M32( s->p ) = endian_fix32( s->cur_bits );
    s->p += 4;
    s->i_left = WORD_SIZE*8;
}
/* Pad with zero bits up to the next byte boundary, then flush. */
static inline void bs_align_0( bs_t *s )
{
    int pad_bits = s->i_left&7;
    bs_write( s, pad_bits, 0 );
    bs_flush( s );
}
/* Pad with one bits up to the next byte boundary, then flush. */
static inline void bs_align_1( bs_t *s )
{
    int pad_bits = s->i_left&7;
    bs_write( s, pad_bits, (1 << pad_bits) - 1 );
    bs_flush( s );
}
/* If not already byte aligned, pad with a single one bit followed by zero
 * bits up to the next byte boundary; flush in either case. */
static inline void bs_align_10( bs_t *s )
{
    int pad_bits = s->i_left&7;
    if( pad_bits )
        bs_write( s, pad_bits, 1 << ( pad_bits - 1 ) );
    bs_flush( s );
}
/* golomb functions */
/* x264_ue_size_tab[v] for v in 1..255 is 2*floor(log2(v))+1, i.e. the number
 * of bits bs_write_ue() emits for the code word v = val+1; entry 0 is 1.
 * Callers handling larger values shift them down into this range first and
 * add the corresponding number of bits. */
static const uint8_t x264_ue_size_tab[256] =
{
1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
};
/* Write val as an unsigned Exp-Golomb code, for values too large for
 * bs_write_ue. The code is emitted in two bs_write calls: the zero prefix
 * first, then val+1 itself. */
static inline void bs_write_ue_big( bs_t *s, unsigned int val )
{
    val++;
    int tmp = val;
    int size = 0;

    /* Reduce tmp to a table index, accounting for the bits shifted out. */
    if( tmp >= 0x10000 )
    {
        tmp >>= 16;
        size = 32;
    }
    if( tmp >= 0x100 )
    {
        tmp >>= 8;
        size += 16;
    }
    size += x264_ue_size_tab[tmp];

    int prefix_len = size>>1;
    bs_write( s, prefix_len, 0 );
    bs_write( s, prefix_len+1, val );
}
/* Write val as an unsigned Exp-Golomb code using a single table lookup.
 * Only works on values under 255. */
static inline void bs_write_ue( bs_t *s, int val )
{
    int code = val+1;
    bs_write( s, x264_ue_size_tab[code], code );
}
/* Write val as a signed Exp-Golomb code. */
static inline void bs_write_se( bs_t *s, int val )
{
    /* Map to the code word: 1-2*val for val <= 0, 2*val otherwise.
     * Faster than (val <= 0 ? -val*2+1 : val*2) */
    /* 4 instructions on x86, 3 on ARM */
    int code = 1 - val*2;
    if( code < 0 )
        code = val*2;

    int size = 0;
    int idx = code;
    if( idx >= 0x100 )
    {
        idx >>= 8;
        size = 16;
    }
    size += x264_ue_size_tab[idx];
    bs_write( s, size, code );
}
/* Write val as a truncated Exp-Golomb code with range x: a single inverted
 * bit when x == 1, otherwise the same as bs_write_ue. */
static inline void bs_write_te( bs_t *s, int x, int val )
{
    if( x != 1 ) /* callers are expected to pass x > 1 here */
    {
        bs_write_ue( s, val );
        return;
    }
    bs_write1( s, 1^val );
}
/* Write the trailing bits: a one bit followed by zero bits up to the next
 * byte boundary. Does not flush. */
static inline void bs_rbsp_trailing( bs_t *s )
{
    bs_write1( s, 1 );
    int pad_bits = s->i_left&7;
    bs_write( s, pad_bits, 0 );
}
/* Bit length of the code bs_write_ue() would emit for val; like that
 * function it relies on val+1 being a valid index into the 256-entry table. */
static ALWAYS_INLINE int bs_size_ue( unsigned int val )
{
    unsigned int code = val+1;
    return x264_ue_size_tab[code];
}
/* Bit length of the unsigned Exp-Golomb code for val, handling code words
 * wider than eight bits with one extra shift (so val+1 must stay below
 * 0x10000 for the table index to be valid). */
static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
{
    if( val >= 255 )
        return x264_ue_size_tab[(val+1)>>8] + 16;
    return x264_ue_size_tab[val+1];
}
/* Bit length of the signed Exp-Golomb code for val, using the same
 * val -> code word mapping as bs_write_se. */
static ALWAYS_INLINE int bs_size_se( int val )
{
    int code = 1 - val*2;
    if( code < 0 )
        code = val*2;

    if( code >= 256 )
        return x264_ue_size_tab[code>>8]+16;
    return x264_ue_size_tab[code];
}
/* Bit length of the truncated Exp-Golomb code bs_write_te() would emit:
 * one bit when x == 1, otherwise the bs_size_ue length of val. */
static ALWAYS_INLINE int bs_size_te( int x, int val )
{
    if( x != 1 ) /* callers are expected to pass x > 1 here */
        return x264_ue_size_tab[val+1];
    return 1;
}
#endif

184
common/cabac.c Normal file
View File

@@ -0,0 +1,184 @@
/*****************************************************************************
* cabac.c: arithmetic coder
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
* Fiona Glaser <fiona@x264.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common.h"
/* Precomputed initial context states, indexed [table][qp][context], where
 * table 0 is built from x264_cabac_context_init_I and tables 1..3 from
 * x264_cabac_context_init_PB[0..2]. */
static uint8_t cabac_contexts[4][QP_MAX_SPEC+1][1024];

/* Fill cabac_contexts for every table and every qp in 0..QP_MAX_SPEC.
 * Only the first 460 contexts are computed unless CHROMA444 is set. */
void x264_cabac_init( x264_t *h )
{
    int ctx_count = CHROMA444 ? 1024 : 460;
    for( int i = 0; i < 4; i++ )
    {
        const int8_t (*init_tab)[1024][2];
        if( i == 0 )
            init_tab = &x264_cabac_context_init_I;
        else
            init_tab = &x264_cabac_context_init_PB[i-1];

        for( int qp = 0; qp <= QP_MAX_SPEC; qp++ )
        {
            for( int j = 0; j < ctx_count; j++ )
            {
                int slope = (*init_tab)[j][0];
                int offset = (*init_tab)[j][1];
                int state = x264_clip3( ((slope * qp) >> 4) + offset, 1, 126 );
                /* Packed form: folded state index in the upper bits, and
                 * (state >> 6), which is 0 or 1 after the clip, in bit 0. */
                int folded = X264_MIN( state, 127-state );
                cabac_contexts[i][qp][j] = (folded << 1) | (state >> 6);
            }
        }
    }
}
/* Load cb->state from the precomputed table for this slice type, qp and
 * model: table 0 for I slices, table i_model+1 otherwise. */
void x264_cabac_context_init( x264_t *h, x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model )
{
    int table = i_slice_type == SLICE_TYPE_I ? 0 : i_model + 1;
    int ctx_bytes = CHROMA444 ? 1024 : 460;
    memcpy( cb->state, cabac_contexts[table][i_qp], ctx_bytes );
}
/* Reset the arithmetic coder state; the output pointers are left untouched. */
void x264_cabac_encode_init_core( x264_cabac_t *cb )
{
    cb->i_bytes_outstanding = 0;
    cb->i_queue = -9; // the first bit will be shifted away and not written
    cb->i_range = 0x01FE;
    cb->i_low = 0;
}
/* Reset the coder state and direct its output at the buffer [p_data,p_end). */
void x264_cabac_encode_init( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end )
{
    x264_cabac_encode_init_core( cb );
    cb->p_start = cb->p = p_data;
    cb->p_end = p_end;
}
/* Move one byte from i_low to the output once i_queue is non-negative.
 * Bytes equal to 0xff are not written immediately but counted in
 * i_bytes_outstanding, because a later carry could still change them; they
 * are written out together with the next non-0xff byte. */
static inline void cabac_putbyte( x264_cabac_t *cb )
{
if( cb->i_queue >= 0 )
{
/* out holds the byte to emit in its low 8 bits plus a carry above it. */
int out = cb->i_low >> (cb->i_queue+10);
cb->i_low &= (0x400<<cb->i_queue)-1;
cb->i_queue -= 8;
if( (out & 0xff) == 0xff )
cb->i_bytes_outstanding++;
else
{
int carry = out >> 8;
int bytes_outstanding = cb->i_bytes_outstanding;
// this can't modify before the beginning of the stream because
// that would correspond to a probability > 1.
// it will write before the beginning of the stream, which is ok
// because a slice header always comes before cabac data.
// this can't carry beyond the one byte, because any 0xff bytes
// are in bytes_outstanding and thus not written yet.
cb->p[-1] += carry;
/* Deferred bytes become 0xff without a carry (carry-1 == -1) and
 * 0x00 with one (carry-1 == 0). */
while( bytes_outstanding > 0 )
{
*(cb->p++) = (uint8_t)(carry-1);
bytes_outstanding--;
}
*(cb->p++) = (uint8_t)out;
cb->i_bytes_outstanding = 0;
}
}
}
/* Renormalize: shift i_range and i_low left by the amount looked up for the
 * current range, add the same amount to i_queue, then emit a byte if one is
 * ready. */
static inline void cabac_encode_renorm( x264_cabac_t *cb )
{
    int shift = x264_cabac_renorm_shift[cb->i_range>>3];
    cb->i_queue += shift;
    cb->i_low <<= shift;
    cb->i_range <<= shift;
    cabac_putbyte( cb );
}
/* Encode bin b with context i_ctx and update that context's state.
 * Making custom versions of this function, even in asm, for the cases where
 * b is known to be 0 or 1, proved to be somewhat useful on x86_32 with GCC 3.4
 * but nearly useless with GCC 4.3 and worse than useless on x86_64. */
void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b )
{
    int i_state = cb->state[i_ctx];
    int i_range_lps = x264_cabac_range_lps[i_state>>1][(cb->i_range>>6)-4];
    int i_range_rest = cb->i_range - i_range_lps;

    if( b == (i_state & 1) )
        cb->i_range = i_range_rest;
    else
    {
        /* b differs from the state's low bit: skip past the other
         * sub-interval and continue with the LPS range. */
        cb->i_low += i_range_rest;
        cb->i_range = i_range_lps;
    }
    cb->state[i_ctx] = x264_cabac_transition[i_state][b];
    cabac_encode_renorm( cb );
}
/* Encode one bypass bin. b is used as a mask on i_range, so the caller
 * passes 0 or an all-ones value.
 * Note: b is negated for this function */
void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b )
{
    cb->i_low = (cb->i_low << 1) + (b & cb->i_range);
    cb->i_queue++;
    cabac_putbyte( cb );
}
/* Used by x264_cabac_encode_ue_bypass. Entry n equals
 * ((2^n - 1) << (n+1)) - (1 << n), so adding it to a value v whose top set
 * bit is bit n yields n one bits, a zero bit, and the n low bits of v. */
static const int bypass_lut[16] =
{
-1, 0x2, 0x14, 0x68, 0x1d0, 0x7a0, 0x1f40, 0x7e80,
0x1fd00, 0x7fa00, 0x1ff400, 0x7fe800, 0x1ffd000, 0x7ffa000, 0x1fff4000, 0x7ffe8000
};
/* Encode val as a run of bypass bins, building the whole code word in x and
 * feeding it to the coder up to eight bits at a time.
 * NOTE(review): presumably an order-exp_bits Exp-Golomb code -- confirm
 * against the callers. */
void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val )
{
uint32_t v = val + (1<<exp_bits);
int k = 31 - x264_clz( v );
uint32_t x = ((uint32_t)bypass_lut[k-exp_bits]<<exp_bits) + v;
/* k becomes the total number of bits in x that are to be written. */
k = 2*k+1-exp_bits;
/* First chunk takes 1..8 bits so that every later chunk is exactly 8. */
int i = ((k-1)&7)+1;
do {
k -= i;
cb->i_low <<= i;
cb->i_low += ((x>>k)&0xff) * cb->i_range;
cb->i_queue += i;
cabac_putbyte( cb );
i = 8;
} while( k > 0 );
}
/* Encode a terminal bin on the path that only shrinks the range: take 2 off
 * i_range and renormalize. */
void x264_cabac_encode_terminal_c( x264_cabac_t *cb )
{
    cb->i_range = cb->i_range - 2;
    cabac_encode_renorm( cb );
}
/* Finish the arithmetic code: push the remaining bits of i_low out through
 * cabac_putbyte and then write any still-deferred 0xff bytes. */
void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb )
{
cb->i_low += cb->i_range - 2;
cb->i_low |= 1;
cb->i_low <<= 9;
cb->i_queue += 9;
cabac_putbyte( cb );
cabac_putbyte( cb );
cb->i_low <<= -cb->i_queue;
/* Sets bit 10 of i_low to one bit of the constant, selected by the low
 * five bits of the frame number.
 * NOTE(review): the purpose of this per-frame bit is not visible here. */
cb->i_low |= (0x35a4e4f5 >> (h->i_frame & 31) & 1) << 10;
cb->i_queue = 0;
cabac_putbyte( cb );
/* No later byte can carry into these any more, so emit them as 0xff. */
while( cb->i_bytes_outstanding > 0 )
{
*(cb->p++) = 0xff;
cb->i_bytes_outstanding--;
}
}

126
common/cabac.h Normal file
View File

@@ -0,0 +1,126 @@
/*****************************************************************************
* cabac.h: arithmetic coder
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
* Laurent Aimar <fenrir@via.ecp.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_CABAC_H
#define X264_CABAC_H
/* State of one CABAC encoder instance: arithmetic coder registers, output
 * position and the per-context probability states. */
typedef struct
{
/* state */
int i_low;
int i_range;
/* bit stream */
int i_queue; //stored with an offset of -8 for faster asm
int i_bytes_outstanding; /* 0xff bytes held back until a carry is resolved */
uint8_t *p_start;
uint8_t *p;
uint8_t *p_end;
/* aligned for memcpy_aligned starting here */
ALIGNED_64( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
/* context */
uint8_t state[1024];
/* for 16-byte alignment */
uint8_t padding[12];
} x264_cabac_t;
/* Initialize the contexts for the given slice type, quantizer (i_qp) and model.
 * Each function below is first remapped through x264_template() and then
 * declared under that remapped name. */
#define x264_cabac_context_init x264_template(cabac_context_init)
void x264_cabac_context_init( x264_t *h, x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model );
#define x264_cabac_encode_init_core x264_template(cabac_encode_init_core)
void x264_cabac_encode_init_core( x264_cabac_t *cb );
#define x264_cabac_encode_init x264_template(cabac_encode_init)
void x264_cabac_encode_init( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end );
#define x264_cabac_encode_decision_c x264_template(cabac_encode_decision_c)
void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b );
#define x264_cabac_encode_decision_asm x264_template(cabac_encode_decision_asm)
void x264_cabac_encode_decision_asm( x264_cabac_t *cb, int i_ctx, int b );
#define x264_cabac_encode_bypass_c x264_template(cabac_encode_bypass_c)
void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b );
#define x264_cabac_encode_bypass_asm x264_template(cabac_encode_bypass_asm)
void x264_cabac_encode_bypass_asm( x264_cabac_t *cb, int b );
#define x264_cabac_encode_terminal_c x264_template(cabac_encode_terminal_c)
void x264_cabac_encode_terminal_c( x264_cabac_t *cb );
#define x264_cabac_encode_terminal_asm x264_template(cabac_encode_terminal_asm)
void x264_cabac_encode_terminal_asm( x264_cabac_t *cb );
#define x264_cabac_encode_ue_bypass x264_template(cabac_encode_ue_bypass)
void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val );
#define x264_cabac_encode_flush x264_template(cabac_encode_flush)
void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb );
/* Select the implementation behind the generic encode names: the _asm
 * versions when HAVE_MMX or HAVE_AARCH64 is set, the C versions otherwise. */
#if HAVE_MMX
#define x264_cabac_encode_decision x264_cabac_encode_decision_asm
#define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm
#define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm
#elif HAVE_AARCH64
#define x264_cabac_encode_decision x264_cabac_encode_decision_asm
#define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm
#define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm
#else
#define x264_cabac_encode_decision x264_cabac_encode_decision_c
#define x264_cabac_encode_bypass x264_cabac_encode_bypass_c
#define x264_cabac_encode_terminal x264_cabac_encode_terminal_c
#endif
/* When actually encoding, the _noup name is the same function as the
 * normal decision encoder. */
#define x264_cabac_encode_decision_noup x264_cabac_encode_decision
/* Current position of the encoder in bits: eight per byte written or held
 * back as outstanding, plus the i_queue count. */
static ALWAYS_INLINE int x264_cabac_pos( x264_cabac_t *cb )
{
    return cb->i_queue + 8 * (cb->p - cb->p_start + cb->i_bytes_outstanding);
}
/* internal only. the x264_cabac_size_* helpers don't write the bitstream,
 * they just calculate bit cost. */
/* Add the cost of coding bin b with context i_ctx to cb->f8_bits_encoded
 * and advance that context's state. */
static ALWAYS_INLINE void x264_cabac_size_decision( x264_cabac_t *cb, long i_ctx, long b )
{
    uint8_t *ctx = &cb->state[i_ctx];
    int i_state = *ctx;
    *ctx = x264_cabac_transition[i_state][b];
    cb->f8_bits_encoded += x264_cabac_entropy[i_state^b];
}
/* Like x264_cabac_size_decision, but works on a bare context state and
 * returns the cost instead of accumulating it. */
static ALWAYS_INLINE int x264_cabac_size_decision2( uint8_t *state, long b )
{
    const int i_state = *state;
    *state = x264_cabac_transition[i_state][b];
    const int cost = x264_cabac_entropy[i_state^b];
    return cost;
}
/* Add the cost of coding bin b with context i_ctx to cb->f8_bits_encoded
 * without updating the context state. */
static ALWAYS_INLINE void x264_cabac_size_decision_noup( x264_cabac_t *cb, long i_ctx, long b )
{
    cb->f8_bits_encoded += x264_cabac_entropy[cb->state[i_ctx]^b];
}
/* Return the cost of coding bin b with the given context state, leaving the
 * state unchanged. */
static ALWAYS_INLINE int x264_cabac_size_decision_noup2( uint8_t *state, long b )
{
    int i_state = *state;
    return x264_cabac_entropy[i_state^b];
}
#endif

44
common/common.c Normal file
View File

@@ -0,0 +1,44 @@
/*****************************************************************************
* common.c: misc common functions
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
* Laurent Aimar <fenrir@via.ecp.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common.h"
/****************************************************************************
 * x264_log:
 *   printf-style logging. With h == NULL the message always goes to
 *   x264_log_default; otherwise it is passed to h->param.pf_log, but only
 *   when i_level does not exceed h->param.i_log_level.
 ****************************************************************************/
void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... )
{
    if( h && i_level > h->param.i_log_level )
        return;

    va_list args;
    va_start( args, psz_fmt );
    if( h )
        h->param.pf_log( h->param.p_log_private, i_level, psz_fmt, args );
    else
        x264_log_default( NULL, i_level, psz_fmt, args );
    va_end( args );
}

813
common/common.h Normal file
View File

@@ -0,0 +1,813 @@
/*****************************************************************************
* common.h: misc common functions
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_COMMON_H
#define X264_COMMON_H
#include "base.h"
/* Macros for templating function calls according to bit depth */
/* x264_template(w) expands to x264_glue3(x264, BIT_DEPTH, w). */
#define x264_template(w) x264_glue3(x264, BIT_DEPTH, w)
/****************************************************************************
 * API Templates
 ****************************************************************************/
/* Each API entry point is remapped to its bit-depth-specific name. */
#define x264_nal_encode x264_template(nal_encode)
#define x264_encoder_reconfig x264_template(encoder_reconfig)
#define x264_encoder_parameters x264_template(encoder_parameters)
#define x264_encoder_headers x264_template(encoder_headers)
#define x264_encoder_encode x264_template(encoder_encode)
#define x264_encoder_close x264_template(encoder_close)
#define x264_encoder_delayed_frames x264_template(encoder_delayed_frames)
#define x264_encoder_maximum_delayed_frames x264_template(encoder_maximum_delayed_frames)
#define x264_encoder_intra_refresh x264_template(encoder_intra_refresh)
#define x264_encoder_invalidate_reference x264_template(encoder_invalidate_reference)
/* This undef allows to rename the external symbol and force link failure in case
 * of incompatible libraries. Then the define enables templating as above. */
#undef x264_encoder_open
#define x264_encoder_open x264_template(encoder_open)
/****************************************************************************
 * Macros
 ****************************************************************************/
#define X264_PCM_COST (FRAME_SIZE(256*BIT_DEPTH)+16)
/* Each bit of depth above 8 adds 6 to the QP range. */
#define QP_BD_OFFSET (6*(BIT_DEPTH-8))
#define QP_MAX_SPEC (51+QP_BD_OFFSET)
#define QP_MAX (QP_MAX_SPEC+18)
#define PIXEL_MAX ((1 << BIT_DEPTH)-1)
// arbitrary, but low because SATD scores are 1/4 normal
#define X264_LOOKAHEAD_QP (12+QP_BD_OFFSET)
/* Clamp a QP to the largest value in the QP_MAX_SPEC range. */
#define SPEC_QP(x) X264_MIN((x), QP_MAX_SPEC)
#define NALU_OVERHEAD 5 // startcode + NAL type costs 5 bytes per frame
#define FILLER_OVERHEAD (NALU_OVERHEAD+1)
/* One byte less than NALU_OVERHEAD when writing Annex B, not AVC-Intra, and
 * h->out.i_nal is not 1. Requires a variable named h in scope. */
#define SEI_OVERHEAD (NALU_OVERHEAD - (h->param.b_annexb && !h->param.i_avcintra_class && (h->out.i_nal-1)))
/* Interlacing queries; they are compile-time zero when interlaced support is
 * not built in. The non-constant forms require a variable named h in scope. */
#if HAVE_INTERLACED
# define MB_INTERLACED h->mb.b_interlaced
# define SLICE_MBAFF h->sh.b_mbaff
# define PARAM_INTERLACED h->param.b_interlaced
#else
# define MB_INTERLACED 0
# define SLICE_MBAFF 0
# define PARAM_INTERLACED 0
#endif
/* If CHROMA_FORMAT is fixed at build time the subsampling shifts are
 * constants; otherwise all three are read from h at run time. */
#ifdef CHROMA_FORMAT
# define CHROMA_H_SHIFT (CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422)
# define CHROMA_V_SHIFT (CHROMA_FORMAT == CHROMA_420)
#else
# define CHROMA_FORMAT h->sps->i_chroma_format_idc
# define CHROMA_H_SHIFT h->mb.chroma_h_shift
# define CHROMA_V_SHIFT h->mb.chroma_v_shift
#endif
/* Size of one chroma plane for a luma size s (0 when CHROMA_FORMAT is 0). */
#define CHROMA_SIZE(s) (CHROMA_FORMAT ? (s)>>(CHROMA_H_SHIFT+CHROMA_V_SHIFT) : 0)
/* Luma size plus two chroma planes. */
#define FRAME_SIZE(s) ((s)+2*CHROMA_SIZE(s))
#define CHROMA444 (CHROMA_FORMAT == CHROMA_444)
/* Sample and coefficient types: 16-bit pixels with 32-bit coefficients when
 * HIGH_BIT_DEPTH is set, 8-bit pixels with 16-bit coefficients otherwise.
 * pixel4 is an integer wide enough to hold four pixels. */
#if HIGH_BIT_DEPTH
typedef uint16_t pixel;
typedef uint64_t pixel4;
typedef int32_t dctcoef;
typedef uint32_t udctcoef;
/* Replicate one pixel value into all four lanes of a pixel4. */
# define PIXEL_SPLAT_X4(x) ((x)*0x0001000100010001ULL)
# define MPIXEL_X4(src) M64(src)
#else
typedef uint8_t pixel;
typedef uint32_t pixel4;
typedef int16_t dctcoef;
typedef uint16_t udctcoef;
/* Replicate one pixel value into all four lanes of a pixel4. */
# define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
# define MPIXEL_X4(src) M32(src)
#endif
#define SIZEOF_PIXEL ((int)sizeof(pixel))
/* Copy four pixels from src to dst in one MPIXEL_X4 access. */
#define CPPIXEL_X4(dst,src) MPIXEL_X4(dst) = MPIXEL_X4(src)
/****************************************************************************
* Includes
****************************************************************************/
#if HAVE_OPENCL
#include "opencl.h"
#endif
#include "cabac.h"
#include "bitstream.h"
#include "set.h"
#include "predict.h"
#include "pixel.h"
#include "mc.h"
#include "frame.h"
#include "dct.h"
#include "quant.h"
#include "threadpool.h"
/****************************************************************************
* General functions
****************************************************************************/
/* log */
/* printf-style logging; h may be NULL. */
#define x264_log x264_template(log)
void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
/* One-time table initialization for the CAVLC and CABAC entropy coders. */
#define x264_cavlc_init x264_template(cavlc_init)
void x264_cavlc_init( x264_t *h );
#define x264_cabac_init x264_template(cabac_init)
void x264_cabac_init( x264_t *h );
/* Clamp x to the valid pixel range [0, PIXEL_MAX]. */
static ALWAYS_INLINE pixel x264_clip_pixel( int x )
{
    /* No bits outside PIXEL_MAX set: x is already in range. */
    if( !(x & ~PIXEL_MAX) )
        return x;
    /* (-x)>>31 is 0 for negative x and, with an arithmetic shift of a
     * 32-bit int, all ones for x above the range. */
    return (-x)>>31 & PIXEL_MAX;
}
/****************************************************************************
*
****************************************************************************/
/* Slice header state; x264_t keeps the current one in `sh` and a backup copy
 * in `sh_backup`. */
typedef struct
{
x264_sps_t *sps;
x264_pps_t *pps;
int i_type;
int i_first_mb;
int i_last_mb;
int i_pps_id;
int i_frame_num;
int b_mbaff;
int b_field_pic;
int b_bottom_field;
int i_idr_pic_id; /* -1 if nal_type != 5 */
int i_poc;
int i_delta_poc_bottom;
int i_delta_poc[2];
int i_redundant_pic_cnt;
int b_direct_spatial_mv_pred;
int b_num_ref_idx_override;
int i_num_ref_idx_l0_active;
int i_num_ref_idx_l1_active;
/* Reference list reordering, one entry per list */
int b_ref_pic_list_reordering[2];
struct
{
int idc;
int arg;
} ref_pic_list_order[2][X264_REF_MAX];
/* P-frame weighting */
int b_weighted_pred;
x264_weight_t weight[X264_REF_MAX*2][3];
int i_mmco_remove_from_end;
int i_mmco_command_count;
struct /* struct for future expansion */
{
int i_difference_of_pic_nums;
int i_poc;
} mmco[X264_REF_MAX];
int i_cabac_init_idc;
int i_qp;
int i_qp_delta;
int b_sp_for_swidth;
int i_qs_delta;
/* deblocking filter */
int i_disable_deblocking_filter_idc;
int i_alpha_c0_offset;
int i_beta_offset;
} x264_slice_header_t;
/* Lookahead state.
 * NOTE(review): ifbuf/next/ofbuf look like the input, in-progress and output
 * frame queues of a lookahead thread -- confirm against the lookahead code. */
typedef struct x264_lookahead_t
{
volatile uint8_t b_exit_thread;
uint8_t b_thread_active;
uint8_t b_analyse_keyframe;
int i_last_keyframe;
int i_slicetype_length;
x264_frame_t *last_nonb;
x264_pthread_t thread_handle;
x264_sync_frame_list_t ifbuf;
x264_sync_frame_list_t next;
x264_sync_frame_list_t ofbuf;
} x264_lookahead_t;
/* Forward declaration only; the struct is defined elsewhere. */
typedef struct x264_ratecontrol_t x264_ratecontrol_t;
/* Five 4-entry index tables, one per kind of data named by the field.
 * NOTE(review): presumably indices used to fetch left-neighbour data --
 * confirm against the users of this type. */
typedef struct x264_left_table_t
{
uint8_t intra[4];
uint8_t nnz[4];
uint8_t nnz_chroma[4];
uint8_t mv[4];
uint8_t ref[4];
} x264_left_table_t;
/* Current frame stats */
typedef struct
{
/* MV bits (MV+Ref+Block Type) */
int i_mv_bits;
/* Texture bits (DCT coefs) */
int i_tex_bits;
/* NOTE(review): presumably all bits not counted as MV or texture -- confirm */
int i_misc_bits;
/* MB type counts */
int i_mb_count[19];
int i_mb_count_i;
int i_mb_count_p;
int i_mb_count_skip;
int i_mb_count_8x8dct[2];
int i_mb_count_ref[2][X264_REF_MAX*2];
int i_mb_partition[17];
int i_mb_cbp[6];
int i_mb_pred_mode[4][13];
int i_mb_field[3];
/* Adaptive direct mv pred */
int i_direct_score[2];
/* Metrics */
int64_t i_ssd[3];
double f_ssim;
int i_ssim_cnt;
} x264_frame_stat_t;
struct x264_t
{
/* encoder parameters */
x264_param_t param;
/* opaque pointer to bit depth independent interface */
void *api;
x264_t *thread[X264_THREAD_MAX+1];
x264_t *lookahead_thread[X264_LOOKAHEAD_THREAD_MAX];
int b_thread_active;
int i_thread_phase; /* which thread to use for the next frame */
int i_thread_idx; /* which thread this is */
int i_threadslice_start; /* first row in this thread slice */
int i_threadslice_end; /* row after the end of this thread slice */
int i_threadslice_pass; /* which pass of encoding we are on */
x264_threadpool_t *threadpool;
x264_threadpool_t *lookaheadpool;
x264_pthread_mutex_t mutex;
x264_pthread_cond_t cv;
/* bitstream output */
struct
{
int i_nal;
int i_nals_allocated;
x264_nal_t *nal;
int i_bitstream; /* size of p_bitstream */
uint8_t *p_bitstream; /* will hold data for all nal */
bs_t bs;
} out;
uint8_t *nal_buffer;
int nal_buffer_size;
x264_t *reconfig_h;
int reconfig;
/**** thread synchronization starts here ****/
/* frame number/poc */
int i_frame;
int i_frame_num;
int i_thread_frames; /* Number of different frames being encoded by threads;
* 1 when sliced-threads is on. */
int i_nal_type;
int i_nal_ref_idc;
int64_t i_disp_fields; /* Number of displayed fields (both coded and implied via pic_struct) */
int i_disp_fields_last_frame;
int64_t i_prev_duration; /* Duration of previous frame */
int64_t i_coded_fields; /* Number of coded fields (both coded and implied via pic_struct) */
int64_t i_cpb_delay; /* Equal to number of fields preceding this field
* since last buffering_period SEI */
int64_t i_coded_fields_lookahead; /* Use separate counters for lookahead */
int64_t i_cpb_delay_lookahead;
int64_t i_cpb_delay_pir_offset;
int64_t i_cpb_delay_pir_offset_next;
int b_queued_intra_refresh;
int64_t i_last_idr_pts;
int i_idr_pic_id;
/* quantization matrix for decoding, [cqm][qp%6][coef] */
int (*dequant4_mf[4])[16]; /* [4][6][16] */
int (*dequant8_mf[4])[64]; /* [4][6][64] */
/* quantization matrix for trellis, [cqm][qp][coef] */
int (*unquant4_mf[4])[16]; /* [4][QP_MAX_SPEC+1][16] */
int (*unquant8_mf[4])[64]; /* [4][QP_MAX_SPEC+1][64] */
/* quantization matrix for deadzone */
udctcoef (*quant4_mf[4])[16]; /* [4][QP_MAX_SPEC+1][16] */
udctcoef (*quant8_mf[4])[64]; /* [4][QP_MAX_SPEC+1][64] */
udctcoef (*quant4_bias[4])[16]; /* [4][QP_MAX_SPEC+1][16] */
udctcoef (*quant8_bias[4])[64]; /* [4][QP_MAX_SPEC+1][64] */
udctcoef (*quant4_bias0[4])[16]; /* [4][QP_MAX_SPEC+1][16] */
udctcoef (*quant8_bias0[4])[64]; /* [4][QP_MAX_SPEC+1][64] */
udctcoef (*nr_offset_emergency)[4][64];
/* mv/ref/mode cost arrays. */
uint16_t *cost_mv[QP_MAX+1];
uint16_t *cost_mv_fpel[QP_MAX+1][4];
struct
{
uint16_t ref[QP_MAX+1][3][33];
uint16_t i4x4_mode[QP_MAX+1][17];
} *cost_table;
const uint8_t *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */
/* Slice header */
x264_slice_header_t sh;
/* SPS / PPS */
x264_sps_t sps[1];
x264_pps_t pps[1];
/* Slice header backup, for SEI_DEC_REF_PIC_MARKING */
int b_sh_backup;
x264_slice_header_t sh_backup;
/* cabac context */
x264_cabac_t cabac;
struct
{
/* Frames to be encoded (whose types have been decided) */
x264_frame_t **current;
/* Unused frames: 0 = fenc, 1 = fdec */
x264_frame_t **unused[2];
/* Unused blank frames (for duplicates) */
x264_frame_t **blank_unused;
/* frames used for reference + sentinels */
x264_frame_t *reference[X264_REF_MAX+2];
int i_last_keyframe; /* Frame number of the last keyframe */
int i_last_idr; /* Frame number of the last IDR (not RP)*/
int i_poc_last_open_gop; /* Poc of the I frame of the last open-gop. The value
* is only assigned during the period between that
* I frame and the next P or I frame, else -1 */
int i_input; /* Number of input frames already accepted */
int i_max_dpb; /* Number of frames allocated in the decoded picture buffer */
int i_max_ref0;
int i_max_ref1;
int i_delay; /* Number of frames buffered for B reordering */
int i_bframe_delay;
int64_t i_bframe_delay_time;
int64_t i_first_pts;
int64_t i_prev_reordered_pts[2];
int64_t i_largest_pts;
int64_t i_second_largest_pts;
int b_have_lowres; /* Whether 1/2 resolution luma planes are being used */
int b_have_sub8x8_esa;
} frames;
/* current frame being encoded */
x264_frame_t *fenc;
/* frame being reconstructed */
x264_frame_t *fdec;
/* references lists */
int i_ref[2];
x264_frame_t *fref[2][X264_REF_MAX+3];
x264_frame_t *fref_nearest[2];
int b_ref_reorder[2];
/* hrd */
int initial_cpb_removal_delay;
int initial_cpb_removal_delay_offset;
int64_t i_reordered_pts_delay;
/* Current MB DCT coeffs */
struct
{
ALIGNED_64( dctcoef luma16x16_dc[3][16] );
ALIGNED_16( dctcoef chroma_dc[2][8] );
// FIXME share memory?
ALIGNED_64( dctcoef luma8x8[12][64] );
ALIGNED_64( dctcoef luma4x4[16*3][16] );
} dct;
/* MB table and cache for current frame/mb */
struct
{
int i_mb_width;
int i_mb_height;
int i_mb_count; /* number of mbs in a frame */
/* Chroma subsampling */
int chroma_h_shift;
int chroma_v_shift;
/* Strides */
int i_mb_stride;
int i_b8_stride;
int i_b4_stride;
int left_b8[2];
int left_b4[2];
/* Current index */
int i_mb_x;
int i_mb_y;
int i_mb_xy;
int i_b8_xy;
int i_b4_xy;
/* Search parameters */
int i_me_method;
int i_subpel_refine;
int b_chroma_me;
int b_trellis;
int b_noise_reduction;
int b_dct_decimate;
int i_psy_rd; /* Psy RD strength--fixed point value*/
int i_psy_trellis; /* Psy trellis strength--fixed point value*/
int b_interlaced;
int b_adaptive_mbaff; /* MBAFF+subme 0 requires non-adaptive MBAFF i.e. all field mbs */
/* Allowed qpel MV range to stay within the picture + emulated edge pixels */
int mv_min[2];
int mv_max[2];
int mv_miny_row[3]; /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
int mv_maxy_row[3];
/* Subpel MV range for motion search.
* same mv_min/max but includes levels' i_mv_range. */
int mv_min_spel[2];
int mv_max_spel[2];
int mv_miny_spel_row[3];
int mv_maxy_spel_row[3];
/* Fullpel MV range for motion search */
ALIGNED_8( int16_t mv_limit_fpel[2][2] ); /* min_x, min_y, max_x, max_y */
int mv_miny_fpel_row[3];
int mv_maxy_fpel_row[3];
/* neighboring MBs */
unsigned int i_neighbour;
unsigned int i_neighbour8[4]; /* neighbours of each 8x8 or 4x4 block that are available */
unsigned int i_neighbour4[16]; /* at the time the block is coded */
unsigned int i_neighbour_intra; /* for constrained intra pred */
unsigned int i_neighbour_frame; /* ignoring slice boundaries */
int i_mb_type_top;
int i_mb_type_left[2];
int i_mb_type_topleft;
int i_mb_type_topright;
int i_mb_prev_xy;
int i_mb_left_xy[2];
int i_mb_top_xy;
int i_mb_topleft_xy;
int i_mb_topright_xy;
int i_mb_top_y;
int i_mb_topleft_y;
int i_mb_topright_y;
const x264_left_table_t *left_index_table;
int i_mb_top_mbpair_xy;
int topleft_partition;
int b_allow_skip;
int field_decoding_flag;
/**** thread synchronization ends here ****/
/* subsequent variables are either thread-local or constant,
* and won't be copied from one thread to another */
/* mb table */
uint8_t *base; /* base pointer for all malloced data in this mb */
int8_t *type; /* mb type */
uint8_t *partition; /* mb partition */
int8_t *qp; /* mb qp */
int16_t *cbp; /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x200 and 0x400: chroma dc, 0x1000 PCM (all set for PCM) */
int8_t (*intra4x4_pred_mode)[8]; /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
/* actually has only 7 entries; set to 8 for write-combining optimizations */
uint8_t (*non_zero_count)[16*3]; /* nzc. for I_PCM set to 16 */
int8_t *chroma_pred_mode; /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */
int16_t (*mv[2])[2]; /* mb mv. set to 0 for intra mb */
uint8_t (*mvd[2])[8][2]; /* absolute value of mb mv difference with predict, clipped to [0,33]. set to 0 if intra. cabac only */
int8_t *ref[2]; /* mb ref. set to -1 if non used (intra or Lx only) */
int16_t (*mvr[2][X264_REF_MAX*2])[2];/* 16x16 mv for each possible ref */
int8_t *skipbp; /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
int8_t *mb_transform_size; /* transform_size_8x8_flag of each mb */
int32_t *slice_table; /* sh->first_mb of the slice that the indexed mb is part of */
uint8_t *field;
/* buffer for weighted versions of the reference frames */
pixel *p_weight_buf[X264_REF_MAX];
/* current value */
int i_type;
int i_partition;
ALIGNED_4( uint8_t i_sub_partition[4] );
int b_transform_8x8;
int i_cbp_luma;
int i_cbp_chroma;
int i_intra16x16_pred_mode;
int i_chroma_pred_mode;
/* skip flags for i4x4 and i8x8
* 0 = encode as normal.
* 1 (non-RD only) = the DCT is still in h->dct, restore fdec and skip reconstruction.
* 2 (RD only) = the DCT has since been overwritten by RD; restore that too. */
int i_skip_intra;
/* skip flag for motion compensation */
/* if we've already done MC, we don't need to do it again */
int b_skip_mc;
/* set to true if we are re-encoding a macroblock. */
int b_reencode_mb;
int ip_offset; /* Used by PIR to offset the quantizer of intra-refresh blocks. */
int b_deblock_rdo;
int b_overflow; /* If CAVLC had a level code overflow during bitstream writing. */
struct
{
/* space for p_fenc and p_fdec */
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
ALIGNED_64( pixel fenc_buf[48*FENC_STRIDE] );
ALIGNED_64( pixel fdec_buf[54*FDEC_STRIDE] );
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
ALIGNED_32( pixel i4x4_fdec_buf[16*16] );
ALIGNED_32( pixel i8x8_fdec_buf[16*16] );
ALIGNED_64( dctcoef i8x8_dct_buf[3][64] );
ALIGNED_64( dctcoef i4x4_dct_buf[15][16] );
uint32_t i4x4_nnz_buf[4];
uint32_t i8x8_nnz_buf[4];
/* Psy trellis DCT data */
ALIGNED_64( dctcoef fenc_dct8[4][64] );
ALIGNED_64( dctcoef fenc_dct4[16][16] );
/* Psy RD SATD/SA8D scores cache */
ALIGNED_64( uint32_t fenc_satd_cache[32] );
ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
int i4x4_cbp;
int i8x8_cbp;
/* pointer over mb of the frame to be compressed */
pixel *p_fenc[3]; /* y,u,v */
/* pointer to the actual source frame, not a block copy */
pixel *p_fenc_plane[3];
/* pointer over mb of the frame to be reconstructed */
pixel *p_fdec[3];
/* pointer over mb of the references */
int i_fref[2];
/* [12]: yN, yH, yV, yHV, (NV12 ? uv : I444 ? (uN, uH, uV, uHV, vN, ...)) */
pixel *p_fref[2][X264_REF_MAX*2][12];
pixel *p_fref_w[X264_REF_MAX*2]; /* weighted fullpel luma */
uint16_t *p_integral[2][X264_REF_MAX];
/* fref stride */
int i_stride[3];
} pic;
/* cache */
struct
{
/* real intra4x4_pred_mode if I_4X4 or I_8X8, I_PRED_4x4_DC if mb available, -1 if not */
ALIGNED_16( int8_t intra4x4_pred_mode[X264_SCAN8_LUMA_SIZE] );
/* i_non_zero_count if available else 0x80. intentionally misaligned by 8 for asm */
ALIGNED_8( uint8_t non_zero_count[X264_SCAN8_SIZE] );
/* -1 if unused, -2 if unavailable */
ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
/* 0 if not available */
ALIGNED_16( int16_t mv[2][X264_SCAN8_LUMA_SIZE][2] );
ALIGNED_8( uint8_t mvd[2][X264_SCAN8_LUMA_SIZE][2] );
/* 1 if SKIP or DIRECT. set only for B-frames + CABAC */
ALIGNED_4( int8_t skip[X264_SCAN8_LUMA_SIZE] );
ALIGNED_4( int16_t direct_mv[2][4][2] );
ALIGNED_4( int8_t direct_ref[2][4] );
int direct_partition;
ALIGNED_4( int16_t pskip_mv[2] );
/* number of neighbors (top and left) that used 8x8 dct */
int i_neighbour_transform_size;
int i_neighbour_skip;
/* neighbor CBPs */
int i_cbp_top;
int i_cbp_left;
/* extra data required for mbaff in mv prediction */
int16_t topright_mv[2][3][2];
int8_t topright_ref[2][3];
/* current mb deblock strength */
uint8_t (*deblock_strength)[8][4];
} cache;
/* */
int i_qp; /* current qp */
int i_chroma_qp;
int i_last_qp; /* last qp */
int i_last_dqp; /* last delta qp */
int b_variable_qp; /* whether qp is allowed to vary per macroblock */
int b_lossless;
int b_direct_auto_read; /* take stats for --direct auto from the 2pass log */
int b_direct_auto_write; /* analyse direct modes, to use and/or save */
/* lambda values */
int i_trellis_lambda2[2][2]; /* [luma,chroma][inter,intra] */
int i_psy_rd_lambda;
int i_chroma_lambda2_offset;
/* B_direct and weighted prediction */
int16_t dist_scale_factor_buf[2][2][X264_REF_MAX*2][4];
int16_t (*dist_scale_factor)[4];
int8_t bipred_weight_buf[2][2][X264_REF_MAX*2][4];
int8_t (*bipred_weight)[4];
/* maps fref1[0]'s ref indices into the current list0 */
#define map_col_to_list0(col) h->mb.map_col_to_list0[(col)+2]
int8_t map_col_to_list0[X264_REF_MAX+2];
int ref_blind_dupe; /* The index of the blind reference frame duplicate. */
int8_t deblock_ref_table[X264_REF_MAX*2+2];
#define deblock_ref_table(x) h->mb.deblock_ref_table[(x)+2]
} mb;
/* rate control encoding only */
x264_ratecontrol_t *rc;
/* stats */
struct
{
/* Cumulated stats */
/* per slice info */
int i_frame_count[3];
int64_t i_frame_size[3];
double f_frame_qp[3];
int i_consecutive_bframes[X264_BFRAME_MAX+1];
/* */
double f_ssd_global[3];
double f_psnr_average[3];
double f_psnr_mean_y[3];
double f_psnr_mean_u[3];
double f_psnr_mean_v[3];
double f_ssim_mean_y[3];
double f_frame_duration[3];
/* */
int64_t i_mb_count[3][19];
int64_t i_mb_partition[2][17];
int64_t i_mb_count_8x8dct[2];
int64_t i_mb_count_ref[2][2][X264_REF_MAX*2];
int64_t i_mb_cbp[6];
int64_t i_mb_pred_mode[4][13];
int64_t i_mb_field[3];
/* */
int i_direct_score[2];
int i_direct_frames[2];
/* num p-frames weighted */
int i_wpred[2];
/* Current frame stats */
x264_frame_stat_t frame;
} stat;
/* 0 = luma 4x4, 1 = luma 8x8, 2 = chroma 4x4, 3 = chroma 8x8 */
udctcoef (*nr_offset)[64];
uint32_t (*nr_residual_sum)[64];
uint32_t *nr_count;
ALIGNED_32( udctcoef nr_offset_denoise[4][64] );
ALIGNED_32( uint32_t nr_residual_sum_buf[2][4][64] );
uint32_t nr_count_buf[2][4];
uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */
/* Buffers that are allocated per-thread even in sliced threads. */
void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
void *scratch_buffer2; /* if the first one's already in use */
pixel *intra_border_backup[5][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
/* Deblock strength values are stored for each 4x4 partition. In MBAFF
* there are four extra values that need to be stored, located in [4][i]. */
uint8_t (*deblock_strength[2])[2][8][4];
/* CPU functions dependents */
x264_predict_t predict_16x16[4+3];
x264_predict8x8_t predict_8x8[9+3];
x264_predict_t predict_4x4[9+3];
x264_predict_t predict_chroma[4+3];
x264_predict_t predict_8x8c[4+3];
x264_predict_t predict_8x16c[4+3];
x264_predict_8x8_filter_t predict_8x8_filter;
x264_pixel_function_t pixf;
x264_mc_functions_t mc;
x264_dct_function_t dctf;
x264_zigzag_function_t zigzagf;
x264_zigzag_function_t zigzagf_interlaced;
x264_zigzag_function_t zigzagf_progressive;
x264_quant_function_t quantf;
x264_deblock_function_t loopf;
x264_bitstream_function_t bsf;
x264_lookahead_t *lookahead;
#if HAVE_OPENCL
x264_opencl_t opencl;
#endif
};
/* A motion vector paired with a SAD score.
 * NOTE(review): presumably used to carry candidate MVs and their costs
 * through motion search -- confirm against the callers. */
typedef struct
{
int sad; /* SAD score associated with mv */
int16_t mv[2]; /* motion vector components; presumably [0] = x, [1] = y -- confirm */
} mvsad_t;
// included at the end because it needs x264_t
#include "macroblock.h"
/* Builds a list of motion vector candidates from mvc[0..i_mvc-1].
 * Each candidate is divided by four with rounding ((v + 2) >> 2), packed with
 * pack16to32_mask(), and discarded if the packed value is zero or equals pmv.
 * Surviving candidates are clamped per component to
 * [mv_limit[0][c], mv_limit[1][c]] and stored consecutively in dst.
 * Returns the number of entries written to dst. */
static ALWAYS_INLINE int x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
{
    int n_kept = 0;
    for( int idx = 0; idx < i_mvc; idx++ )
    {
        int rounded_x = (mvc[idx][0] + 2) >> 2;
        int rounded_y = (mvc[idx][1] + 2) >> 2;
        uint32_t packed = pack16to32_mask(rounded_x, rounded_y);
        /* keep only non-zero candidates that differ from the predictor */
        if( packed && packed != pmv )
        {
            dst[n_kept][0] = x264_clip3( rounded_x, mv_limit[0][0], mv_limit[1][0] );
            dst[n_kept][1] = x264_clip3( rounded_y, mv_limit[0][1], mv_limit[1][1] );
            n_kept++;
        }
    }
    return n_kept;
}
/* Builds a list of motion vector candidates from mvc[0..i_mvc-1] without
 * changing their precision.
 * A candidate is discarded if its raw 32-bit value (M32) is zero or equals pmv.
 * Surviving candidates are clamped per component to mv_limit scaled by four
 * (the limits are given at a quarter of the candidates' resolution) and stored
 * consecutively in dst.
 * Returns the number of entries written to dst. */
static ALWAYS_INLINE int x264_predictor_clip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
{
    int cnt = 0;
    /* Scale by multiplication rather than "<< 2": the lower limits may be
     * negative, and left-shifting a negative signed value is undefined
     * behaviour in C. Order: min_x, min_y, max_x, max_y. */
    int qpel_limit[4] = {mv_limit[0][0] * 4, mv_limit[0][1] * 4, mv_limit[1][0] * 4, mv_limit[1][1] * 4};
    for( int i = 0; i < i_mvc; i++ )
    {
        uint32_t mv = M32( mvc[i] );
        int mx = mvc[i][0];
        int my = mvc[i][1];
        if( !mv || mv == pmv ) continue;
        dst[cnt][0] = x264_clip3( mx, qpel_limit[0], qpel_limit[2] );
        dst[cnt][1] = x264_clip3( my, qpel_limit[1], qpel_limit[3] );
        cnt++;
    }
    return cnt;
}
#if ARCH_X86 || ARCH_X86_64
#include "x86/util.h"
#endif
#include "rectangle.h"
#endif

679
common/cpu.c Normal file
View File

@@ -0,0 +1,679 @@
/*****************************************************************************
* cpu.c: cpu detection
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
* Laurent Aimar <fenrir@via.ecp.fr>
* Fiona Glaser <fiona@x264.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "base.h"
#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
#include <sys/auxv.h>
#endif
#if HAVE_SYSCONF
#include <unistd.h>
#endif
#if SYS_LINUX
#include <sched.h>
#endif
#if SYS_BEOS
#include <kernel/OS.h>
#endif
#if SYS_MACOSX || SYS_FREEBSD || SYS_NETBSD || SYS_OPENBSD
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
#if SYS_OPENBSD
#include <machine/cpu.h>
#endif
/* Table mapping a human-readable CPU feature name to a mask of X264_CPU_*
 * flags. Only the entries for the architecture selected at build time are
 * compiled in. The table is terminated by an entry with an empty name and
 * zero flags. */
const x264_cpu_name_t x264_cpu_names[] =
{
#if ARCH_X86 || ARCH_X86_64
// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
/* The helper macros below accumulate the flags of earlier instruction sets,
 * so each named level also carries the flags of the levels it builds on. */
#define MMX2 X264_CPU_MMX|X264_CPU_MMX2
{"MMX2", MMX2},
{"MMXEXT", MMX2},
{"SSE", MMX2|X264_CPU_SSE},
#define SSE2 MMX2|X264_CPU_SSE|X264_CPU_SSE2
{"SSE2Slow", SSE2|X264_CPU_SSE2_IS_SLOW},
{"SSE2", SSE2},
{"SSE2Fast", SSE2|X264_CPU_SSE2_IS_FAST},
{"LZCNT", SSE2|X264_CPU_LZCNT},
{"SSE3", SSE2|X264_CPU_SSE3},
{"SSSE3", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
/* "SSE4.1" and "SSE4" are two names for the same flag set */
{"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4.2", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
#define AVX SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX
{"AVX", AVX},
{"XOP", AVX|X264_CPU_XOP},
{"FMA4", AVX|X264_CPU_FMA4},
{"FMA3", AVX|X264_CPU_FMA3},
{"BMI1", AVX|X264_CPU_LZCNT|X264_CPU_BMI1},
{"BMI2", AVX|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2},
#define AVX2 AVX|X264_CPU_FMA3|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2|X264_CPU_AVX2
{"AVX2", AVX2},
{"AVX512", AVX2|X264_CPU_AVX512},
/* the helper macros are local to this table */
#undef AVX2
#undef AVX
#undef SSE2
#undef MMX2
/* entries below are single-flag properties rather than cumulative levels */
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
{"SlowAtom", X264_CPU_SLOW_ATOM},
{"SlowPshufb", X264_CPU_SLOW_PSHUFB},
{"SlowPalignr", X264_CPU_SLOW_PALIGNR},
{"SlowShuffle", X264_CPU_SLOW_SHUFFLE},
{"UnalignedStack", X264_CPU_STACK_MOD4},
#elif ARCH_PPC
{"Altivec", X264_CPU_ALTIVEC},
#elif ARCH_ARM
{"ARMv6", X264_CPU_ARMV6},
{"NEON", X264_CPU_NEON},
{"FastNeonMRC", X264_CPU_FAST_NEON_MRC},
#elif ARCH_AARCH64
{"ARMv8", X264_CPU_ARMV8},
{"NEON", X264_CPU_NEON},
{"DotProd", X264_CPU_DOTPROD},
{"I8MM", X264_CPU_I8MM},
{"SVE", X264_CPU_SVE},
{"SVE2", X264_CPU_SVE2},
#elif ARCH_RISCV64
{"RVV", X264_CPU_RVV},
#elif ARCH_MIPS
{"MSA", X264_CPU_MSA},
#elif ARCH_LOONGARCH
{"LSX", X264_CPU_LSX},
{"LASX", X264_CPU_LASX},
#endif
/* terminator: empty name, no flags */
{"", 0},
};
/* Portable wrapper for reading an entry of the ELF auxiliary vector.
 * Uses getauxval() when available, otherwise elf_aux_info(); when neither
 * is available at build time the result is always 0.
 * type: the auxiliary vector key to query (e.g. AT_HWCAP).
 * Returns the value read, or 0 if nothing was stored. */
static unsigned long x264_getauxval( unsigned long type )
{
    unsigned long value = 0;
#if HAVE_GETAUXVAL
    value = getauxval( type );
#elif HAVE_ELF_AUX_INFO
    /* value keeps its initial 0 if elf_aux_info() does not fill it in */
    elf_aux_info( type, &value, sizeof(value) );
#endif
    return value;
}
#if ((HAVE_ALTIVEC && SYS_LINUX) || (HAVE_ARMV6 && !HAVE_NEON)) && !(HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO)
#include <signal.h>
#include <setjmp.h>
/* Jump target set by the instruction probes via sigsetjmp() before they
 * execute a possibly unsupported instruction. */
static sigjmp_buf jmpbuf;
/* Non-zero only while a probe is in progress, i.e. while jmpbuf is valid
 * and a signal is expected. */
static volatile sig_atomic_t canjump = 0;
/* Signal handler installed (for SIGILL) around the instruction probes.
 * If no probe is in progress, the signal is not ours: restore the default
 * disposition and re-raise it so it is handled normally.
 * Otherwise clear the in-progress flag and jump back to the sigsetjmp()
 * point, which then sees a return value of 1. */
static void sigill_handler( int sig )
{
if( !canjump )
{
signal( sig, SIG_DFL );
raise( sig );
}
canjump = 0;
siglongjmp( jmpbuf, 1 );
}
#endif
#if HAVE_MMX
int x264_cpu_cpuid_test( void );
void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
uint64_t x264_cpu_xgetbv( int xcr );
/* Detects x86 CPU capabilities using CPUID (and XGETBV for OS-enabled
 * register state).
 * Returns a mask of X264_CPU_* flags. Returns 0 if CPUID is unusable or
 * reports no basic leaves, and returns early with no flags if the CPU
 * lacks MMX.
 * Besides instruction set flags, the result may carry vendor/model specific
 * speed hints (SSE2_IS_SLOW/FAST, SLOW_SHUFFLE, ...), a cacheline size flag
 * for older Intel/Cyrix CPUs, and X264_CPU_STACK_MOD4 when the build's stack
 * alignment is below 16. */
uint32_t x264_cpu_detect( void )
{
    uint32_t cpu = 0;
    uint32_t eax, ebx, ecx, edx;
    /* 12 vendor characters plus one zeroed word acting as string terminator */
    uint32_t vendor[4] = {0};
    uint32_t max_extended_cap, max_basic_cap;

#if !ARCH_X86_64
    if( !x264_cpu_cpuid_test() )
        return 0;
#endif

    /* Leaf 0: ebx, edx, ecx are stored in that order so that vendor can be
     * compared as a C string below. */
    x264_cpu_cpuid( 0, &max_basic_cap, vendor+0, vendor+2, vendor+1 );
    if( max_basic_cap == 0 )
        return 0;

    x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
    if( edx&0x00800000 )
        cpu |= X264_CPU_MMX;
    else
        return cpu;
    if( edx&0x02000000 )
        cpu |= X264_CPU_MMX2|X264_CPU_SSE;
    if( edx&0x04000000 )
        cpu |= X264_CPU_SSE2;
    if( ecx&0x00000001 )
        cpu |= X264_CPU_SSE3;
    if( ecx&0x00000200 )
        cpu |= X264_CPU_SSSE3|X264_CPU_SSE2_IS_FAST;
    if( ecx&0x00080000 )
        cpu |= X264_CPU_SSE4;
    if( ecx&0x00100000 )
        cpu |= X264_CPU_SSE42;

    if( ecx&0x08000000 ) /* XGETBV supported and XSAVE enabled by OS */
    {
        uint64_t xcr0 = x264_cpu_xgetbv( 0 );
        if( (xcr0&0x6) == 0x6 ) /* XMM/YMM state */
        {
            if( ecx&0x10000000 )
                cpu |= X264_CPU_AVX;
            if( ecx&0x00001000 )
                cpu |= X264_CPU_FMA3;

            if( max_basic_cap >= 7 )
            {
                x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx );
                if( ebx&0x00000008 )
                    cpu |= X264_CPU_BMI1;
                if( ebx&0x00000100 )
                    cpu |= X264_CPU_BMI2;
                if( ebx&0x00000020 )
                    cpu |= X264_CPU_AVX2;

                if( (xcr0&0xE0) == 0xE0 ) /* OPMASK/ZMM state */
                {
                    /* all of the required AVX-512 feature bits must be set */
                    if( (ebx&0xD0030000) == 0xD0030000 )
                        cpu |= X264_CPU_AVX512;
                }
            }
        }
    }

    x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
    max_extended_cap = eax;

    if( max_extended_cap >= 0x80000001 )
    {
        x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );

        if( ecx&0x00000020 )
            cpu |= X264_CPU_LZCNT;             /* Supported by Intel chips starting with Haswell */
        if( ecx&0x00000040 ) /* SSE4a, AMD only */
        {
            int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
            cpu |= X264_CPU_SSE2_IS_FAST;      /* Phenom and later CPUs have fast SSE units */
            if( family == 0x14 )
            {
                cpu &= ~X264_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
                cpu |= X264_CPU_SSE2_IS_SLOW;  /* Bobcat has 64-bit SIMD units */
                cpu |= X264_CPU_SLOW_PALIGNR;  /* palignr is insanely slow on Bobcat */
            }
            if( family == 0x16 )
            {
                cpu |= X264_CPU_SLOW_PSHUFB;   /* Jaguar's pshufb isn't that slow, but it's slow enough
                                                * compared to alternate instruction sequences that this
                                                * is equal or faster on almost all such functions. */
            }
        }

        if( cpu & X264_CPU_AVX )
        {
            if( ecx&0x00000800 ) /* XOP */
                cpu |= X264_CPU_XOP;
            if( ecx&0x00010000 ) /* FMA4 */
                cpu |= X264_CPU_FMA4;
        }

        if( !strcmp((char*)vendor, "AuthenticAMD") )
        {
            if( edx&0x00400000 )
                cpu |= X264_CPU_MMX2;
            if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) )
                cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
        }
    }

    if( !strcmp((char*)vendor, "GenuineIntel") )
    {
        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
        int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
        int model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
        if( family == 6 )
        {
            /* Detect Atom CPU */
            if( model == 28 )
            {
                cpu |= X264_CPU_SLOW_ATOM;
                cpu |= X264_CPU_SLOW_PSHUFB;
            }
            /* Conroe has a slow shuffle unit. Check the model number to make sure not
             * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
            else if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE4) && model < 23 )
                cpu |= X264_CPU_SLOW_SHUFFLE;
        }
    }

    if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
    {
        /* cacheline size is specified in 3 places, any of which may be missing */
        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
        int cache = (ebx&0xff00)>>5; // cflush size
        if( !cache && max_extended_cap >= 0x80000006 )
        {
            x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
            cache = ecx&0xff; // cacheline size
        }
        if( !cache && max_basic_cap >= 2 )
        {
            // Cache and TLB Information
            /* Both tables are zero-terminated so they can be searched with strchr(). */
            static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
            /* 0x7d replaces a duplicated 0x7c entry; 0x7d is also a 64-byte-line descriptor. */
            static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67,
                                                0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7f, 0x86, 0x87, 0 };
            uint32_t buf[4];
            int max, i = 0;
            do {
                x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 );
                /* low byte of the first register is the number of times the leaf must be queried */
                max = buf[0]&0xff;
                buf[0] &= ~0xff;
                for( int j = 0; j < 4; j++ )
                    if( !(buf[j]>>31) ) /* registers with the top bit set carry no descriptors */
                        while( buf[j] )
                        {
                            int id = buf[j]&0xff;
                            /* Skip null descriptor bytes: strchr() would match them against
                             * the tables' terminating NUL and wrongly report a hit. */
                            if( id )
                            {
                                if( strchr( cache32_ids, id ) )
                                    cache = 32;
                                if( strchr( cache64_ids, id ) )
                                    cache = 64;
                            }
                            buf[j] >>= 8;
                        }
            } while( ++i < max );
        }

        if( cache == 32 )
            cpu |= X264_CPU_CACHELINE_32;
        else if( cache == 64 )
            cpu |= X264_CPU_CACHELINE_64;
        else
            x264_log_internal( X264_LOG_WARNING, "unable to determine cacheline size\n" );
    }

#if STACK_ALIGNMENT < 16
    cpu |= X264_CPU_STACK_MOD4;
#endif

    return cpu;
}
#elif HAVE_ALTIVEC
#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
#define HWCAP_PPC_ALTIVEC (1U << 28)
/* PowerPC detection through the auxiliary vector: reports
 * X264_CPU_ALTIVEC when the AltiVec bit is set in AT_HWCAP, otherwise 0. */
uint32_t x264_cpu_detect( void )
{
    const unsigned long caps = x264_getauxval( AT_HWCAP );
    return ( caps & HWCAP_PPC_ALTIVEC ) ? X264_CPU_ALTIVEC : 0;
}
#elif SYS_MACOSX || SYS_FREEBSD || SYS_NETBSD || SYS_OPENBSD
/* PowerPC detection through sysctl (approach credited to VLC in the
 * original code). Queries the platform's AltiVec sysctl entry and returns
 * X264_CPU_ALTIVEC if the query succeeds with a non-zero value, else 0. */
uint32_t x264_cpu_detect( void )
{
    int vec_unit = 0;
    size_t vec_unit_size = sizeof( vec_unit );
    /* numeric MIB path, only needed by the platforms that use sysctl() */
#if SYS_OPENBSD
    int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
#elif SYS_MACOSX
    int mib[2] = { CTL_HW, HW_VECTORUNIT };
#endif
#if SYS_MACOSX || SYS_OPENBSD
    int rc = sysctl( mib, 2, &vec_unit, &vec_unit_size, NULL, 0 );
#elif SYS_NETBSD
    int rc = sysctlbyname( "machdep.altivec", &vec_unit, &vec_unit_size, NULL, 0 );
#else
    int rc = sysctlbyname( "hw.altivec", &vec_unit, &vec_unit_size, NULL, 0 );
#endif
    if( rc != 0 || vec_unit == 0 )
        return 0;
    return X264_CPU_ALTIVEC;
}
#elif SYS_LINUX
/* PowerPC/Linux detection without auxv support: probes for AltiVec by
 * executing vector code and catching SIGILL.
 * Returns X264_CPU_ALTIVEC if the probe runs to completion, 0 if the
 * handler jumped back (or if built with __NO_FPRS__ defined). */
uint32_t x264_cpu_detect( void )
{
#ifdef __NO_FPRS__
return 0;
#else
/* static so the saved handler does not live in a local across the jump back */
static void (*oldsig)( int );
oldsig = signal( SIGILL, sigill_handler );
/* non-zero return means sigill_handler jumped back here: the probe faulted */
if( sigsetjmp( jmpbuf, 1 ) )
{
signal( SIGILL, oldsig );
return 0;
}
/* tell the handler that a SIGILL from here on belongs to the probe */
canjump = 1;
/* the probe itself: write -1 to SPR 256, then execute a vector AND */
asm volatile( "mtspr 256, %0\n\t"
"vand 0, 0, 0\n\t"
:
: "r"(-1) );
canjump = 0;
/* probe survived: restore the previous SIGILL handler */
signal( SIGILL, oldsig );
return X264_CPU_ALTIVEC;
#endif
}
#else
/* PowerPC fallback for platforms with no detection method above:
 * reports no CPU capabilities. */
uint32_t x264_cpu_detect( void )
{
return 0;
}
#endif
#elif HAVE_ARMV6
void x264_cpu_neon_test( void );
int x264_cpu_fast_neon_mrc_test( void );
#define HWCAP_ARM_NEON (1U << 12)
/* 32-bit ARM detection.
 * Always reports X264_CPU_ARMV6. NEON is reported from the AT_HWCAP bit when
 * auxv access is available; otherwise it is assumed when the build has
 * HAVE_NEON set, or probed by calling x264_cpu_neon_test() under a SIGILL
 * handler. X264_CPU_FAST_NEON_MRC is added when
 * x264_cpu_fast_neon_mrc_test() returns non-zero (not on Apple or Windows).
 * Returns the resulting mask of X264_CPU_* flags. */
uint32_t x264_cpu_detect( void )
{
uint32_t flags = 0;
flags |= X264_CPU_ARMV6;
#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
unsigned long hwcap = x264_getauxval( AT_HWCAP );
if ( hwcap & HWCAP_ARM_NEON )
flags |= X264_CPU_NEON;
#else
// don't do this hack if compiled with -mfpu=neon
#if !HAVE_NEON
/* static so the saved handler does not live in a local across the jump back */
static void (* oldsig)( int );
oldsig = signal( SIGILL, sigill_handler );
/* non-zero return means sigill_handler jumped back here: the probe faulted,
 * so return without the NEON flag (and without the fast-MRC test) */
if( sigsetjmp( jmpbuf, 1 ) )
{
signal( SIGILL, oldsig );
return flags;
}
canjump = 1;
/* presumably executes a NEON instruction; defined outside this file -- confirm */
x264_cpu_neon_test();
canjump = 0;
signal( SIGILL, oldsig );
#endif
/* reached either because HAVE_NEON is set or because the probe did not fault */
flags |= X264_CPU_NEON;
#endif
// fast neon -> arm (Cortex-A9) detection relies on user access to the
// cycle counter; this assumes ARMv7 performance counters.
// NEON requires at least ARMv7, ARMv8 may require changes here, but
// hopefully this hacky detection method will have been replaced by then.
// Note that there is potential for a race condition if another program or
// x264 instance disables or reinits the counters while x264 is using them,
// which may result in incorrect detection and the counters stuck enabled.
// right now Apple does not seem to support performance counters for this test
// Don't test this on Windows; performance counters are readable, but
// the PMNC is not readable.
#if !defined(__MACH__) && !defined(_WIN32)
flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0;
#endif
// TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
return flags;
}
#elif HAVE_RISCV64
#define HWCAP_RISCV64_RVV (1 << ('V' - 'A'))
/* RISC-V 64 detection.
 * With auxv access, X264_CPU_RVV is reported when the 'V' extension bit is
 * set in AT_HWCAP. Without it, RVV is reported only if the build itself was
 * configured with HAVE_RVV. Returns 0 otherwise. */
uint32_t x264_cpu_detect( void )
{
#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
    /* runtime detection */
    const unsigned long caps = x264_getauxval( AT_HWCAP );
    return ( caps & HWCAP_RISCV64_RVV ) ? X264_CPU_RVV : 0;
#elif HAVE_RVV
    /* no runtime detection available: trust the build configuration */
    return X264_CPU_RVV;
#else
    return 0;
#endif
}
#elif HAVE_AARCH64
#if defined(__linux__) || HAVE_ELF_AUX_INFO
#define HWCAP_AARCH64_ASIMDDP (1U << 20)
#define HWCAP_AARCH64_SVE (1U << 22)
#define HWCAP2_AARCH64_SVE2 (1U << 1)
#define HWCAP2_AARCH64_I8MM (1U << 13)
/* AArch64 runtime detection through the auxiliary vector.
 * Translates the DotProd and SVE bits of AT_HWCAP and the I8MM and SVE2
 * bits of AT_HWCAP2 into the matching X264_CPU_* flags. */
static uint32_t detect_flags( void )
{
    const unsigned long caps  = x264_getauxval( AT_HWCAP );
    const unsigned long caps2 = x264_getauxval( AT_HWCAP2 );
    uint32_t found = 0;

    found |= ( caps  & HWCAP_AARCH64_ASIMDDP ) ? X264_CPU_DOTPROD : 0;
    found |= ( caps2 & HWCAP2_AARCH64_I8MM )   ? X264_CPU_I8MM    : 0;
    found |= ( caps  & HWCAP_AARCH64_SVE )     ? X264_CPU_SVE     : 0;
    found |= ( caps2 & HWCAP2_AARCH64_SVE2 )   ? X264_CPU_SVE2    : 0;

    return found;
}
#elif defined(__APPLE__)
#include <sys/sysctl.h>
/* Reads an integer sysctl entry by name.
 * feature: name of the sysctl entry to query.
 * Returns the value read, or 0 if the sysctlbyname() call fails. */
static int have_feature( const char *feature )
{
    int value = 0;
    size_t value_size = sizeof(value);
    return sysctlbyname( feature, &value, &value_size, NULL, 0 ) == 0 ? value : 0;
}
/* AArch64 runtime detection on Apple platforms, based on the
 * hw.optional.arm.* sysctl entries. Only DotProd and I8MM are probed. */
static uint32_t detect_flags( void )
{
    uint32_t found = 0;

    found |= have_feature( "hw.optional.arm.FEAT_DotProd" ) ? X264_CPU_DOTPROD : 0;
    found |= have_feature( "hw.optional.arm.FEAT_I8MM" )    ? X264_CPU_I8MM    : 0;
    /* No SVE and SVE2 feature detection available on Apple platforms. */

    return found;
}
#elif defined(_WIN32)
#include <windows.h>
/* AArch64 runtime detection on Windows via IsProcessorFeaturePresent().
 * Each query is compiled in only if the SDK headers define the
 * corresponding PF_* constant. */
static uint32_t detect_flags( void )
{
    uint32_t found = 0;

#ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
    found |= IsProcessorFeaturePresent( PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE ) ? X264_CPU_DOTPROD : 0;
#endif
#ifdef PF_ARM_SVE_INSTRUCTIONS_AVAILABLE
    found |= IsProcessorFeaturePresent( PF_ARM_SVE_INSTRUCTIONS_AVAILABLE ) ? X264_CPU_SVE : 0;
#endif
#ifdef PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE
    found |= IsProcessorFeaturePresent( PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE ) ? X264_CPU_SVE2 : 0;
#endif
#ifdef PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE
    /* No PF_* flag reports plain I8MM on its own, but the presence of
     * SVE_I8MM implies that regular I8MM is available as well. */
    found |= IsProcessorFeaturePresent( PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE ) ? X264_CPU_I8MM : 0;
#endif

    return found;
}
#endif
/* AArch64 detection.
 * Starts from X264_CPU_ARMV8, adds NEON when the build has HAVE_NEON set,
 * adds every extension the compiler was told to target unconditionally,
 * and finally merges in the result of runtime detection on platforms that
 * provide a detect_flags() implementation. */
uint32_t x264_cpu_detect( void )
{
    uint32_t caps = X264_CPU_ARMV8;

#if HAVE_NEON
    caps |= X264_CPU_NEON;
#endif

    /* Extensions enabled unconditionally at compile time can be assumed
     * to be present on the machine running this binary. */
#if defined(__ARM_FEATURE_DOTPROD)
    caps |= X264_CPU_DOTPROD;
#endif
#if defined(__ARM_FEATURE_MATMUL_INT8)
    caps |= X264_CPU_I8MM;
#endif
#if defined(__ARM_FEATURE_SVE)
    caps |= X264_CPU_SVE;
#endif
#if defined(__ARM_FEATURE_SVE2)
    caps |= X264_CPU_SVE2;
#endif

    /* Runtime detection, where the platform supports it. */
#if defined(__linux__) || HAVE_ELF_AUX_INFO || \
    defined(__APPLE__) || defined(_WIN32)
    caps |= detect_flags();
#endif

    return caps;
}
#elif HAVE_MSA
/* MIPS MSA build: no runtime probing is done here; X264_CPU_MSA is
 * reported unconditionally whenever this variant is compiled in. */
uint32_t x264_cpu_detect( void )
{
return X264_CPU_MSA;
}
#elif HAVE_LSX
#define LA_HWCAP_LSX ( 1U << 4 )
#define LA_HWCAP_LASX ( 1U << 5 )
/* LoongArch detection through the auxiliary vector: maps the LSX and LASX
 * bits of AT_HWCAP (truncated to 32 bits) to X264_CPU_LSX / X264_CPU_LASX. */
uint32_t x264_cpu_detect( void )
{
    const uint32_t caps = (uint32_t)x264_getauxval( AT_HWCAP );
    uint32_t found = 0;

    found |= ( caps & LA_HWCAP_LSX )  ? X264_CPU_LSX  : 0;
    found |= ( caps & LA_HWCAP_LASX ) ? X264_CPU_LASX : 0;

    return found;
}
#else
/* Generic fallback when none of the architecture-specific variants above
 * is compiled in: reports no CPU capabilities. */
uint32_t x264_cpu_detect( void )
{
return 0;
}
#endif
/* Returns the number of processors available for threading.
 * - Built without thread support: always 1.
 * - Windows: delegates to x264_pthread_num_processors_np().
 * - Linux: counts the CPUs in this process's affinity mask; 1 if the mask
 *   cannot be read.
 * - BeOS: cpu_count from get_system_info().
 * - macOS: the "hw.logicalcpu" sysctl; 1 if the query fails.
 * - Other systems: sysconf() online (or configured) processor count; 1 if
 *   sysconf() fails or reports a non-positive value.
 * - Otherwise: 1. */
int x264_cpu_num_processors( void )
{
#if !HAVE_THREAD
    return 1;

#elif SYS_WINDOWS
    return x264_pthread_num_processors_np();

#elif SYS_LINUX
    cpu_set_t p_aff;
    memset( &p_aff, 0, sizeof(p_aff) );
    if( sched_getaffinity( 0, sizeof(p_aff), &p_aff ) )
        return 1;
#if HAVE_CPU_COUNT
    return CPU_COUNT(&p_aff);
#else
    /* no CPU_COUNT(): count the set bits of the mask by hand */
    int np = 0;
    for( size_t bit = 0; bit < 8 * sizeof(p_aff); bit++ )
        np += (((uint8_t *)&p_aff)[bit / 8] >> (bit % 8)) & 1;
    return np;
#endif

#elif SYS_BEOS
    system_info info;
    get_system_info( &info );
    return info.cpu_count;

#elif SYS_MACOSX
    int ncpu;
    size_t length = sizeof( ncpu );
    if( sysctlbyname("hw.logicalcpu", &ncpu, &length, NULL, 0) )
    {
        ncpu = 1;
    }
    return ncpu;

#elif defined(_SC_NPROCESSORS_ONLN)
    /* sysconf() returns -1 on error; never hand a non-positive count to callers */
    long np = sysconf( _SC_NPROCESSORS_ONLN );
    return np > 0 ? (int)np : 1;

#elif defined(_SC_NPROCESSORS_CONF)
    /* sysconf() returns -1 on error; never hand a non-positive count to callers */
    long np = sysconf( _SC_NPROCESSORS_CONF );
    return np > 0 ? (int)np : 1;

#else
    return 1;
#endif
}

56
common/cpu.h Normal file
View File

@@ -0,0 +1,56 @@
/*****************************************************************************
* cpu.h: cpu detection
*****************************************************************************
* Copyright (C) 2004-2025 x264 project
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_CPU_H
#define X264_CPU_H
X264_API uint32_t x264_cpu_detect( void );
X264_API int x264_cpu_num_processors( void );
void x264_cpu_emms( void );
void x264_cpu_sfence( void );
/* x264_emms(): with MMX support this expands to an inline "emms" when x86
 * inline asm is available, otherwise to a call to x264_cpu_emms(); without
 * MMX it expands to nothing. */
#if HAVE_MMX
/* There is no way to forbid the compiler from using float instructions
* before the emms so miscompilation could theoretically occur in the
* unlikely event that the compiler reorders emms and float instructions. */
#if HAVE_X86_INLINE_ASM
/* Clobbering memory makes the compiler less likely to reorder code.
 * All eight x87 stack registers are listed as clobbered as well. */
#define x264_emms() asm volatile( "emms":::"memory","st","st(1)","st(2)", \
"st(3)","st(4)","st(5)","st(6)","st(7)" )
#else
/* No inline asm: fall back to the out-of-line function. */
#define x264_emms() x264_cpu_emms()
#endif
#else
/* No MMX: nothing to do. */
#define x264_emms()
#endif
/* x264_sfence is a plain alias for the out-of-line x264_cpu_sfence(). */
#define x264_sfence x264_cpu_sfence
/* Associates a textual CPU feature name with a 32-bit flag value. */
typedef struct
{
/* Feature name string. */
const char *name;
/* Flag bits that belong to this name (presumably X264_CPU_* values -- confirm in cpu.c). */
uint32_t flags;
} x264_cpu_name_t;
/* Table of name/flag pairs, defined in another translation unit. The array
 * length is not part of this declaration (presumably the table ends with a
 * sentinel entry -- confirm at the definition). */
X264_API extern const x264_cpu_name_t x264_cpu_names[];
#endif

1150
common/dct.c Normal file

File diff suppressed because it is too large Load Diff

77
common/dct.h Normal file
View File

@@ -0,0 +1,77 @@
/*****************************************************************************
* dct.h: transform and zigzag
*****************************************************************************
* Copyright (C) 2004-2025 x264 project
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_DCT_H
#define X264_DCT_H
/* Table of transform function pointers. A pointer to this structure is
 * passed to x264_dct_init() together with the CPU flags.
 * Naming: "sub*_dct*" entries take two pixel blocks and produce
 * coefficients; "add*_idct*" entries take coefficients and a destination
 * pixel block (presumably forward transform of the difference and inverse
 * transform added to the prediction -- confirm in dct.c). */
typedef struct
{
// pix1 stride = FENC_STRIDE
// pix2 stride = FDEC_STRIDE
// p_dst stride = FDEC_STRIDE
/* 4x4 block: 16 coefficients. */
void (*sub4x4_dct) ( dctcoef dct[16], pixel *pix1, pixel *pix2 );
void (*add4x4_idct)( pixel *p_dst, dctcoef dct[16] );
/* 8x8 area as four 4x4 blocks; the "_dc" variants use one coefficient per 4x4 block. */
void (*sub8x8_dct) ( dctcoef dct[4][16], pixel *pix1, pixel *pix2 );
void (*sub8x8_dct_dc) ( dctcoef dct[4], pixel *pix1, pixel *pix2 );
void (*add8x8_idct) ( pixel *p_dst, dctcoef dct[4][16] );
void (*add8x8_idct_dc)( pixel *p_dst, dctcoef dct[4] );
/* 8x16 area, eight DC coefficients. */
void (*sub8x16_dct_dc)( dctcoef dct[8], pixel *pix1, pixel *pix2 );
/* 16x16 area as sixteen 4x4 blocks. */
void (*sub16x16_dct) ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 );
void (*add16x16_idct) ( pixel *p_dst, dctcoef dct[16][16] );
void (*add16x16_idct_dc)( pixel *p_dst, dctcoef dct[16] );
/* 8x8 transform: 64 coefficients per block. */
void (*sub8x8_dct8) ( dctcoef dct[64], pixel *pix1, pixel *pix2 );
void (*add8x8_idct8)( pixel *p_dst, dctcoef dct[64] );
/* 16x16 area as four 8x8 blocks. */
void (*sub16x16_dct8) ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
void (*add16x16_idct8)( pixel *p_dst, dctcoef dct[4][64] );
/* In-place transforms on a 16-entry array (presumably the 4x4 array of luma DC terms). */
void (*dct4x4dc) ( dctcoef d[16] );
void (*idct4x4dc)( dctcoef d[16] );
/* Produces 8 values from eight 16-coefficient blocks
 * (presumably the 2x4 chroma DC transform used for 4:2:2 -- confirm in dct.c). */
void (*dct2x4dc)( dctcoef dct[8], dctcoef dct4x4[8][16] );
} x264_dct_function_t;
/* Table of coefficient-scan function pointers. x264_zigzag_init() receives
 * two such tables, one for progressive and one for interlaced coding. */
typedef struct
{
/* Reorder a coefficient block "dct" into "level" (64 or 16 entries). */
void (*scan_8x8)( dctcoef level[64], dctcoef dct[64] );
void (*scan_4x4)( dctcoef level[16], dctcoef dct[16] );
/* Variants that take source and destination pixels instead of coefficients
 * and return an int (presumably: scan the p_src - p_dst residual, update
 * p_dst and report whether any level is nonzero -- confirm in dct.c). */
int (*sub_8x8) ( dctcoef level[64], const pixel *p_src, pixel *p_dst );
int (*sub_4x4) ( dctcoef level[16], const pixel *p_src, pixel *p_dst );
/* As sub_4x4, with an extra output for one coefficient through "dc". */
int (*sub_4x4ac)( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
/* Rearranges src into dst and writes to nnz
 * (presumably 8x8 coefficients split into four 4x4 blocks for CAVLC, with
 * per-block nonzero flags -- confirm in dct.c). */
void (*interleave_8x8_cavlc)( dctcoef *dst, dctcoef *src, uint8_t *nnz );
} x264_zigzag_function_t;
/* Both entry points are renamed through x264_template()
 * (presumably to give each bit-depth build its own symbol -- confirm where
 * x264_template is defined). */
#define x264_dct_init x264_template(dct_init)
/* Takes the CPU flag mask and the transform function table *dctf. */
void x264_dct_init( uint32_t cpu, x264_dct_function_t *dctf );
#define x264_zigzag_init x264_template(zigzag_init)
/* Takes the CPU flag mask and two scan tables, one for progressive and one
 * for interlaced coding. */
void x264_zigzag_init( uint32_t cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced );
#endif

851
common/deblock.c Normal file
View File

@@ -0,0 +1,851 @@
/*****************************************************************************
* deblock.c: deblocking
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
* Fiona Glaser <fiona@x264.com>
* Henrik Gramner <henrik@gramner.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common.h"
/* Deblocking filter */
/* Alpha threshold table, read through alpha_table(x) = i_alpha_table[x+24].
 * Layout (52+12*3 = 88 entries): 24 leading zeros for x in [-24,-1], 52
 * entries for x in [0,51], then 12 copies of the last value for x in
 * [52,63]. The padding lets an out-of-range "qp + offset" index be looked up
 * without clamping. Entries for x <= 15 are 0, and a zero alpha makes
 * deblock_edge()/deblock_edge_intra() return without filtering.
 * NOTE(review): values look like the alpha' table of the H.264 deblocking
 * specification -- confirm against the standard. */
static const uint8_t i_alpha_table[52+12*3] =
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 4, 4, 5, 6,
7, 8, 9, 10, 12, 13, 15, 17, 20, 22,
25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
80, 90,101,113,127,144,162,182,203,226,
255,255,
255,255,255,255,255,255,255,255,255,255,255,255,
};
/* Beta threshold table, read through beta_table(x) = i_beta_table[x+24].
 * Same layout as the alpha table: 24 leading zeros for x in [-24,-1], 52
 * entries for x in [0,51], then 12 copies of the last value (18) for x in
 * [52,63]. Entries for x <= 15 are 0, which disables filtering.
 * NOTE(review): values look like the beta' table of the H.264 deblocking
 * specification -- confirm against the standard. */
static const uint8_t i_beta_table[52+12*3] =
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 2, 2, 2, 3,
3, 3, 3, 4, 4, 4, 6, 6, 7, 7,
8, 8, 9, 9, 10, 10, 11, 11, 12, 12,
13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
18, 18,
18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
};
/* Clipping-range table, read through tc0_table(x) = i_tc0_table[x+24].
 * Each row holds four values selected by the boundary strength byte
 * (deblock_edge() indexes a row with bS[0..3]). Column 0 is always -1:
 * deblock_luma_c() skips groups whose tc0 is negative, and for chroma
 * deblock_edge() adds b_chroma (1), giving a value <= 0 that
 * deblock_chroma_c() skips.
 * Row layout matches the alpha/beta tables: 24 padding rows for x in
 * [-24,-1], 52 rows for x in [0,51], then 12 copies of the last row. */
static const int8_t i_tc0_table[52+12*3][4] =
{
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
{-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
{-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
{-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
{-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
{-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
{-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
{-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
{-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
};
/* Accessors for the padded threshold tables. The +24 bias skips the leading
 * padding, so x may range over [-24, 63]. tc0_table(x) yields a row of four
 * int8_t values, to be indexed further by a boundary strength. */
#define alpha_table(x) i_alpha_table[(x)+24]
#define beta_table(x) i_beta_table[(x)+24]
#define tc0_table(x) i_tc0_table[(x)+24]
/* Inter (tc-clipped) luma filter for one line of samples crossing an edge.
 * From ffmpeg.
 *   pix     - points at q0, the first sample past the edge
 *   xstride - distance between successive samples across the edge
 *   alpha   - upper bound (exclusive) on |p0 - q0|
 *   beta    - upper bound (exclusive) on the other sample differences
 *   tc0     - base clipping range; when 0, p1/q1 are left unchanged */
static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc0 )
{
    /* Load every tap before writing anything, so all decisions and sums use
     * the unfiltered samples. */
    const int p2 = pix[-3*xstride];
    const int p1 = pix[-2*xstride];
    const int p0 = pix[-1*xstride];
    const int q0 = pix[ 0*xstride];
    const int q1 = pix[ 1*xstride];
    const int q2 = pix[ 2*xstride];

    /* Nothing is filtered unless all three differences are under threshold. */
    if( abs( p0 - q0 ) >= alpha || abs( p1 - p0 ) >= beta || abs( q1 - q0 ) >= beta )
        return;

    const int mid = (p0 + q0 + 1) >> 1;
    int tc = tc0;

    /* Each smooth side widens the p0/q0 clipping range by one and, when
     * tc0 is nonzero, also gets its second sample corrected. */
    if( abs( p2 - p0 ) < beta )
    {
        if( tc0 )
            pix[-2*xstride] = p1 + x264_clip3( ((p2 + mid) >> 1) - p1, -tc0, tc0 );
        tc++;
    }
    if( abs( q2 - q0 ) < beta )
    {
        if( tc0 )
            pix[ 1*xstride] = q1 + x264_clip3( ((q2 + mid) >> 1) - q1, -tc0, tc0 );
        tc++;
    }

    const int delta = x264_clip3( ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc );
    pix[-1*xstride] = x264_clip_pixel( p0 + delta ); /* p0' */
    pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */
}
/* Filters 16 consecutive lines along a luma edge in four groups of four.
 * Group g uses tc0[g]; a negative tc0[g] leaves that group untouched.
 *   xstride - step between samples across the edge
 *   ystride - step between successive lines along the edge */
static inline void deblock_luma_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 )
{
    for( int group = 0; group < 4; group++, pix += 4*ystride )
    {
        const int8_t tc = tc0[group];
        if( tc < 0 )
            continue;

        for( int line = 0; line < 4; line++ )
            deblock_edge_luma_c( pix + line*ystride, xstride, alpha, beta, tc );
    }
}
/* MBAFF variant of the vertical-edge luma filter: only 8 lines are
 * processed, `stride` apart, and every pair of lines shares one tc0 entry. */
static void deblock_h_luma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
    for( int line = 0; line < 8; line++ )
        deblock_edge_luma_c( pix + line*stride, 1, alpha, beta, tc0[line >> 1] );
}
/* Luma filter across a horizontal edge: the taps of one line are `stride`
 * apart (one per row) and the 16 filtered lines are adjacent columns. */
static void deblock_v_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
    const intptr_t tap_step  = stride; /* p/q samples sit in neighbouring rows */
    const intptr_t line_step = 1;      /* next line is the next column */

    deblock_luma_c( pix, tap_step, line_step, alpha, beta, tc0 );
}
/* Luma filter across a vertical edge: the taps of one line are adjacent
 * samples and the 16 filtered lines are `stride` apart (one per row). */
static void deblock_h_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
    const intptr_t tap_step  = 1;      /* p/q samples sit in neighbouring columns */
    const intptr_t line_step = stride; /* next line is the next row */

    deblock_luma_c( pix, tap_step, line_step, alpha, beta, tc0 );
}
/* Inter chroma filter for one line of samples crossing an edge: only p0 and
 * q0 are modified, by a correction clipped to [-tc, tc].
 *   pix     - points at q0, the first sample past the edge
 *   xstride - distance between successive samples across the edge */
static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc )
{
    const int p1 = pix[-2*xstride];
    const int p0 = pix[-1*xstride];
    const int q0 = pix[ 0*xstride];
    const int q1 = pix[ 1*xstride];

    /* Nothing is filtered unless all three differences are under threshold. */
    if( abs( p0 - q0 ) >= alpha || abs( p1 - p0 ) >= beta || abs( q1 - q0 ) >= beta )
        return;

    const int delta = x264_clip3( ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc );
    pix[-1*xstride] = x264_clip_pixel( p0 + delta ); /* p0' */
    pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */
}
/* Filters a chroma edge in four groups of `height` lines. Group g uses
 * tc0[g] and is skipped when that value is <= 0. On every line two adjacent
 * samples (pix and pix+1) are filtered
 * (presumably the interleaved U and V samples -- confirm the plane layout).
 *   xstride - step between samples across the edge
 *   ystride - step between successive lines along the edge */
static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 )
{
    for( int group = 0; group < 4; group++, pix += height*ystride )
    {
        const int tc = tc0[group];
        if( tc <= 0 )
            continue;

        for( int line = 0; line < height; line++ )
        {
            pixel *cur = pix + line*ystride;
            deblock_edge_chroma_c( cur,     xstride, alpha, beta, tc );
            deblock_edge_chroma_c( cur + 1, xstride, alpha, beta, tc );
        }
    }
}
/* MBAFF chroma filter across a vertical edge: one line per tc0 entry
 * (4 lines in total), taps 2 samples apart, lines `stride` apart. */
static void deblock_h_chroma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
    const int      lines_per_group = 1;
    const intptr_t tap_step        = 2;
    const intptr_t line_step       = stride;

    deblock_chroma_c( pix, lines_per_group, tap_step, line_step, alpha, beta, tc0 );
}
/* Chroma filter across a horizontal edge: taps are `stride` apart (one per
 * row); each tc0 entry covers two positions spaced 2 samples apart. */
static void deblock_v_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
    const int      lines_per_group = 2;
    const intptr_t tap_step        = stride;
    const intptr_t line_step       = 2;

    deblock_chroma_c( pix, lines_per_group, tap_step, line_step, alpha, beta, tc0 );
}
/* Chroma filter across a vertical edge: taps are 2 samples apart; each tc0
 * entry covers two rows, giving 8 rows in total. */
static void deblock_h_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
    const int      lines_per_group = 2;
    const intptr_t tap_step        = 2;
    const intptr_t line_step       = stride;

    deblock_chroma_c( pix, lines_per_group, tap_step, line_step, alpha, beta, tc0 );
}
/* 4:2:2 chroma filter across a vertical edge: as deblock_h_chroma_c() but
 * each tc0 entry covers four rows, giving 16 rows in total. */
static void deblock_h_chroma_422_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
    const int      lines_per_group = 4;
    const intptr_t tap_step        = 2;
    const intptr_t line_step       = stride;

    deblock_chroma_c( pix, lines_per_group, tap_step, line_step, alpha, beta, tc0 );
}
/* Intra luma filter for one line of samples crossing an edge.
 * Each side is handled independently:
 *   - strong filter (rewrites three samples) when |p0 - q0| is below
 *     (alpha >> 2) + 2 and that side is smooth (|p2 - p0| or |q2 - q0|
 *     below beta);
 *   - otherwise a 3-tap filter that rewrites only p0 or q0.
 *   pix     - points at q0, the first sample past the edge
 *   xstride - distance between successive samples across the edge */
static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta )
{
    /* Load the six central taps up front; every sum below uses these
     * unfiltered values. */
    const int p2 = pix[-3*xstride];
    const int p1 = pix[-2*xstride];
    const int p0 = pix[-1*xstride];
    const int q0 = pix[ 0*xstride];
    const int q1 = pix[ 1*xstride];
    const int q2 = pix[ 2*xstride];

    if( abs( p0 - q0 ) >= alpha || abs( p1 - p0 ) >= beta || abs( q1 - q0 ) >= beta )
        return;

    const int strong = abs( p0 - q0 ) < ((alpha >> 2) + 2);

    /* p side */
    if( strong && abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
    {
        const int p3 = pix[-4*xstride];
        pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
        pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
        pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
    }
    else /* p0' */
        pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;

    /* q side */
    if( strong && abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
    {
        const int q3 = pix[3*xstride];
        pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
        pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
        pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
    }
    else /* q0' */
        pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
}
/* Applies the intra luma filter to 16 consecutive lines along an edge.
 *   xstride - step between samples across the edge
 *   ystride - step between successive lines along the edge */
static inline void deblock_luma_intra_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta )
{
    pixel *line = pix;
    int remaining = 16;

    while( remaining-- > 0 )
    {
        deblock_edge_luma_intra_c( line, xstride, alpha, beta );
        line += ystride;
    }
}
/* MBAFF variant of the vertical-edge intra luma filter: 8 lines, `ystride`
 * apart, with adjacent samples as taps. */
static void deblock_h_luma_intra_mbaff_c( pixel *pix, intptr_t ystride, int alpha, int beta )
{
    pixel *line = pix;
    int remaining = 8;

    while( remaining-- > 0 )
    {
        deblock_edge_luma_intra_c( line, 1, alpha, beta );
        line += ystride;
    }
}
/* Intra luma filter across a horizontal edge: taps are `stride` apart (one
 * per row) and the 16 filtered lines are adjacent columns. */
static void deblock_v_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
    const intptr_t tap_step  = stride; /* p/q samples sit in neighbouring rows */
    const intptr_t line_step = 1;      /* next line is the next column */

    deblock_luma_intra_c( pix, tap_step, line_step, alpha, beta );
}
/* Intra luma filter across a vertical edge: taps are adjacent samples and
 * the 16 filtered lines are `stride` apart (one per row). */
static void deblock_h_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
    const intptr_t tap_step  = 1;      /* p/q samples sit in neighbouring columns */
    const intptr_t line_step = stride; /* next line is the next row */

    deblock_luma_intra_c( pix, tap_step, line_step, alpha, beta );
}
/* Intra chroma filter for one line of samples crossing an edge: p0 and q0
 * are each replaced by a 3-tap weighted average; no clipping is involved.
 *   pix     - points at q0, the first sample past the edge
 *   xstride - distance between successive samples across the edge */
static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta )
{
    const int p1 = pix[-2*xstride];
    const int p0 = pix[-1*xstride];
    const int q0 = pix[ 0*xstride];
    const int q1 = pix[ 1*xstride];

    /* Nothing is filtered unless all three differences are under threshold. */
    if( abs( p0 - q0 ) >= alpha || abs( p1 - p0 ) >= beta || abs( q1 - q0 ) >= beta )
        return;

    pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2; /* p0' */
    pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */
}
/* Applies the intra chroma filter to `height` lines, filtering `width`
 * adjacent samples on each line.
 * The pointer advances by ystride - 2 + width from one line to the next:
 * that is ystride when width is 2, and a single sample when width is 1
 * and ystride is 2.
 *   xstride - step between samples across the edge */
static ALWAYS_INLINE void deblock_chroma_intra_c( pixel *pix, int width, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta )
{
    const intptr_t line_step = ystride - 2 + width;

    for( int line = 0; line < height; line++, pix += line_step )
        for( int col = 0; col < width; col++ )
            deblock_edge_chroma_intra_c( pix + col, xstride, alpha, beta );
}
/* MBAFF intra chroma filter across a vertical edge: 4 rows, two adjacent
 * samples per row, taps 2 samples apart. */
static void deblock_h_chroma_intra_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
    const int      samples_per_line = 2;
    const int      lines            = 4;
    const intptr_t tap_step         = 2;
    const intptr_t line_step        = stride;

    deblock_chroma_intra_c( pix, samples_per_line, lines, tap_step, line_step, alpha, beta );
}
/* Intra chroma filter across a horizontal edge: taps are `stride` apart
 * (one per row). With one sample per line and a line step of 2, the helper
 * advances a single sample per iteration, so 16 consecutive samples of the
 * row are filtered. */
static void deblock_v_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
    const int      samples_per_line = 1;
    const int      lines            = 16;
    const intptr_t tap_step         = stride;
    const intptr_t line_step        = 2;

    deblock_chroma_intra_c( pix, samples_per_line, lines, tap_step, line_step, alpha, beta );
}
/* Intra chroma filter across a vertical edge: 8 rows, two adjacent samples
 * per row, taps 2 samples apart. */
static void deblock_h_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
    const int      samples_per_line = 2;
    const int      lines            = 8;
    const intptr_t tap_step         = 2;
    const intptr_t line_step        = stride;

    deblock_chroma_intra_c( pix, samples_per_line, lines, tap_step, line_step, alpha, beta );
}
/* 4:2:2 intra chroma filter across a vertical edge: as
 * deblock_h_chroma_intra_c() but over 16 rows. */
static void deblock_h_chroma_422_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
    const int      samples_per_line = 2;
    const int      lines            = 16;
    const intptr_t tap_step         = 2;
    const intptr_t line_step        = stride;

    deblock_chroma_intra_c( pix, samples_per_line, lines, tap_step, line_step, alpha, beta );
}
/* Computes boundary strengths for the inter case, for both directions and
 * for edges 0..3 with four positions each; the result goes to
 * bs[dir][edge][i]. For every position the block at `cur` is compared with
 * its neighbour one step back across the edge:
 *   2 - either block has a nonzero nnz entry
 *   1 - list-0 references differ, or a list-0 motion vector component
 *       differs by >= 4 (x) or >= mvy_limit (y); with bframe set the same
 *       tests are also applied to list 1
 *   0 - otherwise
 * nnz, ref and mv are indexed with scan8-style offsets from X264_SCAN8_0. */
static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit,
                                int bframe )
{
    for( int dir = 0; dir < 2; dir++ )
    {
        const int along  = dir ? 1 : 8; /* step between positions on one edge */
        const int across = dir ? 8 : 1; /* step from one edge to the next */

        for( int edge = 0; edge < 4; edge++ )
        {
            int cur = X264_SCAN8_0 + edge*across;

            for( int i = 0; i < 4; i++, cur += along )
            {
                const int nbr = cur - across;
                uint8_t strength;

                if( nnz[cur] || nnz[nbr] )
                    strength = 2;
                else
                {
                    int differs = ref[0][cur] != ref[0][nbr] ||
                                  abs( mv[0][cur][0] - mv[0][nbr][0] ) >= 4 ||
                                  abs( mv[0][cur][1] - mv[0][nbr][1] ) >= mvy_limit;

                    /* List 1 is only inspected for B-frames, and only when
                     * list 0 has not already decided the outcome. */
                    if( !differs && bframe )
                        differs = ref[1][cur] != ref[1][nbr] ||
                                  abs( mv[1][cur][0] - mv[1][nbr][0] ) >= 4 ||
                                  abs( mv[1][cur][1] - mv[1][nbr][1] ) >= mvy_limit;

                    strength = differs ? 1 : 0;
                }

                bs[dir][edge][i] = strength;
            }
        }
    }
}
/* Looks up the thresholds for one inter edge and runs pf_inter on it.
 *   i_qp     - QP used for the table lookups
 *   a, b     - offsets added to i_qp to form the alpha/tc0 and beta indices
 *   bS       - four boundary strengths, one per group of lines
 *   b_chroma - added to every tc value (callers in this file pass 0 or 1)
 * Returns without filtering when all four strengths are zero or when either
 * threshold is zero. Thresholds and tc values are scaled by
 * 1 << (BIT_DEPTH-8). h is not used in the body. */
static ALWAYS_INLINE void deblock_edge( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp,
                                        int a, int b, int b_chroma, x264_deblock_inter_t pf_inter )
{
    const int index_a = i_qp + a;
    const int index_b = i_qp + b;
    const int alpha = alpha_table(index_a) << (BIT_DEPTH-8);
    const int beta  = beta_table(index_b) << (BIT_DEPTH-8);

    if( !M32(bS) || !alpha || !beta )
        return;

    int8_t tc[4];
    for( int i = 0; i < 4; i++ )
        tc[i] = (tc0_table(index_a)[bS[i]] * (1 << (BIT_DEPTH-8))) + b_chroma;

    pf_inter( pix, i_stride, alpha, beta, tc );
}
/* Looks up the thresholds for one intra edge and runs pf_intra on it.
 * The parameter list mirrors deblock_edge() so the FILTER macro can call
 * either through token pasting; h, bS and b_chroma are not used in the body.
 * Nothing is filtered when either threshold is zero. */
static ALWAYS_INLINE void deblock_edge_intra( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp,
                                              int a, int b, int b_chroma, x264_deblock_intra_t pf_intra )
{
    const int alpha = alpha_table( i_qp + a ) << (BIT_DEPTH-8);
    const int beta  = beta_table( i_qp + b ) << (BIT_DEPTH-8);

    if( alpha && beta )
        pf_intra( pix, i_stride, alpha, beta );
}
/* Fills the h->mb fields that x264_frame_deblock_row() needs for the
 * macroblock at (mb_x, mb_y): i_mb_xy, b_interlaced, i_mb_top_y,
 * i_mb_top_xy, i_mb_left_xy[0..1] and the MB_LEFT / MB_TOP bits of
 * i_neighbour.
 * NOTE(review): MB_INTERLACED, SLICE_MBAFF and PARAM_INTERLACED are macros
 * defined elsewhere; MB_INTERLACED presumably reads h->mb.b_interlaced, so
 * the statement order below matters -- confirm before reordering. */
static ALWAYS_INLINE void macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
{
/* Edges that coincide with slice boundaries are filtered unless the slice
 * header's disable_deblocking_filter_idc is 2. */
int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
h->mb.i_neighbour = 0;
h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
h->mb.b_interlaced = PARAM_INTERLACED && h->mb.field[h->mb.i_mb_xy];
/* The top neighbour is one row up, or two rows up when MB_INTERLACED is 1. */
h->mb.i_mb_top_y = mb_y - (1 << MB_INTERLACED);
h->mb.i_mb_top_xy = mb_x + h->mb.i_mb_stride*h->mb.i_mb_top_y;
/* Default: both left-neighbour slots point at the MB immediately to the left. */
h->mb.i_mb_left_xy[1] =
h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1;
if( SLICE_MBAFF )
{
if( mb_y&1 )
{
/* Odd row: if the left MB's field flag differs from ours, slot 0 moves up
 * one MB row. */
if( mb_x && h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED )
h->mb.i_mb_left_xy[0] -= h->mb.i_mb_stride;
}
else
{
/* Even row, field MB whose top neighbour is a frame MB: use the MB one row
 * further down as the top neighbour. */
if( h->mb.i_mb_top_xy >= 0 && MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy] )
{
h->mb.i_mb_top_xy += h->mb.i_mb_stride;
h->mb.i_mb_top_y++;
}
/* Even row: if the left MB's field flag differs from ours, slot 1 moves
 * down one MB row. */
if( mb_x && h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED )
h->mb.i_mb_left_xy[1] += h->mb.i_mb_stride;
}
}
/* A neighbour counts only if it exists and is either in the same slice or
 * slice-edge filtering is enabled. */
if( mb_x > 0 && (deblock_on_slice_edges ||
h->mb.slice_table[h->mb.i_mb_left_xy[0]] == h->mb.slice_table[h->mb.i_mb_xy]) )
h->mb.i_neighbour |= MB_LEFT;
if( mb_y > MB_INTERLACED && (deblock_on_slice_edges
|| h->mb.slice_table[h->mb.i_mb_top_xy] == h->mb.slice_table[h->mb.i_mb_xy]) )
h->mb.i_neighbour |= MB_TOP;
}
/* Runs the deblocking filter over macroblock row mb_y of the reconstructed
 * frame h->fdec. For each macroblock the order is: left MB edge, interior
 * vertical edges 1..3, top MB edge, interior horizontal edges 1..3.
 * When SLICE_MBAFF is set, both rows of the macroblock pair are processed:
 * mb_y is toggled on every iteration and mb_x advances after the odd row. */
void x264_frame_deblock_row( x264_t *h, int mb_y )
{
int b_interlaced = SLICE_MBAFF;
/* Offsets added to the QP to form the alpha/tc0 (a) and beta (b) table indices. */
int a = h->sh.i_alpha_c0_offset - QP_BD_OFFSET;
int b = h->sh.i_beta_offset - QP_BD_OFFSET;
/* The alpha and beta tables are zero for indices <= 15, so with a QP at or
 * below this threshold at least one luma threshold is zero and
 * deblock_edge() returns early. Used below to skip interior edges.
 * NOTE(review): the chroma QP offset term presumably extends the same
 * guarantee to the chroma QP -- confirm. */
int qp_thresh = 15 - X264_MIN( a, b ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset );
int stridey = h->fdec->i_stride[0];
int strideuv = h->fdec->i_stride[1];
int chroma_format = CHROMA_FORMAT;
int chroma444 = CHROMA444;
int chroma_height = 16 >> CHROMA_V_SHIFT;
/* For 4:4:4 this is the distance from plane 1 to plane 2; otherwise 1. */
intptr_t uvdiff = chroma444 ? h->fdec->plane[2] - h->fdec->plane[1] : 1;
/* Non-MBAFF: mb_x advances by 1 and mb_y stays fixed.
 * MBAFF: mb_x advances only after an odd mb_y, and mb_y toggles each pass. */
for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
{
x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y );
int mb_xy = h->mb.i_mb_xy;
int transform_8x8 = h->mb.mb_transform_size[mb_xy];
int intra_cur = IS_INTRA( h->mb.type[mb_xy] );
/* Boundary strengths for this macroblock, stored as bs[dir][edge][i]. */
uint8_t (*bs)[8][4] = h->deblock_strength[mb_y&1][h->param.b_sliced_threads?mb_xy:mb_x];
/* Top-left sample of the macroblock in the luma and chroma planes. */
pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
pixel *pixuv = CHROMA_FORMAT ? h->fdec->plane[1] + chroma_height*mb_y*strideuv + 16*mb_x : NULL;
/* Odd row with MB_INTERLACED set: step back so the MB starts on the second
 * line of the macroblock above (16*mb_y - 15 rows). */
if( mb_y & MB_INTERLACED )
{
pixy -= 15*stridey;
if( CHROMA_FORMAT )
pixuv -= (chroma_height-1)*strideuv;
}
/* Row strides are doubled when MB_INTERLACED is 1. */
int stride2y = stridey << MB_INTERLACED;
int stride2uv = strideuv << MB_INTERLACED;
int qp = h->mb.qp[mb_xy];
int qpc = h->chroma_qp_table[qp];
/* Interior edges are skipped for a non-intra 16x16 partition with a zero
 * cbp, or when the QP is at or below qp_thresh. */
int first_edge_only = (h->mb.partition[mb_xy] == D_16x16 && !h->mb.cbp[mb_xy] && !intra_cur) || qp <= qp_thresh;
/* FILTER( intra, dir, edge, qp, chroma_qp ): filter edge number `edge` in
 * direction `dir` (0 = vertical edge, 1 = horizontal edge). `intra` is
 * either empty or _intra and is pasted onto deblock_edge and the loopf
 * member names.
 *  - Luma: every edge, except odd edges when the 8x8 transform is used.
 *  - 4:4:4: both chroma planes are filtered like luma, using chroma_qp.
 *  - 4:2:0: chroma on even edges only, under the same transform check.
 *  - 4:2:2: chroma on every horizontal edge and on even vertical edges,
 *    independent of the transform size. */
#define FILTER( intra, dir, edge, qp, chroma_qp )\
do\
{\
if( !(edge & 1) || !transform_8x8 )\
{\
deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\
stride2y, bs[dir][edge], qp, a, b, 0,\
h->loopf.deblock_luma##intra[dir] );\
if( chroma_format == CHROMA_444 )\
{\
deblock_edge##intra( h, pixuv + 4*edge*(dir?stride2uv:1),\
stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
h->loopf.deblock_luma##intra[dir] );\
deblock_edge##intra( h, pixuv + uvdiff + 4*edge*(dir?stride2uv:1),\
stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
h->loopf.deblock_luma##intra[dir] );\
}\
else if( chroma_format == CHROMA_420 && !(edge & 1) )\
{\
deblock_edge##intra( h, pixuv + edge*(dir?2*stride2uv:4),\
stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\
h->loopf.deblock_chroma##intra[dir] );\
}\
}\
if( chroma_format == CHROMA_422 && (dir || !(edge & 1)) )\
{\
deblock_edge##intra( h, pixuv + edge*(dir?4*stride2uv:4),\
stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\
h->loopf.deblock_chroma##intra[dir] );\
}\
} while( 0 )
/* Left macroblock edge. */
if( h->mb.i_neighbour & MB_LEFT )
{
/* MBAFF with a left neighbour of the other field/frame kind: the edge is
 * filtered in two halves with the *_mbaff functions, each half with its own
 * left neighbour (i_mb_left_xy[0] / [1]), averaged QPs and strengths
 * (bs[0][0] / bs[0][4]). */
if( b_interlaced && h->mb.field[h->mb.i_mb_left_xy[0]] != MB_INTERLACED )
{
int luma_qp[2];
int chroma_qp[2];
int left_qp[2];
x264_deblock_inter_t luma_deblock = h->loopf.deblock_luma_mbaff;
x264_deblock_inter_t chroma_deblock = h->loopf.deblock_chroma_mbaff;
x264_deblock_intra_t luma_intra_deblock = h->loopf.deblock_luma_intra_mbaff;
x264_deblock_intra_t chroma_intra_deblock = h->loopf.deblock_chroma_intra_mbaff;
/* b_chroma argument for the chroma calls: 0 for 4:4:4, else 1. */
int c = chroma444 ? 0 : 1;
/* First half: starts at the MB's first row and uses a doubled row stride. */
left_qp[0] = h->mb.qp[h->mb.i_mb_left_xy[0]];
luma_qp[0] = (qp + left_qp[0] + 1) >> 1;
chroma_qp[0] = (qpc + h->chroma_qp_table[left_qp[0]] + 1) >> 1;
if( intra_cur || IS_INTRA( h->mb.type[h->mb.i_mb_left_xy[0]] ) )
{
deblock_edge_intra( h, pixy, 2*stridey, bs[0][0], luma_qp[0], a, b, 0, luma_intra_deblock );
if( chroma_format )
{
deblock_edge_intra( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock );
if( chroma444 )
deblock_edge_intra( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock );
}
}
else
{
deblock_edge( h, pixy, 2*stridey, bs[0][0], luma_qp[0], a, b, 0, luma_deblock );
if( chroma_format )
{
deblock_edge( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock );
if( chroma444 )
deblock_edge( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock );
}
}
/* Second half: the start row is stride<<off below the first half's start.
 * With MB_INTERLACED that is 16 luma rows (16 >> CHROMA_V_SHIFT chroma
 * rows); otherwise it is one row. */
int offy = MB_INTERLACED ? 4 : 0;
int offuv = MB_INTERLACED ? 4-CHROMA_V_SHIFT : 0;
left_qp[1] = h->mb.qp[h->mb.i_mb_left_xy[1]];
luma_qp[1] = (qp + left_qp[1] + 1) >> 1;
chroma_qp[1] = (qpc + h->chroma_qp_table[left_qp[1]] + 1) >> 1;
if( intra_cur || IS_INTRA( h->mb.type[h->mb.i_mb_left_xy[1]] ) )
{
deblock_edge_intra( h, pixy + (stridey<<offy), 2*stridey, bs[0][4], luma_qp[1], a, b, 0, luma_intra_deblock );
if( chroma_format )
{
deblock_edge_intra( h, pixuv + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_intra_deblock );
if( chroma444 )
deblock_edge_intra( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_intra_deblock );
}
}
else
{
deblock_edge( h, pixy + (stridey<<offy), 2*stridey, bs[0][4], luma_qp[1], a, b, 0, luma_deblock );
if( chroma_format )
{
deblock_edge( h, pixuv + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_deblock );
if( chroma444 )
deblock_edge( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_deblock );
}
}
}
else
{
/* Regular left edge: QPs are the rounded average of this MB and its left
 * neighbour. */
int qpl = h->mb.qp[h->mb.i_mb_xy-1];
int qp_left = (qp + qpl + 1) >> 1;
int qpc_left = (qpc + h->chroma_qp_table[qpl] + 1) >> 1;
int intra_left = IS_INTRA( h->mb.type[h->mb.i_mb_xy-1] );
int intra_deblock = intra_cur || intra_left;
/* Any MB that was coded, or that analysis decided to skip, has quality commensurate with its QP.
* But if deblocking affects neighboring MBs that were force-skipped, blur might accumulate there.
* So reset their effective QP to max, to indicate that lack of guarantee. */
if( h->fdec->mb_info && M32( bs[0][0] ) )
{
/* ORs 0xff into effective_qp[xy] when the MB has X264_MBINFO_CONSTANT set.
 * The macro is reused for the top edge further down. */
#define RESET_EFFECTIVE_QP(xy) h->fdec->effective_qp[xy] |= 0xff * !!(h->fdec->mb_info[xy] & X264_MBINFO_CONSTANT);
RESET_EFFECTIVE_QP(mb_xy);
RESET_EFFECTIVE_QP(h->mb.i_mb_left_xy[0]);
}
if( intra_deblock )
FILTER( _intra, 0, 0, qp_left, qpc_left );
else
FILTER( , 0, 0, qp_left, qpc_left );
}
}
/* Interior vertical edges. */
if( !first_edge_only )
{
FILTER( , 0, 1, qp, qpc );
FILTER( , 0, 2, qp, qpc );
FILTER( , 0, 3, qp, qpc );
}
/* Top macroblock edge. */
if( h->mb.i_neighbour & MB_TOP )
{
/* MBAFF, even row, this MB not MB_INTERLACED, top neighbour a field MB:
 * the edge is filtered twice, once against each of the two MBs above,
 * always with the inter filter (strength forced to 3 when either side is
 * intra). */
if( b_interlaced && !(mb_y&1) && !MB_INTERLACED && h->mb.field[h->mb.i_mb_top_xy] )
{
int mbn_xy = mb_xy - 2 * h->mb.i_mb_stride;
for( int j = 0; j < 2; j++, mbn_xy += h->mb.i_mb_stride )
{
int qpt = h->mb.qp[mbn_xy];
int qp_top = (qp + qpt + 1) >> 1;
int qpc_top = (qpc + h->chroma_qp_table[qpt] + 1) >> 1;
int intra_top = IS_INTRA( h->mb.type[mbn_xy] );
if( intra_cur || intra_top )
M32( bs[1][4*j] ) = 0x03030303;
// deblock the first horizontal edge of the even rows, then the first horizontal edge of the odd rows
deblock_edge( h, pixy + j*stridey, 2* stridey, bs[1][4*j], qp_top, a, b, 0, h->loopf.deblock_luma[1] );
if( chroma444 )
{
deblock_edge( h, pixuv + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 0, h->loopf.deblock_luma[1] );
deblock_edge( h, pixuv + uvdiff + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 0, h->loopf.deblock_luma[1] );
}
else if( chroma_format )
deblock_edge( h, pixuv + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 1, h->loopf.deblock_chroma[1] );
}
}
else
{
/* Regular top edge: QPs are the rounded average of this MB and its top
 * neighbour. */
int qpt = h->mb.qp[h->mb.i_mb_top_xy];
int qp_top = (qp + qpt + 1) >> 1;
int qpc_top = (qpc + h->chroma_qp_table[qpt] + 1) >> 1;
int intra_top = IS_INTRA( h->mb.type[h->mb.i_mb_top_xy] );
int intra_deblock = intra_cur || intra_top;
/* This edge has been modified, reset effective qp to max. */
if( h->fdec->mb_info && M32( bs[1][0] ) )
{
RESET_EFFECTIVE_QP(mb_xy);
RESET_EFFECTIVE_QP(h->mb.i_mb_top_xy);
}
/* The intra filter is used only when not in MBAFF, or when neither this MB
 * nor the top MB is a field MB; otherwise intra edges go through the inter
 * filter with strength 3. */
if( (!b_interlaced || (!MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy])) && intra_deblock )
{
FILTER( _intra, 1, 0, qp_top, qpc_top );
}
else
{
if( intra_deblock )
M32( bs[1][0] ) = 0x03030303;
FILTER( , 1, 0, qp_top, qpc_top );
}
}
}
/* Interior horizontal edges. */
if( !first_edge_only )
{
FILTER( , 1, 1, qp, qpc );
FILTER( , 1, 2, qp, qpc );
FILTER( , 1, 3, qp, qpc );
}
#undef FILTER
}
}
/* Deblocks the interior edges of the current macroblock in the fdec cache,
 * for deblock-aware RD. Only edges 1..3 in each direction are filtered, in
 * luma and, for 4:4:4, in both chroma planes.
 * TODO:
 *  deblock macroblock edges
 *  support analysis partitions smaller than 16x16
 *  deblock chroma for 4:2:0/4:2:2
 *  handle duplicate refs correctly
 */
void x264_macroblock_deblock( x264_t *h )
{
    const int a = h->sh.i_alpha_c0_offset - QP_BD_OFFSET;
    const int b = h->sh.i_beta_offset - QP_BD_OFFSET;
    const int qp_thresh = 15 - X264_MIN( a, b ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset );
    const int intra_cur = IS_INTRA( h->mb.i_type );
    const int qp = h->mb.i_qp;
    const int qpc = h->mb.i_chroma_qp;

    /* Nothing to do at or below the QP threshold, or for a non-intra 16x16
     * partition without luma coefficients. */
    if( qp <= qp_thresh || (h->mb.i_partition == D_16x16 && !h->mb.i_cbp_luma && !intra_cur) )
        return;

    uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength;
    if( intra_cur )
    {
        /* Strength 3 on edges 1..3 in both directions; the 64-bit store
         * covers edges 2 and 3 at once. */
        for( int dir = 0; dir < 2; dir++ )
        {
            M32( bs[dir][1] ) = 0x03030303;
            M64( bs[dir][2] ) = 0x0303030303030303ULL;
        }
    }
    else
        h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv,
                                   bs, 4 >> MB_INTERLACED, h->sh.i_type == SLICE_TYPE_B );

    const int transform_8x8 = h->mb.b_transform_8x8;
    const int planes = CHROMA444 ? 3 : 1;

    for( int dir = 0; dir < 2; dir++ )
        for( int edge = 1; edge < 4; edge++ )
        {
            /* Odd edges are skipped when the 8x8 transform is used. */
            if( (edge & 1) && transform_8x8 )
                continue;

            const intptr_t offset = 4*edge*(dir ? FDEC_STRIDE : 1);

            /* Plane 0 is luma (qp); planes 1 and 2 use the chroma QP but the
             * luma filter functions. */
            for( int plane = 0; plane < planes; plane++ )
                deblock_edge( h, h->mb.pic.p_fdec[plane] + offset,
                              FDEC_STRIDE, bs[dir][edge], plane ? qpc : qp, a, b, 0,
                              h->loopf.deblock_luma[dir] );
        }
}
#if HAVE_MMX
#include "x86/deblock.h"
#endif
#if HAVE_ALTIVEC
#include "ppc/deblock.h"
#endif
#if HAVE_ARMV6
#include "arm/deblock.h"
#endif
#if HAVE_AARCH64
#include "aarch64/deblock.h"
#endif
#if HAVE_MSA
#include "mips/deblock.h"
#endif
#if HAVE_LSX
#include "loongarch/deblock.h"
#endif
/* Fills the deblocking function table *pf.
 * The C reference implementations are installed first; each later section
 * overrides entries when the matching bit is set in `cpu`, so the order of
 * the sections decides which implementation wins.
 *   cpu     - CPU capability mask (X264_CPU_* bits)
 *   pf      - table to fill
 *   b_mbaff - not referenced in this function body
 * Index convention for the two-entry arrays: [1] receives the "_v_"
 * functions and [0] the "_h_" functions.
 * NOTE(review): deblock_chroma[0], deblock_chroma_intra[0],
 * deblock_chroma_mbaff and deblock_chroma_intra_mbaff are not assigned here;
 * presumably they are selected elsewhere from the 420/422 entries set below
 * -- confirm at the call site. */
void x264_deblock_init( uint32_t cpu, x264_deblock_function_t *pf, int b_mbaff )
{
/* Portable C defaults. */
pf->deblock_luma[1] = deblock_v_luma_c;
pf->deblock_luma[0] = deblock_h_luma_c;
pf->deblock_chroma[1] = deblock_v_chroma_c;
pf->deblock_h_chroma_420 = deblock_h_chroma_c;
pf->deblock_h_chroma_422 = deblock_h_chroma_422_c;
pf->deblock_luma_intra[1] = deblock_v_luma_intra_c;
pf->deblock_luma_intra[0] = deblock_h_luma_intra_c;
pf->deblock_chroma_intra[1] = deblock_v_chroma_intra_c;
pf->deblock_h_chroma_420_intra = deblock_h_chroma_intra_c;
pf->deblock_h_chroma_422_intra = deblock_h_chroma_422_intra_c;
pf->deblock_luma_mbaff = deblock_h_luma_mbaff_c;
pf->deblock_chroma_420_mbaff = deblock_h_chroma_mbaff_c;
pf->deblock_luma_intra_mbaff = deblock_h_luma_intra_mbaff_c;
pf->deblock_chroma_420_intra_mbaff = deblock_h_chroma_intra_mbaff_c;
pf->deblock_strength = deblock_strength_c;
/* x86: all overrides are nested under the MMX2 check. This section is not
 * restricted to a particular bit depth. */
#if HAVE_MMX
if( cpu&X264_CPU_MMX2 )
{
/* The MMX2 versions in this group are installed only when ARCH_X86 is set. */
#if ARCH_X86
pf->deblock_luma[1] = x264_deblock_v_luma_mmx2;
pf->deblock_luma[0] = x264_deblock_h_luma_mmx2;
pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2;
pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_mmx2;
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_mmx2;
pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_mmx2;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_mmx2;
pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2;
#endif
/* For 8-bit builds the MMX2 intra MBAFF chroma function is installed
 * regardless of ARCH_X86. */
#if !HIGH_BIT_DEPTH
pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2;
#endif
if( cpu&X264_CPU_SSE2 )
{
pf->deblock_strength = x264_deblock_strength_sse2;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2;
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2;
pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_sse2;
pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_sse2;
pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
pf->deblock_luma[0] = x264_deblock_h_luma_sse2;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
/* These are skipped when X264_CPU_STACK_MOD4 is set
 * (presumably they need a 16-byte aligned stack -- confirm). */
if( !(cpu&X264_CPU_STACK_MOD4) )
{
pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_sse2;
#if HIGH_BIT_DEPTH
pf->deblock_chroma_420_intra_mbaff= x264_deblock_h_chroma_intra_mbaff_sse2;
#endif
}
}
if( cpu&X264_CPU_SSSE3 )
pf->deblock_strength = x264_deblock_strength_ssse3;
if( cpu&X264_CPU_AVX )
{
pf->deblock_strength = x264_deblock_strength_avx;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_avx;
pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_avx;
pf->deblock_luma[1] = x264_deblock_v_luma_avx;
pf->deblock_luma[0] = x264_deblock_h_luma_avx;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
/* Same X264_CPU_STACK_MOD4 restriction as in the SSE2 group. */
if( !(cpu&X264_CPU_STACK_MOD4) )
{
pf->deblock_chroma[1] = x264_deblock_v_chroma_avx;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_avx;
#if HIGH_BIT_DEPTH
pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_avx;
pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_avx;
#endif
}
}
/* AVX2 and AVX-512 only replace the strength computation. */
if( cpu&X264_CPU_AVX2 )
{
pf->deblock_strength = x264_deblock_strength_avx2;
}
if( cpu&X264_CPU_AVX512 )
{
pf->deblock_strength = x264_deblock_strength_avx512;
}
}
#endif
/* All remaining architecture overrides are compiled for 8-bit builds only. */
#if !HIGH_BIT_DEPTH
#if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC )
{
pf->deblock_luma[1] = x264_deblock_v_luma_altivec;
pf->deblock_luma[0] = x264_deblock_h_luma_altivec;
}
#endif // HAVE_ALTIVEC
#if HAVE_ARMV6 || HAVE_AARCH64
if( cpu&X264_CPU_NEON )
{
pf->deblock_luma[1] = x264_deblock_v_luma_neon;
pf->deblock_luma[0] = x264_deblock_h_luma_neon;
pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon;
pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon;
pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_neon;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_neon;
pf->deblock_strength = x264_deblock_strength_neon;
}
/* SVE replaces a single entry; it is applied after the NEON block. */
#if HAVE_SVE
if ( cpu&X264_CPU_SVE )
{
pf->deblock_chroma[1] = x264_deblock_v_chroma_sve;
}
#endif
#endif
#if HAVE_MSA
if( cpu&X264_CPU_MSA )
{
pf->deblock_luma[1] = x264_deblock_v_luma_msa;
pf->deblock_luma[0] = x264_deblock_h_luma_msa;
pf->deblock_chroma[1] = x264_deblock_v_chroma_msa;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_msa;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_msa;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_msa;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_msa;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_msa;
pf->deblock_strength = x264_deblock_strength_msa;
}
#endif
/* LoongArch: the LASX assignments come after LSX and override them where
 * both are set. */
#if HAVE_LSX
if( cpu&X264_CPU_LSX )
{
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_lsx;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_lsx;
pf->deblock_strength = x264_deblock_strength_lsx;
}
if( cpu&X264_CPU_LASX )
{
pf->deblock_luma[1] = x264_deblock_v_luma_lasx;
pf->deblock_luma[0] = x264_deblock_h_luma_lasx;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_lasx;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_lasx;
pf->deblock_strength = x264_deblock_strength_lasx;
}
#endif
#endif // !HIGH_BIT_DEPTH
/* These functions are equivalent, so don't duplicate them.
 * This must stay last so the aliases pick up whichever 4:2:0 implementation
 * was selected above. */
pf->deblock_chroma_422_mbaff = pf->deblock_h_chroma_420;
pf->deblock_chroma_422_intra_mbaff = pf->deblock_h_chroma_420_intra;
}

898
common/frame.c Normal file
View File

@@ -0,0 +1,898 @@
/*****************************************************************************
* frame.c: frame handling
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
* Fiona Glaser <fiona@x264.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common.h"
/* Pad a row width to the requested alignment via ALIGN( x, align ), then make
 * sure the result is not a multiple of disalign: if its low bits under the
 * (disalign-1) mask are all zero, one further `align` step is added.
 * NOTE(review): the mask test assumes disalign is a power of two. */
static int align_stride( int x, int align, int disalign )
{
    int stride = ALIGN( x, align );
    int b_on_disalign_boundary = !(stride & (disalign-1));
    return b_on_disalign_boundary ? stride + align : stride;
}
/* Return a plane size (in pixels) that is not a multiple of disalign.
 * If x already is one, it is bumped by max( 128, NATIVE_ALIGN ) bytes,
 * expressed in pixels; otherwise x is returned unchanged. */
static int align_plane_size( int x, int disalign )
{
    int b_on_disalign_boundary = !(x & (disalign-1));
    if( !b_on_disalign_boundary )
        return x;
    return x + X264_MAX( 128, NATIVE_ALIGN ) / SIZEOF_PIXEL;
}
/* Map an external (user-facing) colorspace to the layout used for internal
 * frame storage. Flag bits outside X264_CSP_MASK are ignored.
 * Returns X264_CSP_NONE when the colorspace falls in none of the known ranges. */
static int frame_internal_csp( int external_csp )
{
    int csp = external_csp & X264_CSP_MASK;
    int internal_csp = X264_CSP_NONE;

    if( csp == X264_CSP_I400 )
        internal_csp = X264_CSP_I400;
    else if( csp >= X264_CSP_I420 && csp < X264_CSP_I422 )
        internal_csp = X264_CSP_NV12; /* 4:2:0 family */
    else if( csp >= X264_CSP_I422 && csp < X264_CSP_I444 )
        internal_csp = X264_CSP_NV16; /* 4:2:2 family */
    else if( csp >= X264_CSP_I444 && csp <= X264_CSP_RGB )
        internal_csp = X264_CSP_I444; /* 4:4:4 family, including the RGB variants */

    return internal_csp;
}
/* Allocate and initialise a new frame for encoder `h`.
 *
 * b_fdec != 0: the frame additionally gets the per-macroblock type/partition/
 *              mv/ref arrays, per-row rate-control arrays and (for ESA and
 *              above) the integral-image buffer.
 * b_fdec == 0: the frame additionally gets the lowres planes, lowres
 *              mv/cost arrays and the AQ offset arrays, depending on
 *              h->frames.b_have_lowres and h->param.rc.i_aq_mode.
 *
 * All variable-size buffers are carved out of one allocation owned by
 * frame->base (PREALLOC / PREALLOC_END).
 *
 * Returns the new frame, or NULL on allocation failure, on an unsupported
 * colorspace, or when the frame's mutex/condition variable cannot be created.
 * Nothing is leaked on failure. */
static x264_frame_t *frame_new( x264_t *h, int b_fdec )
{
    x264_frame_t *frame;
    int i_csp = frame_internal_csp( h->param.i_csp );
    int i_mb_count = h->mb.i_mb_count;
    int i_stride, i_width, i_lines, luma_plane_count;
    int i_padv = PADV << PARAM_INTERLACED;
    int align = NATIVE_ALIGN / SIZEOF_PIXEL;
#if ARCH_X86 || ARCH_X86_64
    /* On x86 the stride alignment follows the runtime CPU flags. */
    if( h->param.cpu&X264_CPU_CACHELINE_64 || h->param.cpu&X264_CPU_AVX512 )
        align = 64 / SIZEOF_PIXEL;
    else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX )
        align = 32 / SIZEOF_PIXEL;
    else
        align = 16 / SIZEOF_PIXEL;
#endif
#if ARCH_PPC
    int disalign = (1<<9) / SIZEOF_PIXEL;
#else
    int disalign = (1<<10) / SIZEOF_PIXEL;
#endif

    CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
    PREALLOC_INIT

    /* frame dimensions in pixels, rounded up to whole macroblocks */
    i_width  = h->mb.i_mb_width*16;
    i_lines  = h->mb.i_mb_height*16;
    i_stride = align_stride( i_width + PADH2, align, disalign );

    if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
    {
        luma_plane_count = 1;
        frame->i_plane = 2;
        for( int i = 0; i < 2; i++ )
        {
            frame->i_width[i] = i_width >> i;
            frame->i_lines[i] = i_lines >> (i && i_csp == X264_CSP_NV12);
            frame->i_stride[i] = i_stride;
        }
    }
    else if( i_csp == X264_CSP_I444 )
    {
        luma_plane_count = 3;
        frame->i_plane = 3;
        for( int i = 0; i < 3; i++ )
        {
            frame->i_width[i] = i_width;
            frame->i_lines[i] = i_lines;
            frame->i_stride[i] = i_stride;
        }
    }
    else if( i_csp == X264_CSP_I400 )
    {
        luma_plane_count = 1;
        frame->i_plane = 1;
        frame->i_width[0] = i_width;
        frame->i_lines[0] = i_lines;
        frame->i_stride[0] = i_stride;
    }
    else
        goto fail;

    frame->i_csp = i_csp;
    frame->i_width_lowres = frame->i_width[0]/2;
    frame->i_lines_lowres = frame->i_lines[0]/2;
    frame->i_stride_lowres = align_stride( frame->i_width_lowres + PADH2, align, disalign<<1 );

    for( int i = 0; i < h->param.i_bframe + 2; i++ )
        for( int j = 0; j < h->param.i_bframe + 2; j++ )
            PREALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );

    frame->i_poc = -1;
    frame->i_type = X264_TYPE_AUTO;
    frame->i_qpplus1 = X264_QP_AUTO;
    frame->i_pts = -1;
    frame->i_frame = -1;
    frame->i_frame_num = -1;
    frame->i_lines_completed = -1;
    frame->b_fdec = b_fdec;
    frame->i_pic_struct = PIC_STRUCT_AUTO;
    frame->i_field_cnt = -1;
    frame->i_duration =
    frame->i_cpb_duration =
    frame->i_dpb_output_delay =
    frame->i_cpb_delay = 0;
    frame->i_coded_fields_lookahead =
    frame->i_cpb_delay_lookahead = -1;

    frame->orig = frame;

    if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
    {
        int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
        int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*chroma_padv));
        PREALLOC( frame->buffer[1], chroma_plane_size * SIZEOF_PIXEL );
        if( PARAM_INTERLACED )
            PREALLOC( frame->buffer_fld[1], chroma_plane_size * SIZEOF_PIXEL );
    }

    /* all 4 luma planes allocated together, since the cacheline split code
     * requires them to be in-phase wrt cacheline alignment. */
    for( int p = 0; p < luma_plane_count; p++ )
    {
        int64_t luma_plane_size = align_plane_size( frame->i_stride[p] * (frame->i_lines[p] + 2*i_padv), disalign );
        if( h->param.analyse.i_subpel_refine && b_fdec )
            luma_plane_size *= 4;

        /* FIXME: Don't allocate both buffers in non-adaptive MBAFF. */
        PREALLOC( frame->buffer[p], luma_plane_size * SIZEOF_PIXEL );
        if( PARAM_INTERLACED )
            PREALLOC( frame->buffer_fld[p], luma_plane_size * SIZEOF_PIXEL );
    }

    frame->b_duplicate = 0;

    if( b_fdec ) /* fdec frame */
    {
        PREALLOC( frame->mb_type, i_mb_count * sizeof(int8_t) );
        PREALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t) );
        PREALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
        PREALLOC( frame->mv16x16, 2*(i_mb_count+1) * sizeof(int16_t) );
        PREALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
        if( h->param.i_bframe )
        {
            PREALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
            PREALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
        }
        else
        {
            frame->mv[1]  = NULL;
            frame->ref[1] = NULL;
        }
        PREALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
        PREALLOC( frame->f_row_qp, i_lines/16 * sizeof(float) );
        PREALLOC( frame->f_row_qscale, i_lines/16 * sizeof(float) );
        if( h->param.analyse.i_me_method >= X264_ME_ESA )
            PREALLOC( frame->buffer[3], frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
        if( PARAM_INTERLACED )
            PREALLOC( frame->field, i_mb_count * sizeof(uint8_t) );
        if( h->param.analyse.b_mb_info )
            PREALLOC( frame->effective_qp, i_mb_count * sizeof(uint8_t) );
    }
    else /* fenc frame */
    {
        if( h->frames.b_have_lowres )
        {
            int64_t luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );

            PREALLOC( frame->buffer_lowres, 4 * luma_plane_size * SIZEOF_PIXEL );

            for( int j = 0; j <= !!h->param.i_bframe; j++ )
                for( int i = 0; i <= h->param.i_bframe; i++ )
                {
                    PREALLOC( frame->lowres_mvs[j][i], 2*i_mb_count*sizeof(int16_t) );
                    PREALLOC( frame->lowres_mv_costs[j][i], i_mb_count*sizeof(int) );
                }
            PREALLOC( frame->i_propagate_cost, i_mb_count * sizeof(uint16_t) );
            for( int j = 0; j <= h->param.i_bframe+1; j++ )
                for( int i = 0; i <= h->param.i_bframe+1; i++ )
                    PREALLOC( frame->lowres_costs[j][i], i_mb_count * sizeof(uint16_t) );
        }
        if( h->param.rc.i_aq_mode )
        {
            PREALLOC( frame->f_qp_offset, i_mb_count * sizeof(float) );
            PREALLOC( frame->f_qp_offset_aq, i_mb_count * sizeof(float) );
            if( h->frames.b_have_lowres )
                PREALLOC( frame->i_inv_qscale_factor, i_mb_count * sizeof(uint16_t) );
        }

        /* mbtree asm can overread the input buffers, make sure we don't read outside of allocated memory. */
        if( h->frames.b_have_lowres )
            prealloc_size += NATIVE_ALIGN;
    }

    PREALLOC_END( frame->base );

    /* From here on the PREALLOC'd pointers are usable; derive the plane
     * pointers, which point past the top/left padding of each buffer. */
    if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
    {
        int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
        frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH_ALIGN;
        if( PARAM_INTERLACED )
            frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH_ALIGN;
    }

    for( int p = 0; p < luma_plane_count; p++ )
    {
        int64_t luma_plane_size = align_plane_size( frame->i_stride[p] * (frame->i_lines[p] + 2*i_padv), disalign );
        if( h->param.analyse.i_subpel_refine && b_fdec )
        {
            for( int i = 0; i < 4; i++ )
            {
                frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH_ALIGN;
                if( PARAM_INTERLACED )
                    frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH_ALIGN;
            }
            frame->plane[p] = frame->filtered[p][0];
            frame->plane_fld[p] = frame->filtered_fld[p][0];
        }
        else
        {
            frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH_ALIGN;
            if( PARAM_INTERLACED )
                frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH_ALIGN;
        }
    }

    if( b_fdec )
    {
        /* The mv16x16 array was allocated with one extra leading entry:
         * zero it and advance the pointer so that index -1 is valid. */
        M32( frame->mv16x16[0] ) = 0;
        frame->mv16x16++;

        if( h->param.analyse.i_me_method >= X264_ME_ESA )
            frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH_ALIGN;
    }
    else
    {
        if( h->frames.b_have_lowres )
        {
            int64_t luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
            for( int i = 0; i < 4; i++ )
                frame->lowres[i] = frame->buffer_lowres + frame->i_stride_lowres * PADV + PADH_ALIGN + i * luma_plane_size;

            for( int j = 0; j <= !!h->param.i_bframe; j++ )
                for( int i = 0; i <= h->param.i_bframe; i++ )
                    memset( frame->lowres_mvs[j][i], 0, 2*i_mb_count*sizeof(int16_t) );

            frame->i_intra_cost = frame->lowres_costs[0][0];
            memset( frame->i_intra_cost, -1, i_mb_count * sizeof(uint16_t) );

            if( h->param.rc.i_aq_mode )
                /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
                memset( frame->i_inv_qscale_factor, 0, i_mb_count * sizeof(uint16_t) );
        }
    }

    if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
        goto fail;
    if( x264_pthread_cond_init( &frame->cv, NULL ) )
    {
        /* Don't leave the already-created mutex behind. */
        x264_pthread_mutex_destroy( &frame->mutex );
        goto fail;
    }

#if HAVE_OPENCL
    frame->opencl.ocl = h->opencl.ocl;
#endif

    return frame;

fail:
    /* frame is NULL if the very first allocation failed. Otherwise the struct
     * was zero-initialised, so frame->base is either NULL or the single
     * PREALLOC_END allocation, which must be released with the frame.
     * (Relies on x264_free( NULL ) being a no-op, as the original
     * x264_free( frame ) on this path already did.) */
    if( frame )
        x264_free( frame->base );
    x264_free( frame );
    return NULL;
}
/* Destroy a frame and everything it owns.
 * Duplicate frames are blank copies of real frames (including pointers),
 * so only the struct itself is released for them; freeing the shared
 * pointers would cause a double free later. */
void x264_frame_delete( x264_frame_t *frame )
{
    if( frame->b_duplicate )
    {
        x264_free( frame );
        return;
    }

    x264_free( frame->base );

    /* Per-frame parameter override supplied by the user, with its own destructor. */
    x264_param_t *param = frame->param;
    if( param && param->param_free )
    {
        x264_param_cleanup( param );
        param->param_free( param );
    }

    if( frame->mb_info_free )
        frame->mb_info_free( frame->mb_info );

    /* User SEI: release every payload, then the payload array. */
    x264_sei_t *sei = &frame->extra_sei;
    if( sei->sei_free )
    {
        for( int i = 0; i < sei->num_payloads; i++ )
            sei->sei_free( sei->payloads[i].payload );
        sei->sei_free( sei->payloads );
    }

    x264_pthread_mutex_destroy( &frame->mutex );
    x264_pthread_cond_destroy( &frame->cv );
#if HAVE_OPENCL
    x264_opencl_frame_delete( frame );
#endif

    x264_free( frame );
}
/* Fetch the start pointer and stride of one plane of an input picture.
 *
 * xshift/yshift scale the encoder's configured width/height down to this
 * plane's dimensions. With X264_CSP_VFLIP set, *pix points at the last row
 * and *stride is negated, so walking forward visits rows bottom-up.
 * *pix and *stride are written even when the check below fails.
 *
 * Returns 0 on success, -1 (after logging) if the plane width exceeds |stride|.
 * NOTE(review): width is a pixel count while stride appears to be in bytes
 * (callers divide it by SIZEOF_PIXEL), so this check looks lenient for
 * high-bit-depth input - confirm. */
static int get_plane_ptr( x264_t *h, x264_picture_t *src, uint8_t **pix, int *stride, int plane, int xshift, int yshift )
{
    int plane_width  = h->param.i_width  >> xshift;
    int plane_height = h->param.i_height >> yshift;
    uint8_t *start   = src->img.plane[plane];
    int row_step     = src->img.i_stride[plane];

    if( src->img.i_csp & X264_CSP_VFLIP )
    {
        start += (plane_height-1) * row_step;
        row_step = -row_step;
    }
    *pix    = start;
    *stride = row_step;

    if( plane_width > abs(row_step) )
    {
        x264_log( h, X264_LOG_ERROR, "Input picture width (%d) is greater than stride (%d)\n", plane_width, row_step );
        return -1;
    }
    return 0;
}

/* From here on, get_plane_ptr(...) is a statement that makes the *calling*
 * function return -1 when the lookup fails. */
#define get_plane_ptr(...) do { if( get_plane_ptr(__VA_ARGS__) < 0 ) return -1; } while( 0 )
/* Copy a user-supplied picture into an internal frame.
 *
 * Validates the input colorspace, bit depth and forced frame type, copies the
 * per-picture metadata (type, qp, pts, param, pic_struct, SEI, opaque, mb_info)
 * into dst, then converts the pixel data into dst's internal plane layout
 * using the h->mc.plane_copy* function pointers.
 *
 * Returns 0 on success, -1 on an invalid/unsupported input (an error is logged).
 * An out-of-range frame type only produces a warning and falls back to
 * X264_TYPE_AUTO. */
int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
{
int i_csp = src->img.i_csp & X264_CSP_MASK;
/* The destination frame was allocated for one internal layout; the input must map onto it. */
if( dst->i_csp != frame_internal_csp( i_csp ) )
{
x264_log( h, X264_LOG_ERROR, "Invalid input colorspace\n" );
return -1;
}
#if HIGH_BIT_DEPTH
if( !(src->img.i_csp & X264_CSP_HIGH_DEPTH) )
{
x264_log( h, X264_LOG_ERROR, "This build of x264 requires high depth input. Rebuild to support 8-bit input.\n" );
return -1;
}
#else
if( src->img.i_csp & X264_CSP_HIGH_DEPTH )
{
x264_log( h, X264_LOG_ERROR, "This build of x264 requires 8-bit input. Rebuild to support high depth input.\n" );
return -1;
}
#endif
if( BIT_DEPTH != 10 && i_csp == X264_CSP_V210 )
{
x264_log( h, X264_LOG_ERROR, "v210 input is only compatible with bit-depth of 10 bits\n" );
return -1;
}
/* Unknown forced frame types are not fatal: warn and let the encoder decide. */
if( src->i_type < X264_TYPE_AUTO || src->i_type > X264_TYPE_KEYFRAME )
{
x264_log( h, X264_LOG_WARNING, "forced frame type (%d) at %d is unknown\n", src->i_type, h->frames.i_input );
dst->i_forced_type = X264_TYPE_AUTO;
}
else
dst->i_forced_type = src->i_type;
/* Per-picture metadata. Pointers (param, extra_sei, opaque, mb_info) are copied, not duplicated. */
dst->i_type = dst->i_forced_type;
dst->i_qpplus1 = src->i_qpplus1;
dst->i_pts = dst->i_reordered_pts = src->i_pts;
dst->param = src->param;
dst->i_pic_struct = src->i_pic_struct;
dst->extra_sei = src->extra_sei;
dst->opaque = src->opaque;
/* mb_info is only taken over when the feature is enabled in the encoder parameters. */
dst->mb_info = h->param.analyse.b_mb_info ? src->prop.mb_info : NULL;
dst->mb_info_free = h->param.analyse.b_mb_info ? src->prop.mb_info_free : NULL;
uint8_t *pix[3];
int stride[3];
/* Packed 4:2:2: split the single input plane into dst planes 0 and 1.
 * For UYVY (p == 1) the two destination planes are passed in swapped order.
 * NOTE(review): this branch and the V210 one neither honour X264_CSP_VFLIP
 * nor check the input stride - confirm that is intended. */
if( i_csp == X264_CSP_YUYV || i_csp == X264_CSP_UYVY )
{
int p = i_csp == X264_CSP_UYVY;
h->mc.plane_copy_deinterleave_yuyv( dst->plane[p], dst->i_stride[p], dst->plane[p^1], dst->i_stride[p^1],
(pixel*)src->img.plane[0], src->img.i_stride[0]/SIZEOF_PIXEL, h->param.i_width, h->param.i_height );
}
else if( i_csp == X264_CSP_V210 )
{
stride[0] = src->img.i_stride[0];
pix[0] = src->img.plane[0];
/* v210 is addressed in 32-bit words, hence the stride conversion. */
h->mc.plane_copy_deinterleave_v210( dst->plane[0], dst->i_stride[0],
dst->plane[1], dst->i_stride[1],
(uint32_t *)pix[0], stride[0]/(int)sizeof(uint32_t), h->param.i_width, h->param.i_height );
}
/* Packed RGB family (BGR, BGRA, RGB): one input plane, split into three dst planes. */
else if( i_csp >= X264_CSP_BGR )
{
stride[0] = src->img.i_stride[0];
pix[0] = src->img.plane[0];
if( src->img.i_csp & X264_CSP_VFLIP )
{
pix[0] += (h->param.i_height-1) * stride[0];
stride[0] = -stride[0];
}
/* b selects which of dst planes 1 and 2 comes first in the argument list,
 * so that RGB and BGR byte order end up in the same destination planes. */
int b = i_csp==X264_CSP_RGB;
h->mc.plane_copy_deinterleave_rgb( dst->plane[1+b], dst->i_stride[1+b],
dst->plane[0], dst->i_stride[0],
dst->plane[2-b], dst->i_stride[2-b],
(pixel*)pix[0], stride[0]/SIZEOF_PIXEL, i_csp==X264_CSP_BGRA ? 4 : 3, h->param.i_width, h->param.i_height );
}
/* Planar / semi-planar YUV. get_plane_ptr here is the macro wrapper: it
 * returns -1 from this function if a plane's width exceeds its stride. */
else
{
int v_shift = CHROMA_V_SHIFT;
get_plane_ptr( h, src, &pix[0], &stride[0], 0, 0, 0 );
h->mc.plane_copy( dst->plane[0], dst->i_stride[0], (pixel*)pix[0],
stride[0]/SIZEOF_PIXEL, h->param.i_width, h->param.i_height );
/* Already interleaved chroma in the internal order: straight copy. */
if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
{
get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift );
h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
stride[1]/SIZEOF_PIXEL, h->param.i_width, h->param.i_height>>v_shift );
}
/* Interleaved chroma in the opposite order: copy while swapping each pair. */
else if( i_csp == X264_CSP_NV21 )
{
get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift );
h->mc.plane_copy_swap( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
stride[1]/SIZEOF_PIXEL, h->param.i_width>>1, h->param.i_height>>v_shift );
}
/* Separate chroma planes: interleave them into dst plane 1.
 * The YV12/YV16 variants take input planes 1 and 2 in swapped order. */
else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_I422 || i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16 )
{
int uv_swap = i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16;
get_plane_ptr( h, src, &pix[1], &stride[1], uv_swap ? 2 : 1, 1, v_shift );
get_plane_ptr( h, src, &pix[2], &stride[2], uv_swap ? 1 : 2, 1, v_shift );
h->mc.plane_copy_interleave( dst->plane[1], dst->i_stride[1],
(pixel*)pix[1], stride[1]/SIZEOF_PIXEL,
(pixel*)pix[2], stride[2]/SIZEOF_PIXEL,
h->param.i_width>>1, h->param.i_height>>v_shift );
}
/* 4:4:4: three full-size planes copied individually; YV24 swaps input planes 1 and 2. */
else if( i_csp == X264_CSP_I444 || i_csp == X264_CSP_YV24 )
{
get_plane_ptr( h, src, &pix[1], &stride[1], i_csp==X264_CSP_I444 ? 1 : 2, 0, 0 );
get_plane_ptr( h, src, &pix[2], &stride[2], i_csp==X264_CSP_I444 ? 2 : 1, 0, 0 );
h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
stride[1]/SIZEOF_PIXEL, h->param.i_width, h->param.i_height );
h->mc.plane_copy( dst->plane[2], dst->i_stride[2], (pixel*)pix[2],
stride[2]/SIZEOF_PIXEL, h->param.i_width, h->param.i_height );
}
}
return 0;
}
/* Fill `len` consecutive elements at dst with the single element found at src.
 *
 * size is the element size in bytes; the code distinguishes size 1, size 2
 * and anything larger (for which a 4-byte pattern is read from src).
 * The pattern is replicated into 2-, 4- and (when WORD_SIZE == 8) 8-byte
 * words so that most of the fill is done with wide stores.
 * NOTE(review): the leading alignment stores are not bounded by len, so
 * callers presumably never pass a very small len to an unaligned dst - confirm. */
static ALWAYS_INLINE void pixel_memset( pixel *dst, pixel *src, int len, int size )
{
uint8_t *dstp = (uint8_t*)dst;
/* v1/v2/v4: the source element replicated to 1, 2 and 4 bytes. */
uint32_t v1 = *src;
uint32_t v2 = size == 1 ? v1 + (v1 << 8) : M16( src );
uint32_t v4 = size <= 2 ? v2 + (v2 << 16) : M32( src );
int i = 0;
/* From here on len and i are byte counts. */
len *= size;
/* Align the destination pointer if it isn't already */
if( (intptr_t)dstp & (WORD_SIZE - 1) )
{
if( size <= 2 && ((intptr_t)dstp & 3) )
{
/* A lone leading byte is only possible for 1-byte elements. */
if( size == 1 && ((intptr_t)dstp & 1) )
dstp[i++] = v1;
if( (intptr_t)dstp & 2 )
{
M16( dstp+i ) = v2;
i += 2;
}
}
if( WORD_SIZE == 8 && (intptr_t)dstp & 4 )
{
M32( dstp+i ) = v4;
i += 4;
}
}
/* Main copy loop */
if( WORD_SIZE == 8 )
{
uint64_t v8 = v4 + ((uint64_t)v4<<32);
for( ; i < len - 7; i+=8 )
M64( dstp+i ) = v8;
}
/* 4-byte stores: the whole main loop on 32-bit words, at most one leftover store otherwise. */
for( ; i < len - 3; i+=4 )
M32( dstp+i ) = v4;
/* Finish up the last few bytes */
if( size <= 2 )
{
if( i < len - 1 )
{
M16( dstp+i ) = v2;
i += 2;
}
if( size == 1 && i != len )
dstp[i] = v1;
}
}
/* Replicate the edge pixels of a plane region into its surrounding padding.
 *
 * pix points at the top-left pixel of an i_width x i_height region with row
 * pitch i_stride. Every row gets i_padh pixels filled on each side; if
 * b_pad_top / b_pad_bottom are set, the (already side-padded) first / last
 * row is additionally copied i_padv times above / below the region.
 * With b_chroma set, the horizontal fill works on pairs of pixels, so the
 * last pair of the row is what gets repeated. */
static ALWAYS_INLINE void plane_expand_border( pixel *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom, int b_chroma )
{
    int fill_count  = i_padh >> b_chroma;
    int fill_size   = SIZEOF_PIXEL << b_chroma;
    int padded_size = (i_width + 2*i_padh) * SIZEOF_PIXEL;

    /* left and right bands */
    for( int y = 0; y < i_height; y++ )
    {
        pixel *row = pix + y*i_stride;
        pixel_memset( row - i_padh, row, fill_count, fill_size );
        pixel_memset( row + i_width, row + i_width-1-b_chroma, fill_count, fill_size );
    }

    /* upper band: copies of the first padded row */
    if( b_pad_top )
    {
        pixel *first_row = pix - i_padh;
        for( int y = 0; y < i_padv; y++ )
            memcpy( first_row + (-y-1)*i_stride, first_row, padded_size );
    }

    /* lower band: copies of the last padded row */
    if( b_pad_bottom )
    {
        pixel *last_row = pix - i_padh + (i_height-1)*i_stride;
        for( int y = 0; y < i_padv; y++ )
            memcpy( pix - i_padh + (i_height+y)*i_stride, last_row, padded_size );
    }
}
/* Extend the borders of every plane of `frame` for the macroblock row mb_y.
 *
 * The top / bottom padding is only generated for the first / last row of the
 * frame. With SLICE_MBAFF set, odd rows return immediately and each even row
 * handles both fields (via plane_fld, at twice the stride) and then the
 * frame plane itself. */
void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y )
{
int pad_top = mb_y == 0;
int pad_bot = mb_y == h->mb.i_mb_height - (1 << SLICE_MBAFF);
/* First / last row of this thread's slice (as opposed to the whole frame). */
int b_start = mb_y == h->i_threadslice_start;
int b_end = mb_y == h->i_threadslice_end - (1 << SLICE_MBAFF);
/* In MBAFF only even rows do work. */
if( mb_y & SLICE_MBAFF )
return;
for( int i = 0; i < frame->i_plane; i++ )
{
/* Chroma subsampling shifts apply to every plane except plane 0. */
int h_shift = i && CHROMA_H_SHIFT;
int v_shift = i && CHROMA_V_SHIFT;
int stride = frame->i_stride[i];
int width = 16*h->mb.i_mb_width;
/* On the last row, cover everything that remains down to the bottom of the frame. */
int height = (pad_bot ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> v_shift;
int padh = PADH;
int padv = PADV >> v_shift;
// buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
if( b_end && !b_start )
height += 4 >> (v_shift + SLICE_MBAFF);
pixel *pix;
/* Except on the first row of the slice, start 4 luma lines higher (see the buffer note above). */
int starty = 16*mb_y - 4*!b_start;
if( SLICE_MBAFF )
{
// border samples for each field are extended separately
pix = frame->plane_fld[i] + (starty*stride >> v_shift);
plane_expand_border( pix, stride*2, width, height, padh, padv, pad_top, pad_bot, h_shift );
plane_expand_border( pix+stride, stride*2, width, height, padh, padv, pad_top, pad_bot, h_shift );
/* Then the frame (non-field) plane, which covers a whole row pair: recompute the height. */
height = (pad_bot ? 16*(h->mb.i_mb_height - mb_y) : 32) >> v_shift;
if( b_end && !b_start )
height += 4 >> v_shift;
pix = frame->plane[i] + (starty*stride >> v_shift);
plane_expand_border( pix, stride, width, height, padh, padv, pad_top, pad_bot, h_shift );
}
else
{
pix = frame->plane[i] + (starty*stride >> v_shift);
plane_expand_border( pix, stride, width, height, padh, padv, pad_top, pad_bot, h_shift );
}
}
}
/* Extend the borders of the filtered planes (filtered[p][1..3]; index 0 is
 * skipped here) for the macroblock row mb_y.
 * All three planes are processed when CHROMA444 is set, otherwise only plane 0.
 * b_end marks the last row: the remaining lines and the bottom padding are
 * then handled as well. */
void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
{
/* During filtering, 8 extra pixels were filtered on each edge,
 * but up to 3 of the horizontal ones may be wrong.
 * We want to expand the border from the last filtered pixel, hence the
 * region that is 4 pixels wider on each side (width + 8, pix - 4) and the
 * correspondingly reduced padding. */
int b_start = !mb_y;
int width = 16*h->mb.i_mb_width + 8;
int height = b_end ? (16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF) + 16 : 16;
int padh = PADH - 4;
int padv = PADV - 8;
for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
for( int i = 1; i < 4; i++ )
{
int stride = frame->i_stride[p];
// buffer: 8 luma, to match the hpel filter
pixel *pix;
if( SLICE_MBAFF )
{
/* Both fields separately, each at twice the stride. */
pix = frame->filtered_fld[p][i] + (16*mb_y - 16) * stride - 4;
plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, 0 );
plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, 0 );
}
/* The frame plane is always processed; in MBAFF its height is doubled. */
pix = frame->filtered[p][i] + (16*mb_y - 8) * stride - 4;
plane_expand_border( pix, stride, width, height << SLICE_MBAFF, padh, padv, b_start, b_end, 0 );
}
}
/* Pad all four lowres planes of a frame on every side (PADH x PADV). */
void x264_frame_expand_border_lowres( x264_frame_t *frame )
{
    int stride = frame->i_stride_lowres;
    int width  = frame->i_width_lowres;
    int lines  = frame->i_lines_lowres;

    for( int plane = 0; plane < 4; plane++ )
        plane_expand_border( frame->lowres[plane], stride, width, lines, PADH, PADV, 1, 1, 0 );
}
/* Pad one whole chroma plane of `frame` on every side.
 * The plane height and the vertical padding are scaled by CHROMA_V_SHIFT;
 * CHROMA_H_SHIFT is passed on as plane_expand_border's b_chroma argument. */
void x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane )
{
    int v_shift = CHROMA_V_SHIFT;
    int width   = 16*h->mb.i_mb_width;
    int height  = 16*h->mb.i_mb_height>>v_shift;
    int padv    = PADV>>v_shift;

    plane_expand_border( frame->plane[plane], frame->i_stride[plane], width, height,
                         PADH, padv, 1, 1, CHROMA_H_SHIFT );
}
/* Extend the picture from the configured width/height up to the
 * macroblock-aligned (multiple of 16) size by replicating edge pixels:
 * first each row to the right, then the bottom rows downwards.
 * When PARAM_INTERLACED is set, every added row is copied from the last
 * source row whose line parity matches its own. */
void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
{
    for( int i = 0; i < frame->i_plane; i++ )
    {
        /* Chroma subsampling shifts apply to every plane except plane 0. */
        int h_shift    = i && CHROMA_H_SHIFT;
        int v_shift    = i && CHROMA_V_SHIFT;
        int src_width  = h->param.i_width;
        int src_height = h->param.i_height >> v_shift;
        int extra_cols = (h->mb.i_mb_width * 16 - h->param.i_width);
        int extra_rows = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
        int stride     = frame->i_stride[i];
        pixel *plane   = frame->plane[i];

        if( extra_cols )
            for( int y = 0; y < src_height; y++ )
            {
                pixel *row_end = &plane[y*stride + src_width];
                pixel_memset( row_end, row_end - 1-h_shift,
                              extra_cols>>h_shift, SIZEOF_PIXEL<<h_shift );
            }

        if( extra_rows )
            for( int y = src_height; y < src_height + extra_rows; y++ )
            {
                int src_row = src_height-(~y&PARAM_INTERLACED)-1;
                memcpy( &plane[y*stride], &plane[src_row*stride],
                        (src_width + extra_cols) * SIZEOF_PIXEL );
            }
    }
}
/* For the 16-pixel-wide column starting at macroblock column mb_x of h->fenc,
 * fill the rows between the configured picture height and the
 * macroblock-aligned height with copies of the last picture row.
 * mb_y is not used. */
void x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y )
{
    for( int i = 0; i < h->fenc->i_plane; i++ )
    {
        int v_shift    = i && CHROMA_V_SHIFT;
        int stride     = h->fenc->i_stride[i];
        int src_height = h->param.i_height >> v_shift;
        int extra_rows = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
        pixel *column  = h->fenc->plane[i] + 16*mb_x;
        pixel *last    = column + (src_height-1)*stride;

        for( int y = src_height; y < src_height + extra_rows; y++ )
            memcpy( column + y*stride, last, 16*SIZEOF_PIXEL );
    }
}
/* threading */
/* Publish how many lines of `frame` are complete and wake every thread
 * blocked in x264_frame_cond_wait() on this frame. */
void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
{
    x264_pthread_mutex_lock( &frame->mutex );

    frame->i_lines_completed = i_lines_completed;
    x264_pthread_cond_broadcast( &frame->cv );

    x264_pthread_mutex_unlock( &frame->mutex );
}
/* Block until at least i_lines_completed lines of `frame` have been reported
 * complete, and return the frame's completed-line count seen at that point.
 * A negative request never waits. */
int x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
{
    int completed;

    x264_pthread_mutex_lock( &frame->mutex );
    for( ;; )
    {
        completed = frame->i_lines_completed;
        if( completed >= i_lines_completed || i_lines_completed < 0 )
            break;
        x264_pthread_cond_wait( &frame->cv, &frame->mutex );
    }
    x264_pthread_mutex_unlock( &frame->mutex );

    return completed;
}
/* Record the pass reached by this slice thread. Waiters are only woken for
 * a positive pass; storing a pass <= 0 signals nobody. */
void x264_threadslice_cond_broadcast( x264_t *h, int pass )
{
    x264_pthread_mutex_lock( &h->mutex );

    h->i_threadslice_pass = pass;
    if( pass > 0 )
        x264_pthread_cond_broadcast( &h->cv );

    x264_pthread_mutex_unlock( &h->mutex );
}
/* Block until h->i_threadslice_pass has reached at least `pass`. */
void x264_threadslice_cond_wait( x264_t *h, int pass )
{
    x264_pthread_mutex_lock( &h->mutex );

    while( h->i_threadslice_pass < pass )
        x264_pthread_cond_wait( &h->cv, &h->mutex );

    x264_pthread_mutex_unlock( &h->mutex );
}
/* Account for one more slice in `frame`.
 * Returns 0 if the slice may be started, or -1 if the frame had already
 * reached h->param.i_slice_count_max slices. The counter is incremented in
 * either case. With no limit configured, nothing is counted and 0 is returned. */
int x264_frame_new_slice( x264_t *h, x264_frame_t *frame )
{
    if( !h->param.i_slice_count_max )
        return 0;

    /* With sliced threads several threads bump the counter concurrently. */
    int slices_before;
    if( h->param.b_sliced_threads )
        slices_before = x264_pthread_fetch_and_add( &frame->i_slice_count, 1, &frame->mutex );
    else
        slices_before = frame->i_slice_count++;

    return slices_before >= h->param.i_slice_count_max ? -1 : 0;
}
/* list operators */
/* Append `frame` at the end of a NULL-terminated frame list.
 * The terminator itself is overwritten; the caller's array must have a
 * further NULL slot after it (no capacity check is done here). */
void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
{
    x264_frame_t **slot = list;
    while( *slot )
        slot++;
    *slot = frame;
}
/* Remove and return the last frame of a NULL-terminated list.
 * The list must not be empty. */
x264_frame_t *x264_frame_pop( x264_frame_t **list )
{
    assert( list[0] );

    int last = 0;
    for( ; list[last+1]; last++ )
        ;

    x264_frame_t *frame = list[last];
    list[last] = NULL;
    return frame;
}
/* Insert `frame` at the front of a NULL-terminated list, moving every
 * existing entry up by one slot. */
void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
{
    int count = 0;
    while( list[count] )
        count++;

    /* Move from the back so that no entry is overwritten before it is copied. */
    for( int i = count; i > 0; i-- )
        list[i] = list[i-1];

    list[0] = frame;
}
/* Remove and return the first frame of a NULL-terminated list, moving the
 * remaining entries (and the terminator) down by one slot.
 * The list must not be empty. */
x264_frame_t *x264_frame_shift( x264_frame_t **list )
{
    x264_frame_t *frame = list[0];

    int i = 0;
    while( list[i] )
    {
        list[i] = list[i+1];
        i++;
    }

    assert(frame);
    return frame;
}
/* Drop one reference to `frame`. When the last reference goes away, the
 * frame is put on the unused list selected by its b_fdec flag for reuse. */
void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
{
    assert( frame->i_reference_count > 0 );

    if( --frame->i_reference_count == 0 )
        x264_frame_push( h->frames.unused[frame->b_fdec], frame );
}
/* Obtain a frame of the requested kind (b_fdec selects the pool): a recycled
 * one from the unused list if available, otherwise a freshly allocated one.
 * The per-use state is reset and the reference count is set to 1.
 * Returns NULL if a new frame was needed and could not be allocated. */
x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
{
    x264_frame_t **pool = h->frames.unused[b_fdec];
    x264_frame_t *frame = pool[0] ? x264_frame_pop( pool )
                                  : frame_new( h, b_fdec );
    if( !frame )
        return NULL;

    frame->i_reference_count = 1;
    frame->b_last_minigop_bframe = 0;
    frame->b_intra_calculated = 0;
    frame->b_scenecut = 1;
    frame->b_keyframe = 0;
    frame->b_corrupt = 0;
    /* The slice counter starts at the thread count with sliced threads, at 1 otherwise. */
    frame->i_slice_count = h->param.b_sliced_threads ? h->param.i_threads : 1;

    memset( frame->weight, 0, sizeof(frame->weight) );
    memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );

    return frame;
}
/* Drop one reference to a blank (duplicate) frame. When the last reference
 * goes away, the frame is put on the blank_unused list for reuse. */
void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
{
    assert( frame->i_reference_count > 0 );

    if( --frame->i_reference_count == 0 )
        x264_frame_push( h->frames.blank_unused, frame );
}
/* Obtain a blank frame: a recycled one from the blank_unused list if
 * available, otherwise a newly malloc'ed struct.
 * Only b_duplicate and i_reference_count are set here; a fresh struct is
 * not zeroed, so every other field is left for the caller to fill in.
 * Returns NULL on allocation failure. */
x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
{
    x264_frame_t **pool = h->frames.blank_unused;
    x264_frame_t *frame = pool[0] ? x264_frame_pop( pool )
                                  : x264_malloc( sizeof(x264_frame_t) );
    if( !frame )
        return NULL;

    frame->b_duplicate = 1;
    frame->i_reference_count = 1;
    return frame;
}
/* Apply the weight `w` to an i_width x i_height region of src, writing to dst.
 *
 * The region is processed in horizontal strips of height 16 (this was found
 * to be the optimal height in terms of cache loads). Within a strip,
 * 16-wide blocks are used while more than 8 columns remain, and a single
 * 8-wide block finishes the row if any columns are left. */
void x264_weight_scale_plane( x264_t *h, pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
                              int i_width, int i_height, x264_weight_t *w )
{
    for( int rows_left = i_height; rows_left > 0; rows_left -= 16 )
    {
        int strip_height = X264_MIN( rows_left, 16 );
        int x = 0;

        while( x < i_width-8 )
        {
            w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, strip_height );
            x += 16;
        }
        if( x < i_width )
            w->weightfn[ 8>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, strip_height );

        dst += 16 * i_dst_stride;
        src += 16 * i_src_stride;
    }
}
/* Delete every frame of a NULL-terminated list, then the list array itself.
 * A NULL list is accepted and ignored. */
void x264_frame_delete_list( x264_frame_t **list )
{
    if( !list )
        return;

    for( int i = 0; list[i]; i++ )
        x264_frame_delete( list[i] );

    x264_free( list );
}
/* Initialise a synchronised frame list able to hold up to max_size frames.
 *
 * The backing array gets max_size+1 zeroed slots, so it is always
 * NULL-terminated. Returns 0 on success and -1 if max_size is negative,
 * the array cannot be allocated, or the mutex / condition variables cannot
 * be created.
 *
 * On failure nothing is left allocated or initialised: primitives that were
 * already created are destroyed again, the array is freed and slist->list is
 * reset to NULL. Callers therefore must not call
 * x264_sync_frame_list_delete() on a list whose init failed. */
int x264_sync_frame_list_init( x264_sync_frame_list_t *slist, int max_size )
{
    if( max_size < 0 )
        return -1;
    slist->i_max_size = max_size;
    slist->i_size = 0;
    CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );

    if( x264_pthread_mutex_init( &slist->mutex, NULL ) )
        goto fail_list;
    if( x264_pthread_cond_init( &slist->cv_fill, NULL ) )
        goto fail_mutex;
    if( x264_pthread_cond_init( &slist->cv_empty, NULL ) )
        goto fail_cv_fill;
    return 0;

    /* Unwind in reverse order of construction. */
fail_cv_fill:
    x264_pthread_cond_destroy( &slist->cv_fill );
fail_mutex:
    x264_pthread_mutex_destroy( &slist->mutex );
fail_list:
    x264_free( slist->list );
    slist->list = NULL;
fail:
    return -1;
}
/* Tear down a synchronised frame list: destroy its mutex and both condition
 * variables, then delete every frame still queued and the list array. */
void x264_sync_frame_list_delete( x264_sync_frame_list_t *slist )
{
    /* synchronisation primitives first ... */
    x264_pthread_mutex_destroy( &slist->mutex );
    x264_pthread_cond_destroy( &slist->cv_fill );
    x264_pthread_cond_destroy( &slist->cv_empty );

    /* ... then the frames and their container */
    x264_frame_delete_list( slist->list );
}
/* Append `frame` to a synchronised list, blocking while the list is full.
 * Threads waiting in x264_sync_frame_list_pop() are woken afterwards; the
 * wake-up is issued after the mutex has been released. */
void x264_sync_frame_list_push( x264_sync_frame_list_t *slist, x264_frame_t *frame )
{
    x264_pthread_mutex_lock( &slist->mutex );

    while( slist->i_size == slist->i_max_size )
        x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );

    slist->list[slist->i_size] = frame;
    slist->i_size++;

    x264_pthread_mutex_unlock( &slist->mutex );
    x264_pthread_cond_broadcast( &slist->cv_fill );
}
/* Remove and return the most recently pushed frame of a synchronised list,
 * blocking while the list is empty. Threads waiting for free space in
 * x264_sync_frame_list_push() are woken before the mutex is released. */
x264_frame_t *x264_sync_frame_list_pop( x264_sync_frame_list_t *slist )
{
    x264_pthread_mutex_lock( &slist->mutex );

    while( !slist->i_size )
        x264_pthread_cond_wait( &slist->cv_fill, &slist->mutex );

    int last = slist->i_size - 1;
    x264_frame_t *frame = slist->list[last];
    slist->list[last] = NULL;
    slist->i_size = last;

    x264_pthread_cond_broadcast( &slist->cv_empty );
    x264_pthread_mutex_unlock( &slist->mutex );

    return frame;
}

297
common/frame.h Normal file
View File

@@ -0,0 +1,297 @@
/*****************************************************************************
* frame.h: frame handling
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
* Fiona Glaser <fiona@x264.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_FRAME_H
#define X264_FRAME_H
/* number of pixels past the edge of the frame, for motion estimation/compensation */
#define PADH 32
#define PADV 32
/* Horizontal padding of at least PADH pixels, raised to NATIVE_ALIGN / SIZEOF_PIXEL
 * when that is larger (presumably NATIVE_ALIGN is a byte count - confirm).
 * frame.c offsets plane pointers by this amount from the start of a buffer row. */
#define PADH_ALIGN X264_MAX( PADH, NATIVE_ALIGN / SIZEOF_PIXEL )
/* Sum of the two horizontal paddings; frame.c adds it to the plane width
 * when computing a stride. */
#define PADH2 (PADH_ALIGN + PADH)
typedef struct x264_frame
{
/* */
uint8_t *base; /* Base pointer for all malloced data in this frame. */
int i_poc;
int i_delta_poc[2];
int i_type;
int i_forced_type;
int i_qpplus1;
int64_t i_pts;
int64_t i_dts;
int64_t i_reordered_pts;
int64_t i_duration; /* in SPS time_scale units (i.e 2 * timebase units) used for vfr */
float f_duration; /* in seconds */
int64_t i_cpb_duration;
int64_t i_cpb_delay; /* in SPS time_scale units (i.e 2 * timebase units) */
int64_t i_dpb_output_delay;
x264_param_t *param;
int i_frame; /* Presentation frame number */
int i_coded; /* Coded frame number */
int64_t i_field_cnt; /* Presentation field count */
int i_frame_num; /* 7.4.3 frame_num */
int b_kept_as_ref;
int i_pic_struct;
int b_keyframe;
uint8_t b_fdec;
uint8_t b_last_minigop_bframe; /* this frame is the last b in a sequence of bframes */
uint8_t i_bframes; /* number of bframes following this nonb in coded order */
float f_qp_avg_rc; /* QPs as decided by ratecontrol */
float f_qp_avg_aq; /* QPs as decided by AQ in addition to ratecontrol */
float f_crf_avg; /* Average effective CRF for this frame */
int i_poc_l0ref0; /* poc of first refframe in L0, used to check if direct temporal is possible */
/* YUV buffer */
int i_csp; /* Internal csp */
int i_plane;
int i_stride[3];
int i_width[3];
int i_lines[3];
int i_stride_lowres;
int i_width_lowres;
int i_lines_lowres;
pixel *plane[3];
pixel *plane_fld[3];
pixel *filtered[3][4]; /* plane[0], H, V, HV */
pixel *filtered_fld[3][4];
pixel *lowres[4]; /* half-size copy of input frame: Orig, H, V, HV */
uint16_t *integral;
/* for unrestricted mv we allocate more data than needed
* allocated data are stored in buffer */
pixel *buffer[4];
pixel *buffer_fld[4];
pixel *buffer_lowres;
x264_weight_t weight[X264_REF_MAX][3]; /* [ref_index][plane] */
pixel *weighted[X264_REF_MAX]; /* plane[0] weighted of the reference frames */
int b_duplicate;
struct x264_frame *orig;
/* motion data */
int8_t *mb_type;
uint8_t *mb_partition;
int16_t (*mv[2])[2];
int16_t (*mv16x16)[2];
int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
uint8_t *field;
uint8_t *effective_qp;
/* Stored as (lists_used << LOWRES_COST_SHIFT) + (cost).
* Doesn't need special addressing for intra cost because
* lists_used is guaranteed to be zero in that case. */
uint16_t (*lowres_costs[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
#define LOWRES_COST_MASK ((1<<14)-1)
#define LOWRES_COST_SHIFT 14
int *lowres_mv_costs[2][X264_BFRAME_MAX+1];
int8_t *ref[2];
int i_ref[2];
int ref_poc[2][X264_REF_MAX];
int16_t inv_ref_poc[2]; // inverse values of ref0 poc to avoid divisions in temporal MV prediction
/* for adaptive B-frame decision.
* contains the SATD cost of the lowres frame encoded in various modes
* FIXME: how big an array do we need? */
int i_cost_est[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
int i_cost_est_aq[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
int i_satd; // the i_cost_est of the selected frametype
int i_intra_mbs[X264_BFRAME_MAX+2];
int *i_row_satds[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
int *i_row_satd;
int *i_row_bits;
float *f_row_qp;
float *f_row_qscale;
float *f_qp_offset;
float *f_qp_offset_aq;
int b_intra_calculated;
uint16_t *i_intra_cost;
uint16_t *i_propagate_cost;
uint16_t *i_inv_qscale_factor;
int b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */
float f_weighted_cost_delta[X264_BFRAME_MAX+2];
uint32_t i_pixel_sum[3];
uint64_t i_pixel_ssd[3];
/* hrd */
x264_hrd_t hrd_timing;
/* vbv */
uint8_t i_planned_type[X264_LOOKAHEAD_MAX+1];
int i_planned_satd[X264_LOOKAHEAD_MAX+1];
double f_planned_cpb_duration[X264_LOOKAHEAD_MAX+1];
int64_t i_coded_fields_lookahead;
int64_t i_cpb_delay_lookahead;
/* threading */
int i_lines_completed; /* in pixels */
int i_lines_weighted; /* FIXME: this only supports weighting of one reference frame */
int i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */
x264_pthread_mutex_t mutex;
x264_pthread_cond_t cv;
int i_slice_count; /* Atomically written to/read from with slice threads */
/* periodic intra refresh */
float f_pir_position;
int i_pir_start_col;
int i_pir_end_col;
int i_frames_since_pir;
/* interactive encoder control */
int b_corrupt;
/* user sei */
x264_sei_t extra_sei;
/* user data */
void *opaque;
/* user frame properties */
uint8_t *mb_info;
void (*mb_info_free)( void* );
#if HAVE_OPENCL
x264_frame_opencl_t opencl;
#endif
} x264_frame_t;
/* synchronized frame list */
/* A frame-pointer list stored together with one mutex and two condition
 * variables (cv_fill / cv_empty).  The functions that operate on it are the
 * x264_sync_frame_list_* prototypes declared later in this header. */
typedef struct
{
/* storage for the frame pointers */
x264_frame_t **list;
/* NOTE(review): presumably the capacity of list (cf. the nelem argument of
 * x264_sync_frame_list_init) -- confirm against the implementation */
int i_max_size;
/* NOTE(review): presumably the number of frames currently held -- confirm */
int i_size;
/* NOTE(review): presumably guards list / i_size -- confirm */
x264_pthread_mutex_t mutex;
x264_pthread_cond_t cv_fill; /* event signaling that the list became fuller */
x264_pthread_cond_t cv_empty; /* event signaling that the list became emptier */
} x264_sync_frame_list_t;
/* Signature of a deblocking filter that takes a tc0 array in addition to
 * alpha/beta (used for the non-intra entries of the table below). */
typedef void (*x264_deblock_inter_t)( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
/* Signature of a deblocking filter without a tc0 argument (used for the
 * *_intra entries of the table below). */
typedef void (*x264_deblock_intra_t)( pixel *pix, intptr_t stride, int alpha, int beta );
/* Table of deblocking function pointers.  A pointer to this table is the pf
 * argument of x264_deblock_init( cpu, pf, b_mbaff ) declared further down. */
typedef struct
{
/* NOTE(review): the two-element arrays are presumably indexed by edge
 * direction -- confirm against the users of this table */
x264_deblock_inter_t deblock_luma[2];
x264_deblock_inter_t deblock_chroma[2];
x264_deblock_inter_t deblock_h_chroma_420;
x264_deblock_inter_t deblock_h_chroma_422;
x264_deblock_intra_t deblock_luma_intra[2];
x264_deblock_intra_t deblock_chroma_intra[2];
x264_deblock_intra_t deblock_h_chroma_420_intra;
x264_deblock_intra_t deblock_h_chroma_422_intra;
/* entries with the _mbaff suffix */
x264_deblock_inter_t deblock_luma_mbaff;
x264_deblock_inter_t deblock_chroma_mbaff;
x264_deblock_inter_t deblock_chroma_420_mbaff;
x264_deblock_inter_t deblock_chroma_422_mbaff;
x264_deblock_intra_t deblock_luma_intra_mbaff;
x264_deblock_intra_t deblock_chroma_intra_mbaff;
x264_deblock_intra_t deblock_chroma_420_intra_mbaff;
x264_deblock_intra_t deblock_chroma_422_intra_mbaff;
/* Takes nnz, ref and mv arrays plus mvy_limit / bframe and a bs[2][8][4]
 * array; NOTE(review): bs is presumably the output -- confirm. */
void (*deblock_strength)( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit,
int bframe );
} x264_deblock_function_t;
/* Every function below is declared right after a #define that routes its name
 * through x264_template(); that macro is defined outside this header.  The
 * group headings are derived from the function names only -- the
 * implementations are not visible here. */
/* ---- frame destruction and picture import ---- */
#define x264_frame_delete x264_template(frame_delete)
void x264_frame_delete( x264_frame_t *frame );
#define x264_frame_copy_picture x264_template(frame_copy_picture)
int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src );
/* ---- border expansion ---- */
#define x264_frame_expand_border x264_template(frame_expand_border)
void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y );
#define x264_frame_expand_border_filtered x264_template(frame_expand_border_filtered)
void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
#define x264_frame_expand_border_lowres x264_template(frame_expand_border_lowres)
void x264_frame_expand_border_lowres( x264_frame_t *frame );
#define x264_frame_expand_border_chroma x264_template(frame_expand_border_chroma)
void x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane );
#define x264_frame_expand_border_mod16 x264_template(frame_expand_border_mod16)
void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame );
#define x264_expand_border_mbpair x264_template(expand_border_mbpair)
void x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y );
/* ---- deblocking, filtering and lowres initialisation ---- */
#define x264_frame_deblock_row x264_template(frame_deblock_row)
void x264_frame_deblock_row( x264_t *h, int mb_y );
#define x264_macroblock_deblock x264_template(macroblock_deblock)
void x264_macroblock_deblock( x264_t *h );
#define x264_frame_filter x264_template(frame_filter)
void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
#define x264_frame_init_lowres x264_template(frame_init_lowres)
void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame );
/* Receives the x264_deblock_function_t table to work on as pf. */
#define x264_deblock_init x264_template(deblock_init)
void x264_deblock_init( uint32_t cpu, x264_deblock_function_t *pf, int b_mbaff );
/* ---- condition-variable helpers (frame and slice threads) ---- */
#define x264_frame_cond_broadcast x264_template(frame_cond_broadcast)
void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed );
#define x264_frame_cond_wait x264_template(frame_cond_wait)
int x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed );
#define x264_frame_new_slice x264_template(frame_new_slice)
int x264_frame_new_slice( x264_t *h, x264_frame_t *frame );
#define x264_threadslice_cond_broadcast x264_template(threadslice_cond_broadcast)
void x264_threadslice_cond_broadcast( x264_t *h, int pass );
#define x264_threadslice_cond_wait x264_template(threadslice_cond_wait)
void x264_threadslice_cond_wait( x264_t *h, int pass );
/* ---- plain frame-pointer list operations; these four carry X264_API ---- */
#define x264_frame_push x264_template(frame_push)
X264_API void x264_frame_push( x264_frame_t **list, x264_frame_t *frame );
#define x264_frame_pop x264_template(frame_pop)
X264_API x264_frame_t *x264_frame_pop( x264_frame_t **list );
#define x264_frame_unshift x264_template(frame_unshift)
X264_API void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame );
#define x264_frame_shift x264_template(frame_shift)
X264_API x264_frame_t *x264_frame_shift( x264_frame_t **list );
/* ---- "unused" / "blank unused" frame pools ---- */
#define x264_frame_push_unused x264_template(frame_push_unused)
void x264_frame_push_unused( x264_t *h, x264_frame_t *frame );
#define x264_frame_push_blank_unused x264_template(frame_push_blank_unused)
void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame );
#define x264_frame_pop_blank_unused x264_template(frame_pop_blank_unused)
x264_frame_t *x264_frame_pop_blank_unused( x264_t *h );
#define x264_weight_scale_plane x264_template(weight_scale_plane)
void x264_weight_scale_plane( x264_t *h, pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
int i_width, int i_height, x264_weight_t *w );
#define x264_frame_pop_unused x264_template(frame_pop_unused)
x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec );
#define x264_frame_delete_list x264_template(frame_delete_list)
void x264_frame_delete_list( x264_frame_t **list );
/* ---- operations on x264_sync_frame_list_t ---- */
#define x264_sync_frame_list_init x264_template(sync_frame_list_init)
int x264_sync_frame_list_init( x264_sync_frame_list_t *slist, int nelem );
#define x264_sync_frame_list_delete x264_template(sync_frame_list_delete)
void x264_sync_frame_list_delete( x264_sync_frame_list_t *slist );
#define x264_sync_frame_list_push x264_template(sync_frame_list_push)
void x264_sync_frame_list_push( x264_sync_frame_list_t *slist, x264_frame_t *frame );
#define x264_sync_frame_list_pop x264_template(sync_frame_list_pop)
x264_frame_t *x264_sync_frame_list_pop( x264_sync_frame_list_t *slist );
#endif

2016
common/loongarch/dct-a.S Normal file

File diff suppressed because it is too large Load Diff

95
common/loongarch/dct.h Normal file
View File

@@ -0,0 +1,95 @@
/*****************************************************************************
* dct.h: loongarch transform and zigzag
*****************************************************************************
* Copyright (C) 2023-2025 x264 project
*
* Authors: Peng Zhou <zhoupeng@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_LOONGARCH_DCT_H
#define X264_LOONGARCH_DCT_H

/* Prototypes of the LoongArch transform and zigzag routines.
 *
 * Every name is routed through x264_template() (defined outside this header)
 * by the #define that precedes its prototype.  Each routine is declared
 * exactly once: sub8x8_dct8 and add4x4_idct have no _lasx prototype here, so
 * their only declarations are the _lsx ones in the LSX group below. */

/* ---- routines with the _lasx suffix ---- */
#define x264_sub8x8_dct_lasx x264_template(sub8x8_dct_lasx)
void x264_sub8x8_dct_lasx( int16_t p_dst[4][16], uint8_t *p_src, uint8_t *p_ref );
#define x264_sub16x16_dct_lasx x264_template(sub16x16_dct_lasx)
void x264_sub16x16_dct_lasx( int16_t p_dst[16][16], uint8_t *p_src, uint8_t *p_ref );

#define x264_sub16x16_dct8_lasx x264_template(sub16x16_dct8_lasx)
void x264_sub16x16_dct8_lasx( int16_t pi_dct[4][64], uint8_t *p_pix1,
                              uint8_t *p_pix2 );

#define x264_add8x8_idct_lasx x264_template(add8x8_idct_lasx)
void x264_add8x8_idct_lasx( uint8_t *p_dst, int16_t pi_dct[4][16] );
#define x264_add16x16_idct_lasx x264_template(add16x16_idct_lasx)
void x264_add16x16_idct_lasx( uint8_t *p_dst, int16_t pi_dct[16][16] );
#define x264_add8x8_idct8_lasx x264_template(add8x8_idct8_lasx)
void x264_add8x8_idct8_lasx( uint8_t *p_dst, int16_t pi_dct[64] );
#define x264_add8x8_idct_dc_lasx x264_template(add8x8_idct_dc_lasx)
void x264_add8x8_idct_dc_lasx( uint8_t *p_dst, int16_t dct[4] );
#define x264_add16x16_idct_dc_lasx x264_template(add16x16_idct_dc_lasx)
void x264_add16x16_idct_dc_lasx( uint8_t *p_dst, int16_t dct[16] );

#define x264_idct4x4dc_lasx x264_template(idct4x4dc_lasx)
void x264_idct4x4dc_lasx( int16_t d[16] );
#define x264_dct4x4dc_lasx x264_template(dct4x4dc_lasx)
void x264_dct4x4dc_lasx( int16_t d[16] );

#define x264_zigzag_scan_4x4_frame_lasx x264_template(zigzag_scan_4x4_frame_lasx)
void x264_zigzag_scan_4x4_frame_lasx( int16_t level[16], int16_t dct[16] );

/* ---- routines with the _lsx suffix ---- */
#define x264_sub4x4_dct_lsx x264_template(sub4x4_dct_lsx)
void x264_sub4x4_dct_lsx( int16_t p_dst[16], uint8_t *p_src, uint8_t *p_ref );
#define x264_sub8x8_dct_lsx x264_template(sub8x8_dct_lsx)
void x264_sub8x8_dct_lsx( int16_t p_dst[4][16], uint8_t *p_src, uint8_t *p_ref );
#define x264_sub16x16_dct_lsx x264_template(sub16x16_dct_lsx)
void x264_sub16x16_dct_lsx( int16_t p_dst[16][16], uint8_t *p_src, uint8_t *p_ref );

#define x264_sub8x8_dct8_lsx x264_template(sub8x8_dct8_lsx)
void x264_sub8x8_dct8_lsx( int16_t pi_dct[64], uint8_t *p_pix1, uint8_t *p_pix2 );
#define x264_sub16x16_dct8_lsx x264_template(sub16x16_dct8_lsx)
void x264_sub16x16_dct8_lsx( int16_t pi_dct[4][64], uint8_t *p_pix1,
                             uint8_t *p_pix2 );

#define x264_add4x4_idct_lsx x264_template(add4x4_idct_lsx)
void x264_add4x4_idct_lsx( uint8_t *p_dst, int16_t pi_dct[16] );
#define x264_add8x8_idct_lsx x264_template(add8x8_idct_lsx)
void x264_add8x8_idct_lsx( uint8_t *p_dst, int16_t pi_dct[4][16] );
#define x264_add16x16_idct_lsx x264_template(add16x16_idct_lsx)
void x264_add16x16_idct_lsx( uint8_t *p_dst, int16_t pi_dct[16][16] );
#define x264_add8x8_idct8_lsx x264_template(add8x8_idct8_lsx)
void x264_add8x8_idct8_lsx( uint8_t *p_dst, int16_t pi_dct[64] );
#define x264_add8x8_idct_dc_lsx x264_template(add8x8_idct_dc_lsx)
void x264_add8x8_idct_dc_lsx( uint8_t *p_dst, int16_t dct[4] );
#define x264_add16x16_idct_dc_lsx x264_template(add16x16_idct_dc_lsx)
void x264_add16x16_idct_dc_lsx( uint8_t *p_dst, int16_t dct[16] );

#define x264_idct4x4dc_lsx x264_template(idct4x4dc_lsx)
void x264_idct4x4dc_lsx( int16_t d[16] );
#define x264_dct4x4dc_lsx x264_template(dct4x4dc_lsx)
void x264_dct4x4dc_lsx( int16_t d[16] );

#define x264_zigzag_scan_4x4_frame_lsx x264_template(zigzag_scan_4x4_frame_lsx)
void x264_zigzag_scan_4x4_frame_lsx( int16_t level[16], int16_t dct[16] );

#endif

1618
common/loongarch/deblock-a.S Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,54 @@
/*****************************************************************************
* deblock.h: loongarch deblock
*****************************************************************************
* Copyright (C) 2023-2025 x264 project
*
* Authors: Hao Chen <chenhao@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_LOONGARCH_DEBLOCK_H
#define X264_LOONGARCH_DEBLOCK_H
/* All prototypes in this header are only declared when HIGH_BIT_DEPTH is
 * false; they take uint8_t pixels.  Each name is routed through
 * x264_template(), which is defined outside this header. */
#if !HIGH_BIT_DEPTH
/* luma filters that take a tc0 array (LASX) */
#define x264_deblock_v_luma_lasx x264_template(deblock_v_luma_lasx)
void x264_deblock_v_luma_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_luma_lasx x264_template(deblock_h_luma_lasx)
void x264_deblock_h_luma_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
/* luma intra filters, no tc0 argument (LSX and LASX) */
#define x264_deblock_v_luma_intra_lsx x264_template(deblock_v_luma_intra_lsx)
void x264_deblock_v_luma_intra_lsx( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_luma_intra_lsx x264_template(deblock_h_luma_intra_lsx)
void x264_deblock_h_luma_intra_lsx( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_v_luma_intra_lasx x264_template(deblock_v_luma_intra_lasx)
void x264_deblock_v_luma_intra_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_luma_intra_lasx x264_template(deblock_h_luma_intra_lasx)
void x264_deblock_h_luma_intra_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta );
/* deblock-strength routines (LSX and LASX), same parameter list for both */
#define x264_deblock_strength_lsx x264_template(deblock_strength_lsx)
void x264_deblock_strength_lsx( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
#define x264_deblock_strength_lasx x264_template(deblock_strength_lasx)
void x264_deblock_strength_lasx( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
#endif
#endif

View File

@@ -0,0 +1,770 @@
/*********************************************************************
* Copyright (c) 2022-2024 Loongson Technology Corporation Limited
* Contributed by Xiwei Gu <guxiwei-hf@loongson.cn>
* Shiyou Yin <yinshiyou-hf@loongson.cn>
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*********************************************************************/
/*
* This file is a LoongArch assembly helper file, available under the ISC
* license. It provides a large number of macros and aliases to simplify
* writing assembly code, especially for LSX and LASX optimizations.
*
* Anyone can modify it or add new features for their own purposes.
* Contributing a patch will be appreciated as it might be useful for
* others as well. Send patches to the Loongson contributors mentioned above.
*
* MAJOR version: Usage changes, incompatible with previous version.
* MINOR version: Add new macros/functions, or bug fixes.
* MICRO version: Comment changes or implementation changes.
*/
/* Version of this helper file (0.4.0); the MAJOR/MINOR/MICRO policy is
 * described in the comment just above. */
#define LML_VERSION_MAJOR 0
#define LML_VERSION_MINOR 4
#define LML_VERSION_MICRO 0
/* Text placed directly in front of the symbol name by the function macro;
 * defined empty here. */
#define ASM_PREF
/* Default for the align argument of the function and const macros, which
 * pass it to the .align directive. */
#define DEFAULT_ALIGN 5
/*
*============================================================================
* macros for specific project, set them as needed.
* Following LoongML macros for your reference.
*============================================================================
*/
/*
 * function name [, align]
 * Switches to .text, applies .align with the given value, and defines the
 * global symbol ASM_PREF<name> with type @function.
 * While expanding it also defines a one-shot endfunc macro that emits
 * jirl $r0, $r1, 0x0, records the symbol size with .size, and then removes
 * itself with .purgem so that the next function can define it again.
 */
.macro function name, align=DEFAULT_ALIGN
.macro endfunc
jirl $r0, $r1, 0x0
.size ASM_PREF\name, . - ASM_PREF\name
.purgem endfunc
.endm
.text ;
.align \align ;
.globl ASM_PREF\name ;
.type ASM_PREF\name, @function ;
ASM_PREF\name: ;
.endm
/*
 * const name [, align]
 * Switches to .rodata, applies .align with the given value, and defines the
 * label <name> (no ASM_PREF and no .globl, unlike the function macro).
 * Also defines a one-shot endconst macro that records the object size with
 * .size and then removes itself with .purgem.
 */
.macro const name, align=DEFAULT_ALIGN
.macro endconst
.size \name, . - \name
.purgem endconst
.endm
.section .rodata
.align \align
\name:
.endm
/*
*============================================================================
* LoongArch register alias
*============================================================================
*/
/* Preprocessor aliases that let the assembly sources name registers without
 * the leading dollar sign.  Each alias expands to the same name with the
 * dollar sign prepended. */
/* general-purpose registers: a0-a7, t0-t8, s0-s8, zero, sp, ra */
#define a0 $a0
#define a1 $a1
#define a2 $a2
#define a3 $a3
#define a4 $a4
#define a5 $a5
#define a6 $a6
#define a7 $a7
#define t0 $t0
#define t1 $t1
#define t2 $t2
#define t3 $t3
#define t4 $t4
#define t5 $t5
#define t6 $t6
#define t7 $t7
#define t8 $t8
#define s0 $s0
#define s1 $s1
#define s2 $s2
#define s3 $s3
#define s4 $s4
#define s5 $s5
#define s6 $s6
#define s7 $s7
#define s8 $s8
#define zero $zero
#define sp $sp
#define ra $ra
/* floating-point registers, ABI names: fa0-fa7, ft0-ft15, fs0-fs7 */
#define fa0 $fa0
#define fa1 $fa1
#define fa2 $fa2
#define fa3 $fa3
#define fa4 $fa4
#define fa5 $fa5
#define fa6 $fa6
#define fa7 $fa7
#define ft0 $ft0
#define ft1 $ft1
#define ft2 $ft2
#define ft3 $ft3
#define ft4 $ft4
#define ft5 $ft5
#define ft6 $ft6
#define ft7 $ft7
#define ft8 $ft8
#define ft9 $ft9
#define ft10 $ft10
#define ft11 $ft11
#define ft12 $ft12
#define ft13 $ft13
#define ft14 $ft14
#define ft15 $ft15
#define fs0 $fs0
#define fs1 $fs1
#define fs2 $fs2
#define fs3 $fs3
#define fs4 $fs4
#define fs5 $fs5
#define fs6 $fs6
#define fs7 $fs7
/* floating-point registers, numeric names: f0-f31 */
#define f0 $f0
#define f1 $f1
#define f2 $f2
#define f3 $f3
#define f4 $f4
#define f5 $f5
#define f6 $f6
#define f7 $f7
#define f8 $f8
#define f9 $f9
#define f10 $f10
#define f11 $f11
#define f12 $f12
#define f13 $f13
#define f14 $f14
#define f15 $f15
#define f16 $f16
#define f17 $f17
#define f18 $f18
#define f19 $f19
#define f20 $f20
#define f21 $f21
#define f22 $f22
#define f23 $f23
#define f24 $f24
#define f25 $f25
#define f26 $f26
#define f27 $f27
#define f28 $f28
#define f29 $f29
#define f30 $f30
#define f31 $f31
/* LSX vector registers: vr0-vr31 */
#define vr0 $vr0
#define vr1 $vr1
#define vr2 $vr2
#define vr3 $vr3
#define vr4 $vr4
#define vr5 $vr5
#define vr6 $vr6
#define vr7 $vr7
#define vr8 $vr8
#define vr9 $vr9
#define vr10 $vr10
#define vr11 $vr11
#define vr12 $vr12
#define vr13 $vr13
#define vr14 $vr14
#define vr15 $vr15
#define vr16 $vr16
#define vr17 $vr17
#define vr18 $vr18
#define vr19 $vr19
#define vr20 $vr20
#define vr21 $vr21
#define vr22 $vr22
#define vr23 $vr23
#define vr24 $vr24
#define vr25 $vr25
#define vr26 $vr26
#define vr27 $vr27
#define vr28 $vr28
#define vr29 $vr29
#define vr30 $vr30
#define vr31 $vr31
/* LASX vector registers: xr0-xr31 */
#define xr0 $xr0
#define xr1 $xr1
#define xr2 $xr2
#define xr3 $xr3
#define xr4 $xr4
#define xr5 $xr5
#define xr6 $xr6
#define xr7 $xr7
#define xr8 $xr8
#define xr9 $xr9
#define xr10 $xr10
#define xr11 $xr11
#define xr12 $xr12
#define xr13 $xr13
#define xr14 $xr14
#define xr15 $xr15
#define xr16 $xr16
#define xr17 $xr17
#define xr18 $xr18
#define xr19 $xr19
#define xr20 $xr20
#define xr21 $xr21
#define xr22 $xr22
#define xr23 $xr23
#define xr24 $xr24
#define xr25 $xr25
#define xr26 $xr26
#define xr27 $xr27
#define xr28 $xr28
#define xr29 $xr29
#define xr30 $xr30
#define xr31 $xr31
/*
*============================================================================
* LSX/LASX synthesize instructions
*============================================================================
*/
/*
* Description : Dot product of byte vector elements
* Arguments : Inputs - vj, vk
* Outputs - vd
* Return Type - halfword
*/
/* Each macro below is a two-instruction sequence: a widening multiply of the
 * even-indexed elements (mulwev) written to the destination, followed by a
 * widening multiply-add of the odd-indexed elements (maddwod) accumulated
 * into the same destination.  The destination is overwritten by the first
 * instruction, so it must not be the same register as either source. */
/* LSX, unsigned byte x unsigned byte -> halfword */
.macro vdp2.h.bu vd, vj, vk
vmulwev.h.bu \vd, \vj, \vk
vmaddwod.h.bu \vd, \vj, \vk
.endm
/* LSX, unsigned byte x signed byte -> halfword */
.macro vdp2.h.bu.b vd, vj, vk
vmulwev.h.bu.b \vd, \vj, \vk
vmaddwod.h.bu.b \vd, \vj, \vk
.endm
/* LSX, signed halfword x signed halfword -> word */
.macro vdp2.w.h vd, vj, vk
vmulwev.w.h \vd, \vj, \vk
vmaddwod.w.h \vd, \vj, \vk
.endm
/* LASX, unsigned byte x unsigned byte -> halfword */
.macro xvdp2.h.bu xd, xj, xk
xvmulwev.h.bu \xd, \xj, \xk
xvmaddwod.h.bu \xd, \xj, \xk
.endm
/* LASX, unsigned byte x signed byte -> halfword */
.macro xvdp2.h.bu.b xd, xj, xk
xvmulwev.h.bu.b \xd, \xj, \xk
xvmaddwod.h.bu.b \xd, \xj, \xk
.endm
/* LASX, signed halfword x signed halfword -> word */
.macro xvdp2.w.h xd, xj, xk
xvmulwev.w.h \xd, \xj, \xk
xvmaddwod.w.h \xd, \xj, \xk
.endm
/*
* Description : Dot product of byte or halfword vector elements, accumulated into vd
* Arguments : Inputs - vj, vk
* Outputs - vd
* Return Type - twice size of input
*/
/* Accumulating variants: both instructions are multiply-add forms (maddwev
 * for the even-indexed elements, maddwod for the odd-indexed ones), so the
 * previous contents of the destination are kept and added to.
 * Note that no xvdp2add.h.bu macro is defined; the LASX set only has the
 * .h.bu.b and .w.h forms. */
/* LSX, unsigned byte x unsigned byte -> halfword */
.macro vdp2add.h.bu vd, vj, vk
vmaddwev.h.bu \vd, \vj, \vk
vmaddwod.h.bu \vd, \vj, \vk
.endm
/* LSX, unsigned byte x signed byte -> halfword */
.macro vdp2add.h.bu.b vd, vj, vk
vmaddwev.h.bu.b \vd, \vj, \vk
vmaddwod.h.bu.b \vd, \vj, \vk
.endm
/* LSX, signed halfword x signed halfword -> word */
.macro vdp2add.w.h vd, vj, vk
vmaddwev.w.h \vd, \vj, \vk
vmaddwod.w.h \vd, \vj, \vk
.endm
/* LASX, unsigned byte x signed byte -> halfword */
.macro xvdp2add.h.bu.b xd, xj, xk
xvmaddwev.h.bu.b \xd, \xj, \xk
xvmaddwod.h.bu.b \xd, \xj, \xk
.endm
/* LASX, signed halfword x signed halfword -> word */
.macro xvdp2add.w.h xd, xj, xk
xvmaddwev.w.h \xd, \xj, \xk
xvmaddwod.w.h \xd, \xj, \xk
.endm
/*
* Description : Range element vj[i] to vk[i] ~ va[i]
* clip: vj > vk ? vj : vk && vj < va ? vj : va
*/
/* Per element: dst = min( max( src, lower ), upper ), using the signed
 * max/min instructions.  The second operand (vk/xk) is the lower bound and
 * the third (va/xa) is the upper bound.  The destination is written by the
 * max step before the upper bound is read, so it must not be the same
 * register as va/xa. */
/* LSX, halfword elements */
.macro vclip.h vd, vj, vk, va
vmax.h \vd, \vj, \vk
vmin.h \vd, \vd, \va
.endm
/* LSX, word elements */
.macro vclip.w vd, vj, vk, va
vmax.w \vd, \vj, \vk
vmin.w \vd, \vd, \va
.endm
/* LASX, halfword elements */
.macro xvclip.h xd, xj, xk, xa
xvmax.h \xd, \xj, \xk
xvmin.h \xd, \xd, \xa
.endm
/* LASX, word elements */
.macro xvclip.w xd, xj, xk, xa
xvmax.w \xd, \xj, \xk
xvmin.w \xd, \xd, \xa
.endm
/*
* Description : Range element vj[i] to 0 ~ 255
* clip255: vj < 255 ? vj : 255 && vj > 0 ? vj : 0
*/
/* Two steps per element: a signed max with the immediate 0 removes negative
 * values, then an unsigned saturate with immediate 7 caps the result at 255.
 * The result keeps the element width of the input (halfword or word). */
/* LSX, halfword elements */
.macro vclip255.h vd, vj
vmaxi.h \vd, \vj, 0
vsat.hu \vd, \vd, 7
.endm
/* LSX, word elements */
.macro vclip255.w vd, vj
vmaxi.w \vd, \vj, 0
vsat.wu \vd, \vd, 7
.endm
/* LASX, halfword elements */
.macro xvclip255.h xd, xj
xvmaxi.h \xd, \xj, 0
xvsat.hu \xd, \xd, 7
.endm
/* LASX, word elements */
.macro xvclip255.w xd, xj
xvmaxi.w \xd, \xj, 0
xvsat.wu \xd, \xd, 7
.endm
/*
* Description : Store elements of vector
* vd : Data vector to be stored
* rk : Address of data storage
* ra : Offset of address
* si : Index of data in vd
*/
/* The *stelmx macros first add ra to rk with add.d and then store element si
 * of the vector at offset 0 from the updated rk.  The address register rk is
 * therefore modified by every use. */
/* LSX, byte element */
.macro vstelmx.b vd, rk, ra, si
add.d \rk, \rk, \ra
vstelm.b \vd, \rk, 0, \si
.endm
/* LSX, halfword element */
.macro vstelmx.h vd, rk, ra, si
add.d \rk, \rk, \ra
vstelm.h \vd, \rk, 0, \si
.endm
/* LSX, word element */
.macro vstelmx.w vd, rk, ra, si
add.d \rk, \rk, \ra
vstelm.w \vd, \rk, 0, \si
.endm
/* LSX, double-word element */
.macro vstelmx.d vd, rk, ra, si
add.d \rk, \rk, \ra
vstelm.d \vd, \rk, 0, \si
.endm
/* Register copy for LSX registers: OR of the source with itself. */
.macro vmov xd, xj
vor.v \xd, \xj, \xj
.endm
/* Register copy for LASX registers: OR of the source with itself. */
.macro xmov xd, xj
xvor.v \xd, \xj, \xj
.endm
/* LASX, double-word element; same add-then-store sequence as above. */
.macro xvstelmx.d xd, rk, ra, si
add.d \rk, \rk, \ra
xvstelm.d \xd, \rk, 0, \si
.endm
/*
*============================================================================
* LSX/LASX custom macros
*============================================================================
*/
/*
* Load 4 float, double, V128, v256 elements with stride.
*/
/* All four macros load out0 from src with offset 0 and out1..out3 with the
 * indexed load form, using stride, stride2 and stride3 as the index operand.
 * The macros do not compute stride2 / stride3 themselves; the caller supplies
 * them.  src is not modified. */
/* single-precision float loads */
.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
fld.s \out0, \src, 0
fldx.s \out1, \src, \stride
fldx.s \out2, \src, \stride2
fldx.s \out3, \src, \stride3
.endm
/* double-precision float loads */
.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
fld.d \out0, \src, 0
fldx.d \out1, \src, \stride
fldx.d \out2, \src, \stride2
fldx.d \out3, \src, \stride3
.endm
/* LSX vector loads */
.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
vld \out0, \src, 0
vldx \out1, \src, \stride
vldx \out2, \src, \stride2
vldx \out3, \src, \stride3
.endm
/* LASX vector loads */
.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
xvld \out0, \src, 0
xvldx \out1, \src, \stride
xvldx \out2, \src, \stride2
xvldx \out3, \src, \stride3
.endm
/*
* Description : Transpose 4x4 block with half-word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1, out2, out3
*/
/* Register constraints visible from the instruction order: tmp0 is written
 * before in2/in3 are read, and out0/out2 are read again after being written,
 * so tmp0 must differ from in2/in3 and out1 must differ from out0 and out2. */
.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1
/* stage 1: interleave the low halfwords of the row pairs (in0,in1), (in2,in3) */
vilvl.h \tmp0, \in1, \in0
vilvl.h \tmp1, \in3, \in2
/* stage 2: interleave words of the two intermediates (low part, high part) */
vilvl.w \out0, \tmp1, \tmp0
vilvh.w \out2, \tmp1, \tmp0
/* stage 3: out1 and out3 are built from the high double-words of out0 and out2 */
vilvh.d \out1, \out0, \out0
vilvh.d \out3, \out0, \out2
.endm
/*
* Description : Transpose 4x4 block with word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1, out2, out3
* Details :
* Example :
* 1, 2, 3, 4 1, 5, 9,13
* 5, 6, 7, 8 to 2, 6,10,14
* 9,10,11,12 =====> 3, 7,11,15
* 13,14,15,16 4, 8,12,16
*/
/* Register constraints visible from the instruction order: tmp0 and out1 are
 * written before in2/in3 are read, so neither may be the same register as
 * in2 or in3; out1 and out3 hold intermediates until the last two
 * instructions, so they must also differ from out0, out2, tmp0 and tmp1. */
.macro LSX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1
/* stage 1: word interleave of (in0,in1) and (in2,in3), low and high parts;
 * the high parts are parked in out1 / out3 */
vilvl.w \tmp0, \in1, \in0
vilvh.w \out1, \in1, \in0
vilvl.w \tmp1, \in3, \in2
vilvh.w \out3, \in3, \in2
/* stage 2: double-word interleave of the intermediates; out3 and out1 are
 * overwritten last because they are still needed as sources above them */
vilvl.d \out0, \tmp1, \tmp0
vilvl.d \out2, \out3, \out1
vilvh.d \out3, \out3, \out1
vilvh.d \out1, \tmp1, \tmp0
.endm
/*
* Description : Transpose 8x8 block with half-word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
* Outputs - out0, out1, out2, out3, out4, out5, out6, out7
*/
/* All reads of in0..in7 happen before the first write to an out register, but
 * tmp0..tmp7 are written while inputs are still needed, so the tmp registers
 * must not be the same as any input.  out0..out3 are written while tmp0..tmp3
 * are still live, so they must also differ from tmp0..tmp3. */
.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
tmp3, tmp4, tmp5, tmp6, tmp7
/* low halves: halfword interleave of (in4,in6), (in5,in7), (in0,in2), (in1,in3) */
vilvl.h \tmp0, \in6, \in4
vilvl.h \tmp1, \in7, \in5
vilvl.h \tmp2, \in2, \in0
vilvl.h \tmp3, \in3, \in1
/* second halfword interleave of those intermediates */
vilvl.h \tmp4, \tmp1, \tmp0
vilvh.h \tmp5, \tmp1, \tmp0
vilvl.h \tmp6, \tmp3, \tmp2
vilvh.h \tmp7, \tmp3, \tmp2
/* high halves: same first-level interleave, reusing tmp0..tmp3 */
vilvh.h \tmp0, \in6, \in4
vilvh.h \tmp1, \in7, \in5
vilvh.h \tmp2, \in2, \in0
vilvh.h \tmp3, \in3, \in1
/* out0..out3: pick even / odd double-words from the low-half intermediates */
vpickev.d \out0, \tmp4, \tmp6
vpickod.d \out1, \tmp4, \tmp6
vpickev.d \out2, \tmp5, \tmp7
vpickod.d \out3, \tmp5, \tmp7
/* second-level interleave for the high halves */
vilvl.h \tmp4, \tmp1, \tmp0
vilvh.h \tmp5, \tmp1, \tmp0
vilvl.h \tmp6, \tmp3, \tmp2
vilvh.h \tmp7, \tmp3, \tmp2
/* out4..out7: pick even / odd double-words from the high-half intermediates */
vpickev.d \out4, \tmp4, \tmp6
vpickod.d \out5, \tmp4, \tmp6
vpickev.d \out6, \tmp5, \tmp7
vpickod.d \out7, \tmp5, \tmp7
.endm
/*
* Description : Transpose 16x8 block with byte elements in vectors
* Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
*                      in8, in9, in10, in11, in12, in13, in14, in15
* Outputs - out0, out1, out2, out3, out4, out5, out6, out7
*/
/* Takes sixteen input registers and produces eight outputs.  Only the
 * interleave-low byte form is applied to the inputs, and every input is read
 * before the first out register is written.  tmp0..tmp7 are written while
 * inputs are still needed, so they must not be the same as any input, and the
 * outs hold intermediates in the middle stage, so they must differ from the
 * tmps. */
.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \
in8, in9, in10, in11, in12, in13, in14, in15, \
out0, out1, out2, out3, out4, out5, out6, out7,\
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
/* stage 1: byte interleave of input pairs two rows apart */
xvilvl.b \tmp0, \in2, \in0
xvilvl.b \tmp1, \in3, \in1
xvilvl.b \tmp2, \in6, \in4
xvilvl.b \tmp3, \in7, \in5
xvilvl.b \tmp4, \in10, \in8
xvilvl.b \tmp5, \in11, \in9
xvilvl.b \tmp6, \in14, \in12
xvilvl.b \tmp7, \in15, \in13
/* stage 2: second byte interleave, results parked in the out registers */
xvilvl.b \out0, \tmp1, \tmp0
xvilvh.b \out1, \tmp1, \tmp0
xvilvl.b \out2, \tmp3, \tmp2
xvilvh.b \out3, \tmp3, \tmp2
xvilvl.b \out4, \tmp5, \tmp4
xvilvh.b \out5, \tmp5, \tmp4
xvilvl.b \out6, \tmp7, \tmp6
xvilvh.b \out7, \tmp7, \tmp6
/* stage 3: word interleave back into the tmp registers */
xvilvl.w \tmp0, \out2, \out0
xvilvh.w \tmp2, \out2, \out0
xvilvl.w \tmp4, \out3, \out1
xvilvh.w \tmp6, \out3, \out1
xvilvl.w \tmp1, \out6, \out4
xvilvh.w \tmp3, \out6, \out4
xvilvl.w \tmp5, \out7, \out5
xvilvh.w \tmp7, \out7, \out5
/* stage 4: double-word interleave produces the final outputs */
xvilvl.d \out0, \tmp1, \tmp0
xvilvh.d \out1, \tmp1, \tmp0
xvilvl.d \out2, \tmp3, \tmp2
xvilvh.d \out3, \tmp3, \tmp2
xvilvl.d \out4, \tmp5, \tmp4
xvilvh.d \out5, \tmp5, \tmp4
xvilvl.d \out6, \tmp7, \tmp6
xvilvh.d \out7, \tmp7, \tmp6
.endm
/*
* Description : Transpose 4x4 block with half-word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1, out2, out3
*/
/* Same instruction sequence as LSX_TRANSPOSE4x4_H, using the xv (LASX) forms.
 * tmp0 is written before in2/in3 are read, and out0/out2 are read again after
 * being written, so tmp0 must differ from in2/in3 and out1 must differ from
 * out0 and out2. */
.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1
/* stage 1: halfword interleave (low part) of (in0,in1) and (in2,in3) */
xvilvl.h \tmp0, \in1, \in0
xvilvl.h \tmp1, \in3, \in2
/* stage 2: word interleave of the intermediates */
xvilvl.w \out0, \tmp1, \tmp0
xvilvh.w \out2, \tmp1, \tmp0
/* stage 3: out1 and out3 are built from the high double-words of out0 and out2 */
xvilvh.d \out1, \out0, \out0
xvilvh.d \out3, \out0, \out2
.endm
/*
* Description : Transpose 4x8 block with half-word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1, out2, out3
*/
/* tmp0 is written before in1/in3 are read, so it must not be the same
 * register as in1 or in3.  out2 and out3 hold intermediates that are read by
 * the instructions that write out0 and out1, so out0/out1 must differ from
 * out2/out3. */
.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1
/* stage 1: halfword interleave (low part) of (in0,in2) and (in1,in3) */
xvilvl.h \tmp0, \in2, \in0
xvilvl.h \tmp1, \in3, \in1
/* stage 2: second halfword interleave, low and high parts */
xvilvl.h \out2, \tmp1, \tmp0
xvilvh.h \out3, \tmp1, \tmp0
/* stage 3: each output is a double-word interleave of one intermediate with
 * itself; out2 and out3 are overwritten last */
xvilvl.d \out0, \out2, \out2
xvilvh.d \out1, \out2, \out2
xvilvl.d \out2, \out3, \out3
xvilvh.d \out3, \out3, \out3
.endm
/*
* Description : Transpose 8x8 block with half-word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
* Outputs - out0, out1, out2, out3, out4, out5, out6, out7
*/
/* Same instruction sequence as LSX_TRANSPOSE8x8_H, using the xv (LASX) forms.
 * All reads of in0..in7 happen before the first write to an out register, but
 * tmp0..tmp7 are written while inputs are still needed, so the tmp registers
 * must not be the same as any input.  out0..out3 are written while tmp0..tmp3
 * are still live, so they must also differ from tmp0..tmp3. */
.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7, \
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
/* low halves: halfword interleave of (in4,in6), (in5,in7), (in0,in2), (in1,in3) */
xvilvl.h \tmp0, \in6, \in4
xvilvl.h \tmp1, \in7, \in5
xvilvl.h \tmp2, \in2, \in0
xvilvl.h \tmp3, \in3, \in1
/* second halfword interleave of those intermediates */
xvilvl.h \tmp4, \tmp1, \tmp0
xvilvh.h \tmp5, \tmp1, \tmp0
xvilvl.h \tmp6, \tmp3, \tmp2
xvilvh.h \tmp7, \tmp3, \tmp2
/* high halves: same first-level interleave, reusing tmp0..tmp3 */
xvilvh.h \tmp0, \in6, \in4
xvilvh.h \tmp1, \in7, \in5
xvilvh.h \tmp2, \in2, \in0
xvilvh.h \tmp3, \in3, \in1
/* out0..out3: pick even / odd double-words from the low-half intermediates */
xvpickev.d \out0, \tmp4, \tmp6
xvpickod.d \out1, \tmp4, \tmp6
xvpickev.d \out2, \tmp5, \tmp7
xvpickod.d \out3, \tmp5, \tmp7
/* second-level interleave for the high halves */
xvilvl.h \tmp4, \tmp1, \tmp0
xvilvh.h \tmp5, \tmp1, \tmp0
xvilvl.h \tmp6, \tmp3, \tmp2
xvilvh.h \tmp7, \tmp3, \tmp2
/* out4..out7: pick even / odd double-words from the high-half intermediates */
xvpickev.d \out4, \tmp4, \tmp6
xvpickod.d \out5, \tmp4, \tmp6
xvpickev.d \out6, \tmp5, \tmp7
xvpickod.d \out7, \tmp5, \tmp7
.endm
/*
* Description : Transpose 2x4x4 block with half-word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1, out2, out3
*/
/* Note the operand order: unlike the other transposes in this file, the
 * first-stage interleaves take (in0, in1) and (in2, in3) with the lower
 * numbered register as the first source.
 * tmp1 and out1 are written before in2/in3 are read, so neither may be the
 * same register as in2 or in3.  out1, out2 and out3 also hold intermediates,
 * so the outs must be distinct from tmp0..tmp2. */
.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1, tmp2
/* stage 1: halfword interleave of the two row pairs, high and low parts */
xvilvh.h \tmp1, \in0, \in1
xvilvl.h \out1, \in0, \in1
xvilvh.h \tmp0, \in2, \in3
xvilvl.h \out3, \in2, \in3
/* stage 2: word interleave of the intermediates */
xvilvh.w \tmp2, \out3, \out1
xvilvl.w \out3, \out3, \out1
xvilvl.w \out2, \tmp0, \tmp1
xvilvh.w \tmp1, \tmp0, \tmp1
/* stage 3: double-word interleave produces the four outputs */
xvilvh.d \out0, \out2, \out3
xvilvl.d \out2, \out2, \out3
xvilvh.d \out1, \tmp1, \tmp2
xvilvl.d \out3, \tmp1, \tmp2
.endm
/*
* Description : Transpose 4x4 block with word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1, out2, out3
* Details :
* Example :
* 1, 2, 3, 4, 1, 2, 3, 4 1,5, 9,13, 1,5, 9,13
* 5, 6, 7, 8, 5, 6, 7, 8 to 2,6,10,14, 2,6,10,14
* 9,10,11,12, 9,10,11,12 =====> 3,7,11,15, 3,7,11,15
* 13,14,15,16, 13,14,15,16 4,8,12,16, 4,8,12,16
*/
/* Same instruction sequence as LSX_TRANSPOSE4x4_W, using the xv (LASX) forms.
 * tmp0 and out1 are written before in2/in3 are read, so neither may be the
 * same register as in2 or in3; out1 and out3 hold intermediates until the
 * last two instructions, so they must also differ from out0, out2, tmp0 and
 * tmp1. */
.macro LASX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1
/* stage 1: word interleave of (in0,in1) and (in2,in3), low and high parts;
 * the high parts are parked in out1 / out3 */
xvilvl.w \tmp0, \in1, \in0
xvilvh.w \out1, \in1, \in0
xvilvl.w \tmp1, \in3, \in2
xvilvh.w \out3, \in3, \in2
/* stage 2: double-word interleave of the intermediates; out3 and out1 are
 * overwritten last because they are still needed as sources above them */
xvilvl.d \out0, \tmp1, \tmp0
xvilvl.d \out2, \out3, \out1
xvilvh.d \out3, \out3, \out1
xvilvh.d \out1, \tmp1, \tmp0
.endm
/*
* Description : Transpose 8x8 block with word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
* Outputs - out0, out1, out2, out3, out4, out5, out6,
* out7
* Example : LASX_TRANSPOSE8x8_W
* in0 : 1,2,3,4,5,6,7,8
* in1 : 2,2,3,4,5,6,7,8
* in2 : 3,2,3,4,5,6,7,8
* in3 : 4,2,3,4,5,6,7,8
* in4 : 5,2,3,4,5,6,7,8
* in5 : 6,2,3,4,5,6,7,8
* in6 : 7,2,3,4,5,6,7,8
* in7 : 8,2,3,4,5,6,7,8
*
* out0 : 1,2,3,4,5,6,7,8
* out1 : 2,2,2,2,2,2,2,2
* out2 : 3,3,3,3,3,3,3,3
* out3 : 4,4,4,4,4,4,4,4
* out4 : 5,5,5,5,5,5,5,5
* out5 : 6,6,6,6,6,6,6,6
* out6 : 7,7,7,7,7,7,7,7
* out7 : 8,8,8,8,8,8,8,8
*/
/* Uses the xmov macro defined earlier in this file.
 * tmp0..tmp3 are written before in4..in7 are read, and out0..out3 are written
 * before in4..in7 are read, so none of those registers may be the same as
 * in4..in7.  The outs also hold intermediates while tmp0..tmp3 are live, so
 * they must differ from the tmps. */
.macro LASX_TRANSPOSE8x8_W in0, in1, in2, in3, in4, in5, in6, in7,\
out0, out1, out2, out3, out4, out5, out6, out7,\
tmp0, tmp1, tmp2, tmp3
/* rows 0..3: two levels of word interleave -> out0..out3 */
xvilvl.w \tmp0, \in2, \in0
xvilvl.w \tmp1, \in3, \in1
xvilvh.w \tmp2, \in2, \in0
xvilvh.w \tmp3, \in3, \in1
xvilvl.w \out0, \tmp1, \tmp0
xvilvh.w \out1, \tmp1, \tmp0
xvilvl.w \out2, \tmp3, \tmp2
xvilvh.w \out3, \tmp3, \tmp2
/* rows 4..7: same two levels -> out4..out7 */
xvilvl.w \tmp0, \in6, \in4
xvilvl.w \tmp1, \in7, \in5
xvilvh.w \tmp2, \in6, \in4
xvilvh.w \tmp3, \in7, \in5
xvilvl.w \out4, \tmp1, \tmp0
xvilvh.w \out5, \tmp1, \tmp0
xvilvl.w \out6, \tmp3, \tmp2
xvilvh.w \out7, \tmp3, \tmp2
/* keep copies of out0..out3, because the next four instructions overwrite
 * them while their old contents are still needed for out4..out7 */
xmov \tmp0, \out0
xmov \tmp1, \out1
xmov \tmp2, \out2
xmov \tmp3, \out3
/* final stage: recombine 128-bit halves of outN and outN+4 with xvpermi.q */
xvpermi.q \out0, \out4, 0x02
xvpermi.q \out1, \out5, 0x02
xvpermi.q \out2, \out6, 0x02
xvpermi.q \out3, \out7, 0x02
xvpermi.q \out4, \tmp0, 0x31
xvpermi.q \out5, \tmp1, 0x31
xvpermi.q \out6, \tmp2, 0x31
xvpermi.q \out7, \tmp3, 0x31
.endm
/*
* Description : Transpose 4x4 block with double-word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1, out2, out3
* Example : LASX_TRANSPOSE4x4_D
* in0 : 1,2,3,4
* in1 : 1,2,3,4
* in2 : 1,2,3,4
* in3 : 1,2,3,4
*
* out0 : 1,1,1,1
* out1 : 2,2,2,2
* out2 : 3,3,3,3
* out3 : 4,4,4,4
*/
/* tmp0 and out1 are written before in2/in3 are read, so neither may be the
 * same register as in2 or in3.  out1 and out2 hold intermediates that are
 * read by the permutes that write out0 and out3, so the four outs and the two
 * tmps must all be distinct registers. */
.macro LASX_TRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1
/* stage 1: double-word interleave of (in0,in1) and (in2,in3) */
xvilvl.d \tmp0, \in1, \in0
xvilvh.d \out1, \in1, \in0
xvilvh.d \tmp1, \in3, \in2
xvilvl.d \out2, \in3, \in2
/* copy the intermediates that are needed twice (OR of a register with itself) */
xvor.v \out0, \tmp0, \tmp0
xvor.v \out3, \tmp1, \tmp1
/* stage 2: recombine 128-bit halves; out2 and out1 are overwritten only
 * after they have been consumed as sources for out0 and out3 */
xvpermi.q \out0, \out2, 0x02
xvpermi.q \out2, \tmp0, 0x31
xvpermi.q \out3, \out1, 0x31
xvpermi.q \out1, \tmp1, 0x02
.endm

View File

@@ -0,0 +1,47 @@
/*****************************************************************************
* loongson_util.S: loongson utility macros
*****************************************************************************
* Copyright (C) 2023-2025 x264 project
*
* Authors: Shiyou Yin <yinshiyou-hf@loongson.cn>
* Xiwei Gu <guxiwei-hf@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
/* Two-level token pasting: JOIN expands its arguments first, then GLUE
 * concatenates the expanded tokens. */
#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)
/* Set prefix as needed. */
/* ASM_REF expands to x264_<BIT_DEPTH>_ (BIT_DEPTH is supplied from outside
 * this file), and is prepended to every exported function name. */
#define ASM_REF JOIN(JOIN(x264_, BIT_DEPTH), _)
/* Row strides used by the assembly; presumably these mirror the C-side
 * FENC_STRIDE / FDEC_STRIDE constants - confirm against common/common.h. */
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
/*
 * function_x264 name [, align]
 * Opens an exported function named ASM_REF + name in .text, aligned to
 * `align` (default DEFAULT_ALIGN, defined outside this chunk).
 * As a side effect it defines a one-shot endfunc_x264 macro that closes the
 * function; endfunc_x264 removes itself when used, so each function_x264
 * must be paired with exactly one endfunc_x264.
 */
.macro function_x264 name, align=DEFAULT_ALIGN
.macro endfunc_x264
/* Jump to the address in $r1, discarding the link value into $r0: this is
 * the function return. */
jirl $r0, $r1, 0x0
/* Record the symbol size as the distance from the label to here. */
.size ASM_REF\name, . - ASM_REF\name
/* Delete this macro so the next function_x264 can define it again. */
.purgem endfunc_x264
.endm
.text ;
.align \align ;
.globl ASM_REF\name ;
.type ASM_REF\name, @function ;
ASM_REF\name: ;
.endm

2702
common/loongarch/mc-a.S Normal file

File diff suppressed because it is too large Load Diff

406
common/loongarch/mc-c.c Normal file
View File

@@ -0,0 +1,406 @@
/*****************************************************************************
* mc-c.c: loongarch motion compensation
*****************************************************************************
* Copyright (C) 2023-2025 x264 project
*
* Authors: Xiwei Gu <guxiwei-hf@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "mc.h"
#if !HIGH_BIT_DEPTH
/* Emit the LSX weight dispatch table mc<func>_wtab_lsx.
 * `func` is either empty or a suffix such as _noden.  Callers in this file
 * index the table with (width >> 2), so slots 0..5 correspond to widths
 * <4, 4, 8, 12, 16 and 20; widths without a dedicated kernel reuse the
 * neighbouring one. */
#define MC_WEIGHT_LSX(func) \
static void (* mc##func##_wtab_lsx[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) = \
{ \
    [0] = x264_mc_weight_w4##func##_lsx,  \
    [1] = x264_mc_weight_w4##func##_lsx,  \
    [2] = x264_mc_weight_w8##func##_lsx,  \
    [3] = x264_mc_weight_w16##func##_lsx, \
    [4] = x264_mc_weight_w16##func##_lsx, \
    [5] = x264_mc_weight_w20##func##_lsx, \
};
/* Emit the LASX weight dispatch table mc<func>_wtab_lasx.
 * `func` is either empty or a suffix such as _noden.  Callers in this file
 * index the table with (width >> 2), so slots 0..5 correspond to widths
 * <4, 4, 8, 12, 16 and 20; widths without a dedicated kernel reuse the
 * neighbouring one. */
#define MC_WEIGHT(func) \
static void (* mc##func##_wtab_lasx[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) = \
{ \
    [0] = x264_mc_weight_w4##func##_lasx,  \
    [1] = x264_mc_weight_w4##func##_lasx,  \
    [2] = x264_mc_weight_w8##func##_lasx,  \
    [3] = x264_mc_weight_w16##func##_lasx, \
    [4] = x264_mc_weight_w16##func##_lasx, \
    [5] = x264_mc_weight_w20##func##_lasx, \
};
#if !HIGH_BIT_DEPTH
MC_WEIGHT_LSX()
MC_WEIGHT_LSX(_noden)
MC_WEIGHT()
MC_WEIGHT(_noden)
#endif
/* Select the LSX weight table for `w`: the regular table when
 * w->i_denom >= 1, otherwise the "_noden" table.  `h` is not used. */
static void weight_cache_lsx( x264_t *h, x264_weight_t *w )
{
    w->weightfn = ( w->i_denom >= 1 ) ? mc_wtab_lsx : mc_noden_wtab_lsx;
}
/* LSX weight kernels installed as pf->weight / offsetadd / offsetsub.
 * Slot layout matches the other width tables in this file
 * (presumably indexed by width >> 2 by the callers - confirm). */
static weight_fn_t mc_weight_wtab_lsx[6] =
{
    [0] = x264_mc_weight_w4_lsx,
    [1] = x264_mc_weight_w4_lsx,
    [2] = x264_mc_weight_w8_lsx,
    [3] = x264_mc_weight_w16_lsx,
    [4] = x264_mc_weight_w16_lsx,
    [5] = x264_mc_weight_w20_lsx,
};
/* LSX two-source averaging kernels, indexed by (width >> 2).
 * Slot 0 (width < 4) has no kernel; slot 3 reuses the w16 kernel. */
static void (* const pixel_avg_wtab_lsx[6])(uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
{
    [0] = NULL,
    [1] = x264_pixel_avg2_w4_lsx,
    [2] = x264_pixel_avg2_w8_lsx,
    [3] = x264_pixel_avg2_w16_lsx,
    [4] = x264_pixel_avg2_w16_lsx,
    [5] = x264_pixel_avg2_w20_lsx,
};
/* LSX block-copy kernels, indexed by (width >> 2).
 * Only widths 4, 8 and 16 have a kernel; slots 0 and 3 are NULL. */
static void (* const mc_copy_wtab_lsx[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) =
{
    [0] = NULL,
    [1] = x264_mc_copy_w4_lsx,
    [2] = x264_mc_copy_w8_lsx,
    [3] = NULL,
    [4] = x264_mc_copy_w16_lsx,
};
/* Select the LASX weight table for `w`: the regular table when
 * w->i_denom >= 1, otherwise the "_noden" table.  `h` is not used. */
static void weight_cache_lasx( x264_t *h, x264_weight_t *w )
{
    w->weightfn = ( w->i_denom >= 1 ) ? mc_wtab_lasx : mc_noden_wtab_lasx;
}
/* LASX weight kernels installed as pf->weight / offsetadd / offsetsub.
 * Slot layout matches the other width tables in this file
 * (presumably indexed by width >> 2 by the callers - confirm). */
static weight_fn_t mc_weight_wtab_lasx[6] =
{
    [0] = x264_mc_weight_w4_lasx,
    [1] = x264_mc_weight_w4_lasx,
    [2] = x264_mc_weight_w8_lasx,
    [3] = x264_mc_weight_w16_lasx,
    [4] = x264_mc_weight_w16_lasx,
    [5] = x264_mc_weight_w20_lasx,
};
/* LASX two-source averaging kernels, indexed by (width >> 2).
 * Slot 0 (width < 4) has no kernel; slot 3 reuses the w16 kernel. */
static void (* const pixel_avg_wtab_lasx[6])( uint8_t *, intptr_t, uint8_t *,
                                              intptr_t, uint8_t *, int ) =
{
    [0] = NULL,
    [1] = x264_pixel_avg2_w4_lasx,
    [2] = x264_pixel_avg2_w8_lasx,
    [3] = x264_pixel_avg2_w16_lasx,
    [4] = x264_pixel_avg2_w16_lasx,
    [5] = x264_pixel_avg2_w20_lasx,
};
/* LASX block-copy kernels, indexed by (width >> 2).
 * Only widths 4, 8 and 16 have a kernel; slots 0 and 3 are NULL. */
static void (* const mc_copy_wtab_lasx[5])( uint8_t *, intptr_t, uint8_t *,
                                            intptr_t, int ) =
{
    [0] = NULL,
    [1] = x264_mc_copy_w4_lasx,
    [2] = x264_mc_copy_w8_lasx,
    [3] = NULL,
    [4] = x264_mc_copy_w16_lasx,
};
/*
 * Return a pointer to the reference block for motion vector (m_vx, m_vy),
 * given in quarter-sample units (low two bits = fraction).
 *
 * p_src[] holds the candidate planes; x264_hpel_ref0/1 choose which of them
 * to read for a given fractional position.
 *
 *  - If either fraction is odd, two planes are averaged into p_dst (and
 *    weighted in place when pWeight->weightfn is set); p_dst is returned.
 *  - Otherwise, with a weight function, the weighted block is written to
 *    p_dst and p_dst is returned.
 *  - Otherwise nothing is copied: *p_dst_stride is set to i_src_stride and
 *    a pointer into the source plane is returned.
 */
static uint8_t *get_ref_lsx( uint8_t *p_dst, intptr_t *p_dst_stride,
                             uint8_t *p_src[4], intptr_t i_src_stride,
                             int32_t m_vx, int32_t m_vy,
                             int32_t i_width, int32_t i_height,
                             const x264_weight_t *pWeight )
{
    const int32_t frac_x  = m_vx & 3;
    const int32_t frac_y  = m_vy & 3;
    const int32_t qpel    = ( frac_y << 2 ) + frac_x;
    const int32_t tab_idx = i_width >> 2;
    /* Integer-sample part of the displacement. */
    const int32_t base    = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
    uint8_t *p_ref = p_src[x264_hpel_ref0[qpel]] + base;

    /* A vertical fraction of 3 reads from one row further down. */
    if( frac_y == 3 )
        p_ref += i_src_stride;

    /* Bits 0 and 2 of qpel are the low bits of frac_x and frac_y. */
    if( qpel & 5 )
    {
        /* A horizontal fraction of 3 reads from one column further right. */
        uint8_t *p_ref2 = p_src[x264_hpel_ref1[qpel]] + base + ( frac_x == 3 );

        pixel_avg_wtab_lsx[tab_idx]( p_dst, *p_dst_stride, p_ref, i_src_stride,
                                     p_ref2, i_height );
        if( pWeight->weightfn )
            pWeight->weightfn[tab_idx]( p_dst, *p_dst_stride, p_dst, *p_dst_stride,
                                        pWeight, i_height );
        return p_dst;
    }

    if( pWeight->weightfn )
    {
        pWeight->weightfn[tab_idx]( p_dst, *p_dst_stride, p_ref, i_src_stride,
                                    pWeight, i_height );
        return p_dst;
    }

    *p_dst_stride = i_src_stride;
    return p_ref;
}
/*
 * Write the motion-compensated block for vector (m_vx, m_vy), given in
 * quarter-sample units (low two bits = fraction), into p_dst.
 *
 *  - If either fraction is odd, two planes selected through
 *    x264_hpel_ref0/1 are averaged into p_dst, then weighted in place when
 *    pWeight->weightfn is set.
 *  - Otherwise the single source plane is either weighted into p_dst or,
 *    without a weight function, copied into p_dst.
 */
static void mc_luma_lsx( uint8_t *p_dst, intptr_t i_dst_stride,
                         uint8_t *p_src[4], intptr_t i_src_stride,
                         int32_t m_vx, int32_t m_vy,
                         int32_t i_width, int32_t i_height,
                         const x264_weight_t *pWeight )
{
    const int32_t frac_x  = m_vx & 3;
    const int32_t frac_y  = m_vy & 3;
    const int32_t qpel    = ( frac_y << 2 ) + frac_x;
    const int32_t tab_idx = i_width >> 2;
    /* Integer-sample part of the displacement. */
    const int32_t base    = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
    uint8_t *p_ref = p_src[x264_hpel_ref0[qpel]] + base;

    /* A vertical fraction of 3 reads from one row further down. */
    if( frac_y == 3 )
        p_ref += i_src_stride;

    /* Bits 0 and 2 of qpel are the low bits of frac_x and frac_y. */
    if( qpel & 5 )
    {
        /* A horizontal fraction of 3 reads from one column further right. */
        uint8_t *p_ref2 = p_src[x264_hpel_ref1[qpel]] + base + ( frac_x == 3 );

        pixel_avg_wtab_lsx[tab_idx]( p_dst, i_dst_stride, p_ref, i_src_stride,
                                     p_ref2, i_height );
        if( pWeight->weightfn )
            pWeight->weightfn[tab_idx]( p_dst, i_dst_stride, p_dst, i_dst_stride,
                                        pWeight, i_height );
        return;
    }

    if( pWeight->weightfn )
    {
        pWeight->weightfn[tab_idx]( p_dst, i_dst_stride, p_ref, i_src_stride,
                                    pWeight, i_height );
        return;
    }

    mc_copy_wtab_lsx[tab_idx]( p_dst, i_dst_stride, p_ref, i_src_stride, i_height );
}
/* Helper instantiations from macros defined outside this file.  Presumably
 * these generate plane_copy_interleave_lsx and
 * plane_copy_deinterleave_yuyv_lsx, which x264_mc_init_loongarch installs -
 * confirm against the macro definitions. */
PLANE_INTERLEAVE(lsx)
PLANE_COPY_YUYV(32, lsx)
/* Prototype only: the LSX chroma motion-compensation routine has no C body
 * in this file. */
#define x264_mc_chroma_lsx x264_template(mc_chroma_lsx)
void x264_mc_chroma_lsx( uint8_t *p_dst_u, uint8_t *p_dst_v,
intptr_t i_dst_stride,
uint8_t *p_src, intptr_t i_src_stride,
int32_t m_vx, int32_t m_vy,
int32_t i_width, int32_t i_height );
/*
 * Return a pointer to the reference block for motion vector (m_vx, m_vy),
 * given in quarter-sample units (low two bits = fraction).
 *
 * p_src[] holds the candidate planes; x264_hpel_ref0/1 choose which of them
 * to read for a given fractional position.
 *
 *  - If either fraction is odd, two planes are averaged into p_dst (and
 *    weighted in place when pWeight->weightfn is set); p_dst is returned.
 *  - Otherwise, with a weight function, the weighted block is written to
 *    p_dst and p_dst is returned.
 *  - Otherwise nothing is copied: *p_dst_stride is set to i_src_stride and
 *    a pointer into the source plane is returned.
 */
static uint8_t *get_ref_lasx( uint8_t *p_dst, intptr_t *p_dst_stride,
                              uint8_t *p_src[4], intptr_t i_src_stride,
                              int32_t m_vx, int32_t m_vy,
                              int32_t i_width, int32_t i_height,
                              const x264_weight_t *pWeight )
{
    const int32_t frac_x  = m_vx & 3;
    const int32_t frac_y  = m_vy & 3;
    const int32_t qpel    = ( frac_y << 2 ) + frac_x;
    const int32_t tab_idx = i_width >> 2;
    /* Integer-sample part of the displacement. */
    const int32_t base    = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
    uint8_t *p_ref = p_src[x264_hpel_ref0[qpel]] + base;

    /* A vertical fraction of 3 reads from one row further down. */
    if( frac_y == 3 )
        p_ref += i_src_stride;

    /* Bits 0 and 2 of qpel are the low bits of frac_x and frac_y. */
    if( qpel & 5 )
    {
        /* A horizontal fraction of 3 reads from one column further right. */
        uint8_t *p_ref2 = p_src[x264_hpel_ref1[qpel]] + base + ( frac_x == 3 );

        pixel_avg_wtab_lasx[tab_idx]( p_dst, *p_dst_stride, p_ref, i_src_stride,
                                      p_ref2, i_height );
        if( pWeight->weightfn )
            pWeight->weightfn[tab_idx]( p_dst, *p_dst_stride, p_dst, *p_dst_stride,
                                        pWeight, i_height );
        return p_dst;
    }

    if( pWeight->weightfn )
    {
        pWeight->weightfn[tab_idx]( p_dst, *p_dst_stride, p_ref, i_src_stride,
                                    pWeight, i_height );
        return p_dst;
    }

    *p_dst_stride = i_src_stride;
    return p_ref;
}
/*
 * Write the motion-compensated block for vector (m_vx, m_vy), given in
 * quarter-sample units (low two bits = fraction), into p_dst.
 *
 *  - If either fraction is odd, two planes selected through
 *    x264_hpel_ref0/1 are averaged into p_dst, then weighted in place when
 *    pWeight->weightfn is set.
 *  - Otherwise the single source plane is either weighted into p_dst or,
 *    without a weight function, copied into p_dst.
 */
static void mc_luma_lasx( uint8_t *p_dst, intptr_t i_dst_stride,
                          uint8_t *p_src[4], intptr_t i_src_stride,
                          int32_t m_vx, int32_t m_vy,
                          int32_t i_width, int32_t i_height,
                          const x264_weight_t *pWeight )
{
    const int32_t frac_x  = m_vx & 3;
    const int32_t frac_y  = m_vy & 3;
    const int32_t qpel    = ( frac_y << 2 ) + frac_x;
    const int32_t tab_idx = i_width >> 2;
    /* Integer-sample part of the displacement. */
    const int32_t base    = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
    uint8_t *p_ref = p_src[x264_hpel_ref0[qpel]] + base;

    /* A vertical fraction of 3 reads from one row further down. */
    if( frac_y == 3 )
        p_ref += i_src_stride;

    /* Bits 0 and 2 of qpel are the low bits of frac_x and frac_y. */
    if( qpel & 5 )
    {
        /* A horizontal fraction of 3 reads from one column further right. */
        uint8_t *p_ref2 = p_src[x264_hpel_ref1[qpel]] + base + ( frac_x == 3 );

        pixel_avg_wtab_lasx[tab_idx]( p_dst, i_dst_stride, p_ref, i_src_stride,
                                      p_ref2, i_height );
        if( pWeight->weightfn )
            pWeight->weightfn[tab_idx]( p_dst, i_dst_stride, p_dst, i_dst_stride,
                                        pWeight, i_height );
        return;
    }

    if( pWeight->weightfn )
    {
        pWeight->weightfn[tab_idx]( p_dst, i_dst_stride, p_ref, i_src_stride,
                                    pWeight, i_height );
        return;
    }

    mc_copy_wtab_lasx[tab_idx]( p_dst, i_dst_stride, p_ref, i_src_stride, i_height );
}
/* Helper instantiation from a macro defined outside this file.  Presumably
 * it generates plane_copy_deinterleave_yuyv_lasx, which
 * x264_mc_init_loongarch installs - confirm against the macro definition. */
PLANE_COPY_YUYV(64, lasx)
/* Prototype only: the LASX chroma motion-compensation routine has no C body
 * in this file. */
#define x264_mc_chroma_lasx x264_template(mc_chroma_lasx)
void x264_mc_chroma_lasx( uint8_t *p_dst_u, uint8_t *p_dst_v,
intptr_t i_dst_stride,
uint8_t *p_src, intptr_t i_src_stride,
int32_t m_vx, int32_t m_vy,
int32_t i_width, int32_t i_height );
#endif // !HIGH_BIT_DEPTH
/*
 * Install the LoongArch motion-compensation routines into `pf` according to
 * the capability bits in `cpu`.
 *
 * The LSX block runs first and the LASX block second, so when both bits are
 * set every entry the LASX block assigns replaces the LSX one, while entries
 * it does not assign (e.g. avg[PIXEL_16x16], memcpy_aligned, the prefetch
 * and chroma (de)interleave hooks) keep their LSX value.
 * In HIGH_BIT_DEPTH builds the function leaves `pf` untouched.
 */
void x264_mc_init_loongarch( int32_t cpu, x264_mc_functions_t *pf )
{
#if !HIGH_BIT_DEPTH
    if( cpu & X264_CPU_LSX )
    {
        /* Luma / chroma prediction. */
        pf->mc_luma   = mc_luma_lsx;
        pf->mc_chroma = x264_mc_chroma_lsx;
        pf->get_ref   = get_ref_lsx;

        /* Bi-prediction averaging, one entry per partition size. */
        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_lsx;
        pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_lsx;
        pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_lsx;
        pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_lsx;
        pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_lsx;
        pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_lsx;
        pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_lsx;
        pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_lsx;
        pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_lsx;

        /* Weighted prediction: all three hooks share one table. */
        pf->weight       = mc_weight_wtab_lsx;
        pf->offsetadd    = mc_weight_wtab_lsx;
        pf->offsetsub    = mc_weight_wtab_lsx;
        pf->weight_cache = weight_cache_lsx;

        /* Block copies. */
        pf->copy[PIXEL_16x16]    = x264_mc_copy_w16_lsx;
        pf->copy[PIXEL_8x8]      = x264_mc_copy_w8_lsx;
        pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_lsx;
        pf->copy_16x16_unaligned = x264_mc_copy_w16_lsx;

        /* Chroma (de)interleaving and whole-plane copies. */
        pf->store_interleave_chroma       = x264_store_interleave_chroma_lsx;
        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_lsx;
        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_lsx;
        pf->plane_copy_interleave         = plane_copy_interleave_lsx;
        pf->plane_copy_deinterleave       = x264_plane_copy_deinterleave_lsx;
        pf->plane_copy_deinterleave_yuyv  = plane_copy_deinterleave_yuyv_lsx;

        /* Filters, memory helpers and prefetch. */
        pf->hpel_filter            = x264_hpel_filter_lsx;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_lsx;
        pf->memcpy_aligned         = x264_memcpy_aligned_lsx;
        pf->memzero_aligned        = x264_memzero_aligned_lsx;
        pf->prefetch_fenc_420      = x264_prefetch_fenc_420_lsx;
        pf->prefetch_fenc_422      = x264_prefetch_fenc_422_lsx;
        pf->prefetch_ref           = x264_prefetch_ref_lsx;
    }

    if( cpu & X264_CPU_LASX )
    {
        /* Luma / chroma prediction. */
        pf->mc_luma   = mc_luma_lasx;
        pf->mc_chroma = x264_mc_chroma_lasx;
        pf->get_ref   = get_ref_lasx;

        /* Bi-prediction averaging (no LASX entry for PIXEL_16x16). */
        pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_lasx;
        pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_lasx;
        pf->avg[PIXEL_8x8]  = x264_pixel_avg_8x8_lasx;
        pf->avg[PIXEL_8x4]  = x264_pixel_avg_8x4_lasx;
        pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_lasx;
        pf->avg[PIXEL_4x8]  = x264_pixel_avg_4x8_lasx;
        pf->avg[PIXEL_4x4]  = x264_pixel_avg_4x4_lasx;
        pf->avg[PIXEL_4x2]  = x264_pixel_avg_4x2_lasx;

        /* Weighted prediction: all three hooks share one table. */
        pf->weight       = mc_weight_wtab_lasx;
        pf->offsetadd    = mc_weight_wtab_lasx;
        pf->offsetsub    = mc_weight_wtab_lasx;
        pf->weight_cache = weight_cache_lasx;

        /* Block copies. */
        pf->copy[PIXEL_16x16]    = x264_mc_copy_w16_lasx;
        pf->copy[PIXEL_8x8]      = x264_mc_copy_w8_lasx;
        pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_lasx;
        pf->copy_16x16_unaligned = x264_mc_copy_w16_lasx;

        /* Whole-plane deinterleave. */
        pf->plane_copy_deinterleave      = x264_plane_copy_deinterleave_lasx;
        pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_lasx;

        /* Filters and memory helpers. */
        pf->hpel_filter            = x264_hpel_filter_lasx;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_lasx;
        pf->memzero_aligned        = x264_memzero_aligned_lasx;
    }
#endif // !HIGH_BIT_DEPTH
}

196
common/loongarch/mc.h Normal file
View File

@@ -0,0 +1,196 @@
/*****************************************************************************
* mc.h: loongarch motion compensation
*****************************************************************************
* Copyright (C) 2023-2025 x264 project
*
* Authors: Xiwei Gu <guxiwei-hf@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_LOONGARCH_MC_H
#define X264_LOONGARCH_MC_H
/* Every prototype in this header is preceded by a #define that routes its
 * name through x264_template(), which is defined elsewhere (presumably it
 * adds the bit-depth prefix used by the exported symbols - confirm). */
/* Entry point that installs the LoongArch routines into the function table. */
#define x264_mc_init_loongarch x264_template(mc_init_loongarch)
void x264_mc_init_loongarch( int cpu, x264_mc_functions_t *pf );
/* LSX fixed-size averaging kernels: three (pointer, stride) pairs followed
 * by an int. */
#define x264_pixel_avg_16x16_lsx x264_template(pixel_avg_16x16_lsx)
void x264_pixel_avg_16x16_lsx( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_16x8_lsx x264_template(pixel_avg_16x8_lsx)
void x264_pixel_avg_16x8_lsx( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_8x16_lsx x264_template(pixel_avg_8x16_lsx)
void x264_pixel_avg_8x16_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x8_lsx x264_template(pixel_avg_8x8_lsx)
void x264_pixel_avg_8x8_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x4_lsx x264_template(pixel_avg_8x4_lsx)
void x264_pixel_avg_8x4_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x16_lsx x264_template(pixel_avg_4x16_lsx)
void x264_pixel_avg_4x16_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x8_lsx x264_template(pixel_avg_4x8_lsx)
void x264_pixel_avg_4x8_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x4_lsx x264_template(pixel_avg_4x4_lsx)
void x264_pixel_avg_4x4_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x2_lsx x264_template(pixel_avg_4x2_lsx)
void x264_pixel_avg_4x2_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
/* LSX two-source averaging kernels per width: two (pointer, stride) pairs,
 * a third pointer without its own stride, and an int. */
#define x264_pixel_avg2_w4_lsx x264_template(pixel_avg2_w4_lsx)
void x264_pixel_avg2_w4_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w8_lsx x264_template(pixel_avg2_w8_lsx)
void x264_pixel_avg2_w8_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w16_lsx x264_template(pixel_avg2_w16_lsx)
void x264_pixel_avg2_w16_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w20_lsx x264_template(pixel_avg2_w20_lsx)
void x264_pixel_avg2_w20_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
/* LSX weight kernels per width, each in a regular and a "_noden" variant:
 * two (pointer, stride) pairs, the weight descriptor, and an int. */
#define x264_mc_weight_w20_lsx x264_template(mc_weight_w20_lsx)
void x264_mc_weight_w20_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w20_noden_lsx x264_template(mc_weight_w20_noden_lsx)
void x264_mc_weight_w20_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w16_lsx x264_template(mc_weight_w16_lsx)
void x264_mc_weight_w16_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w16_noden_lsx x264_template(mc_weight_w16_noden_lsx)
void x264_mc_weight_w16_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w8_lsx x264_template(mc_weight_w8_lsx)
void x264_mc_weight_w8_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w8_noden_lsx x264_template(mc_weight_w8_noden_lsx)
void x264_mc_weight_w8_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w4_lsx x264_template(mc_weight_w4_lsx)
void x264_mc_weight_w4_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w4_noden_lsx x264_template(mc_weight_w4_noden_lsx)
void x264_mc_weight_w4_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
/* LSX block-copy kernels per width: two (pointer, stride) pairs and an int. */
#define x264_mc_copy_w16_lsx x264_template(mc_copy_w16_lsx)
void x264_mc_copy_w16_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_copy_w8_lsx x264_template(mc_copy_w8_lsx)
void x264_mc_copy_w8_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_copy_w4_lsx x264_template(mc_copy_w4_lsx)
void x264_mc_copy_w4_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
/* LSX chroma helpers: one routine takes separate srcu/srcv pointers and a
 * single dst; the two load_deinterleave routines take a single src and dst. */
#define x264_store_interleave_chroma_lsx x264_template(store_interleave_chroma_lsx)
void x264_store_interleave_chroma_lsx( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
#define x264_load_deinterleave_chroma_fenc_lsx x264_template(load_deinterleave_chroma_fenc_lsx)
void x264_load_deinterleave_chroma_fenc_lsx( pixel *dst, pixel *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fdec_lsx x264_template(load_deinterleave_chroma_fdec_lsx)
void x264_load_deinterleave_chroma_fdec_lsx( pixel *dst, pixel *src, intptr_t i_src, int height );
/* LSX whole-plane interleave core (srcu + srcv -> dst) over a w x h area. */
#define x264_plane_copy_interleave_core_lsx x264_template(plane_copy_interleave_core_lsx)
void x264_plane_copy_interleave_core_lsx( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
/* LSX whole-plane deinterleave (src -> dstu + dstv) over a w x h area. */
#define x264_plane_copy_deinterleave_lsx x264_template(plane_copy_deinterleave_lsx)
void x264_plane_copy_deinterleave_lsx( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
/* NOTE(review): the LASX variant below is declared a second time, with an
 * identical macro and prototype, in the LASX section near the end of this
 * header.  The repetition is harmless but redundant. */
#define x264_plane_copy_deinterleave_lasx x264_template(plane_copy_deinterleave_lasx)
void x264_plane_copy_deinterleave_lasx( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
/* LSX prefetch hooks (luma + chroma pointers for the two fenc variants, a
 * single pointer for the ref variant). */
#define x264_prefetch_fenc_420_lsx x264_template(prefetch_fenc_420_lsx)
void x264_prefetch_fenc_420_lsx( uint8_t *pix_y, intptr_t stride_y,
uint8_t *pix_uv, intptr_t stride_uv,
int32_t mb_x );
#define x264_prefetch_fenc_422_lsx x264_template(prefetch_fenc_422_lsx)
void x264_prefetch_fenc_422_lsx( uint8_t *pix_y, intptr_t stride_y,
uint8_t *pix_uv, intptr_t stride_uv,
int32_t mb_x );
#define x264_prefetch_ref_lsx x264_template(prefetch_ref_lsx)
void x264_prefetch_ref_lsx( uint8_t *pix, intptr_t stride, int32_t parity );
/* LSX memory helpers; the "_aligned" suffix suggests the buffers must meet
 * an alignment requirement - confirm against the implementation. */
#define x264_memcpy_aligned_lsx x264_template(memcpy_aligned_lsx)
void *x264_memcpy_aligned_lsx( void *dst, const void *src, size_t n );
#define x264_memzero_aligned_lsx x264_template(memzero_aligned_lsx)
void x264_memzero_aligned_lsx( void *p_dst, size_t n );
/* LSX half-pel filter and low-resolution frame initialisation. */
#define x264_hpel_filter_lsx x264_template(hpel_filter_lsx)
void x264_hpel_filter_lsx( pixel *, pixel *, pixel *, pixel *, intptr_t, int, int, int16_t * );
#define x264_frame_init_lowres_core_lsx x264_template(frame_init_lowres_core_lsx)
void x264_frame_init_lowres_core_lsx( uint8_t *, uint8_t *, uint8_t *, uint8_t *,
uint8_t *, intptr_t, intptr_t, int, int );
/* LASX fixed-size averaging kernels: three (pointer, stride) pairs followed
 * by an int.  No 16x16 LASX variant is declared here. */
#define x264_pixel_avg_16x8_lasx x264_template(pixel_avg_16x8_lasx)
void x264_pixel_avg_16x8_lasx( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_pixel_avg_8x16_lasx x264_template(pixel_avg_8x16_lasx)
void x264_pixel_avg_8x16_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x8_lasx x264_template(pixel_avg_8x8_lasx)
void x264_pixel_avg_8x8_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x4_lasx x264_template(pixel_avg_8x4_lasx)
void x264_pixel_avg_8x4_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x16_lasx x264_template(pixel_avg_4x16_lasx)
void x264_pixel_avg_4x16_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x8_lasx x264_template(pixel_avg_4x8_lasx)
void x264_pixel_avg_4x8_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x4_lasx x264_template(pixel_avg_4x4_lasx)
void x264_pixel_avg_4x4_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x2_lasx x264_template(pixel_avg_4x2_lasx)
void x264_pixel_avg_4x2_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
/* LASX two-source averaging kernels per width: two (pointer, stride) pairs,
 * a third pointer without its own stride, and an int. */
#define x264_pixel_avg2_w4_lasx x264_template(pixel_avg2_w4_lasx)
void x264_pixel_avg2_w4_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w8_lasx x264_template(pixel_avg2_w8_lasx)
void x264_pixel_avg2_w8_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w16_lasx x264_template(pixel_avg2_w16_lasx)
void x264_pixel_avg2_w16_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w20_lasx x264_template(pixel_avg2_w20_lasx)
void x264_pixel_avg2_w20_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
/* LASX weight kernels per width, each in a regular and a "_noden" variant:
 * two (pointer, stride) pairs, the weight descriptor, and an int. */
#define x264_mc_weight_w20_lasx x264_template(mc_weight_w20_lasx)
void x264_mc_weight_w20_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w20_noden_lasx x264_template(mc_weight_w20_noden_lasx)
void x264_mc_weight_w20_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w16_lasx x264_template(mc_weight_w16_lasx)
void x264_mc_weight_w16_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w16_noden_lasx x264_template(mc_weight_w16_noden_lasx)
void x264_mc_weight_w16_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w8_lasx x264_template(mc_weight_w8_lasx)
void x264_mc_weight_w8_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w8_noden_lasx x264_template(mc_weight_w8_noden_lasx)
void x264_mc_weight_w8_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w4_lasx x264_template(mc_weight_w4_lasx)
void x264_mc_weight_w4_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_weight_w4_noden_lasx x264_template(mc_weight_w4_noden_lasx)
void x264_mc_weight_w4_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
/* LASX block-copy kernels per width: two (pointer, stride) pairs and an int. */
#define x264_mc_copy_w16_lasx x264_template(mc_copy_w16_lasx)
void x264_mc_copy_w16_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_copy_w8_lasx x264_template(mc_copy_w8_lasx)
void x264_mc_copy_w8_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_copy_w4_lasx x264_template(mc_copy_w4_lasx)
void x264_mc_copy_w4_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
/* LASX whole-plane interleave core (srcu + srcv -> dst) over a w x h area. */
#define x264_plane_copy_interleave_core_lasx x264_template(plane_copy_interleave_core_lasx)
void x264_plane_copy_interleave_core_lasx( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
/* LASX whole-plane deinterleave (src -> dstu + dstv) over a w x h area. */
#define x264_plane_copy_deinterleave_lasx x264_template(plane_copy_deinterleave_lasx)
void x264_plane_copy_deinterleave_lasx( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
/* LASX memory helper; the "_aligned" suffix suggests the buffer must meet an
 * alignment requirement - confirm against the implementation. */
#define x264_memzero_aligned_lasx x264_template(memzero_aligned_lasx)
void x264_memzero_aligned_lasx( void *p_dst, size_t n );
/* LASX half-pel filter and low-resolution frame initialisation. */
#define x264_hpel_filter_lasx x264_template(hpel_filter_lasx)
void x264_hpel_filter_lasx( pixel *, pixel *, pixel *, pixel *, intptr_t, int, int, int16_t * );
#define x264_frame_init_lowres_core_lasx x264_template(frame_init_lowres_core_lasx)
void x264_frame_init_lowres_core_lasx( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *,
intptr_t, intptr_t, int, int );
#endif

3548
common/loongarch/pixel-a.S Normal file

File diff suppressed because it is too large Load Diff

259
common/loongarch/pixel-c.c Normal file
View File

@@ -0,0 +1,259 @@
/*****************************************************************************
* pixel-c.c: loongarch pixel metrics
*****************************************************************************
* Copyright (C) 2023-2025 x264 project
*
* Authors: Hecai Yuan <yuanhecai@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "pixel.h"
#include "predict.h"
#if !HIGH_BIT_DEPTH
/* Run the LSX 8x8 hadamard_ac kernel on one block and rescale its packed
 * 64-bit result: the upper 32-bit half is divided by 4 and the lower
 * 32-bit half by 2, each staying in its own half. */
uint64_t x264_pixel_hadamard_ac_8x8_lsx( uint8_t *p_pix, intptr_t i_stride )
{
    const uint64_t u_packed = x264_hadamard_ac_8x8_lsx( p_pix, i_stride );
    const uint64_t u_upper  = ( u_packed >> 34 ) << 32;
    const uint32_t u_lower  = (uint32_t)u_packed >> 1;

    return u_upper + u_lower;
}
/* Sum the LSX 8x8 hadamard_ac kernel over the two vertically stacked 8x8
 * blocks of an 8x16 area, then rescale the packed result: upper 32-bit
 * half divided by 4, lower 32-bit half divided by 2. */
uint64_t x264_pixel_hadamard_ac_8x16_lsx( uint8_t *p_pix, intptr_t i_stride )
{
    uint64_t u_packed = 0;

    for( int row = 0; row < 2; row++ )
        u_packed += x264_hadamard_ac_8x8_lsx( p_pix + row * 8 * i_stride, i_stride );

    const uint64_t u_upper = ( u_packed >> 34 ) << 32;
    const uint32_t u_lower = (uint32_t)u_packed >> 1;

    return u_upper + u_lower;
}
/* Sum the LSX 8x8 hadamard_ac kernel over the two side-by-side 8x8 blocks
 * of a 16x8 area, then rescale the packed result: upper 32-bit half
 * divided by 4, lower 32-bit half divided by 2. */
uint64_t x264_pixel_hadamard_ac_16x8_lsx( uint8_t *p_pix, intptr_t i_stride )
{
    uint64_t u_packed = 0;

    for( int col = 0; col < 2; col++ )
        u_packed += x264_hadamard_ac_8x8_lsx( p_pix + col * 8, i_stride );

    const uint64_t u_upper = ( u_packed >> 34 ) << 32;
    const uint32_t u_lower = (uint32_t)u_packed >> 1;

    return u_upper + u_lower;
}
/* Sum the LSX 8x8 hadamard_ac kernel over the four 8x8 blocks of a 16x16
 * area (top-left, top-right, bottom-left, bottom-right), then rescale the
 * packed result: upper 32-bit half divided by 4, lower half divided by 2. */
uint64_t x264_pixel_hadamard_ac_16x16_lsx( uint8_t *p_pix, intptr_t i_stride )
{
    uint64_t u_packed = 0;

    for( int row = 0; row < 2; row++ )
        for( int col = 0; col < 2; col++ )
            u_packed += x264_hadamard_ac_8x8_lsx( p_pix + row * 8 * i_stride + col * 8,
                                                  i_stride );

    const uint64_t u_upper = ( u_packed >> 34 ) << 32;
    const uint32_t u_lower = (uint32_t)u_packed >> 1;

    return u_upper + u_lower;
}
/* Run the LASX 8x8 hadamard_ac kernel on one block and rescale its packed
 * 64-bit result: the upper 32-bit half is divided by 4 and the lower
 * 32-bit half by 2, each staying in its own half. */
uint64_t x264_pixel_hadamard_ac_8x8_lasx( uint8_t *p_pix, intptr_t i_stride )
{
    const uint64_t u_packed = x264_hadamard_ac_8x8_lasx( p_pix, i_stride );
    const uint64_t u_upper  = ( u_packed >> 34 ) << 32;
    const uint32_t u_lower  = (uint32_t)u_packed >> 1;

    return u_upper + u_lower;
}
/* Sum the LASX 8x8 hadamard_ac kernel over the two vertically stacked 8x8
 * blocks of an 8x16 area, then rescale the packed result: upper 32-bit
 * half divided by 4, lower 32-bit half divided by 2. */
uint64_t x264_pixel_hadamard_ac_8x16_lasx( uint8_t *p_pix, intptr_t i_stride )
{
    /* Byte offset of the lower block: eight rows down. */
    const intptr_t i_lower = i_stride << 3;
    uint64_t u_packed;

    u_packed  = x264_hadamard_ac_8x8_lasx( p_pix, i_stride );
    u_packed += x264_hadamard_ac_8x8_lasx( p_pix + i_lower, i_stride );

    const uint64_t u_upper = ( u_packed >> 34 ) << 32;
    const uint32_t u_lower = (uint32_t)u_packed >> 1;

    return u_upper + u_lower;
}
/* Score three 8x8 intra predictors against p_enc with the sa8d metric.
 * Each predictor is built from p_edge into a local scratch buffer, and the
 * results are stored in the order V, H, DC. */
void x264_intra_sa8d_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36],
                                 int32_t p_sad_array[3] )
{
    /* Result slots, in evaluation order. */
    enum { IDX_V, IDX_H, IDX_DC };
    /* Scratch prediction block: 8 rows of FDEC_STRIDE bytes. */
    ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] );

    x264_predict_8x8_v_lsx( pix, p_edge );
    p_sad_array[IDX_V] = x264_pixel_sa8d_8x8_lsx( pix, FDEC_STRIDE, p_enc, FENC_STRIDE );

    x264_predict_8x8_h_lsx( pix, p_edge );
    p_sad_array[IDX_H] = x264_pixel_sa8d_8x8_lsx( pix, FDEC_STRIDE, p_enc, FENC_STRIDE );

    x264_predict_8x8_dc_lsx( pix, p_edge );
    p_sad_array[IDX_DC] = x264_pixel_sa8d_8x8_lsx( pix, FDEC_STRIDE, p_enc, FENC_STRIDE );
}
/* Score three 8x8 intra predictors against p_enc with the LASX sa8d metric.
 * Each predictor is built from p_edge into a local scratch buffer, and the
 * results are stored in the order V, H, DC.  The V and DC predictors use
 * the LSX routines; only the H predictor uses a LASX routine. */
void x264_intra_sa8d_x3_8x8_lasx( uint8_t *p_enc, uint8_t p_edge[36],
                                  int32_t p_sad_array[3] )
{
    /* Result slots, in evaluation order. */
    enum { IDX_V, IDX_H, IDX_DC };
    /* Scratch prediction block: 8 rows of FDEC_STRIDE bytes. */
    ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] );

    x264_predict_8x8_v_lsx( pix, p_edge );
    p_sad_array[IDX_V] = x264_pixel_sa8d_8x8_lasx( pix, FDEC_STRIDE, p_enc, FENC_STRIDE );

    x264_predict_8x8_h_lasx( pix, p_edge );
    p_sad_array[IDX_H] = x264_pixel_sa8d_8x8_lasx( pix, FDEC_STRIDE, p_enc, FENC_STRIDE );

    x264_predict_8x8_dc_lsx( pix, p_edge );
    p_sad_array[IDX_DC] = x264_pixel_sa8d_8x8_lasx( pix, FDEC_STRIDE, p_enc, FENC_STRIDE );
}
/* Score three 4x4 intra predictors against p_enc with the satd metric.
 * Each predictor is written into p_dec (which is overwritten), and the
 * results are stored in the order V, H, DC. */
void x264_intra_satd_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec,
                                 int32_t p_sad_array[3] )
{
    /* Result slots, in evaluation order. */
    enum { IDX_V, IDX_H, IDX_DC };

    x264_predict_4x4_v_lsx( p_dec );
    p_sad_array[IDX_V] = x264_pixel_satd_4x4_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );

    x264_predict_4x4_h_lsx( p_dec );
    p_sad_array[IDX_H] = x264_pixel_satd_4x4_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );

    x264_predict_4x4_dc_lsx( p_dec );
    p_sad_array[IDX_DC] = x264_pixel_satd_4x4_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
}
/* Score three 16x16 intra predictors against p_enc with the satd metric.
 * Each predictor is written into p_dec (which is overwritten), and the
 * results are stored in the order V, H, DC. */
void x264_intra_satd_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec,
                                   int32_t p_sad_array[3] )
{
    /* Result slots, in evaluation order. */
    enum { IDX_V, IDX_H, IDX_DC };

    x264_predict_16x16_v_lsx( p_dec );
    p_sad_array[IDX_V] = x264_pixel_satd_16x16_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );

    x264_predict_16x16_h_lsx( p_dec );
    p_sad_array[IDX_H] = x264_pixel_satd_16x16_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );

    x264_predict_16x16_dc_lsx( p_dec );
    p_sad_array[IDX_DC] = x264_pixel_satd_16x16_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
}
/* Score three 16x16 intra predictors against p_enc with the LASX satd
 * metric.  Each predictor is written into p_dec (which is overwritten) by
 * the LSX predict routines, and the results are stored in the order
 * V, H, DC. */
void x264_intra_satd_x3_16x16_lasx( uint8_t *p_enc, uint8_t *p_dec,
                                    int32_t p_sad_array[3] )
{
    /* Result slots, in evaluation order. */
    enum { IDX_V, IDX_H, IDX_DC };

    x264_predict_16x16_v_lsx( p_dec );
    p_sad_array[IDX_V] = x264_pixel_satd_16x16_lasx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );

    x264_predict_16x16_h_lsx( p_dec );
    p_sad_array[IDX_H] = x264_pixel_satd_16x16_lasx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );

    x264_predict_16x16_dc_lsx( p_dec );
    p_sad_array[IDX_DC] = x264_pixel_satd_16x16_lasx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
}
/* Score three 8x8c intra predictors against p_enc with the 8x8 satd metric.
 * Each predictor is written into p_dec (which is overwritten).  Unlike the
 * other x3 helpers in this file, the results are stored in the order
 * DC, H, V. */
void x264_intra_satd_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec,
                                  int32_t p_sad_array[3] )
{
    /* Result slots, in evaluation order. */
    enum { IDX_DC, IDX_H, IDX_V };

    x264_predict_8x8c_dc_lsx( p_dec );
    p_sad_array[IDX_DC] = x264_pixel_satd_8x8_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );

    x264_predict_8x8c_h_lsx( p_dec );
    p_sad_array[IDX_H] = x264_pixel_satd_8x8_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );

    x264_predict_8x8c_v_lsx( p_dec );
    p_sad_array[IDX_V] = x264_pixel_satd_8x8_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
}
/* Score three 4x4 intra predictors against p_enc with the sad metric.
 * Each predictor is written into p_dec (which is overwritten), and the
 * results are stored in the order V, H, DC. */
void x264_intra_sad_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec,
                                int32_t p_sad_array[3] )
{
    /* Result slots, in evaluation order. */
    enum { IDX_V, IDX_H, IDX_DC };

    x264_predict_4x4_v_lsx( p_dec );
    p_sad_array[IDX_V] = x264_pixel_sad_4x4_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );

    x264_predict_4x4_h_lsx( p_dec );
    p_sad_array[IDX_H] = x264_pixel_sad_4x4_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );

    x264_predict_4x4_dc_lsx( p_dec );
    p_sad_array[IDX_DC] = x264_pixel_sad_4x4_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
}
/* Score three 16x16 intra predictions with the LSX 16x16 SAD kernel.
 *
 * The LSX V, H and DC predictors are invoked on p_dec in that order; the SAD
 * of p_dec (FDEC_STRIDE) against p_enc (FENC_STRIDE) measured after each call
 * is written to p_sad_array[0], [1] and [2]. */
void x264_intra_sad_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec,
                                  int32_t p_sad_array[3] )
{
    int32_t *p_cost = p_sad_array;  /* walks the three output slots */

    x264_predict_16x16_v_lsx( p_dec );
    *p_cost++ = x264_pixel_sad_16x16_lsx( p_dec, FDEC_STRIDE,
                                          p_enc, FENC_STRIDE );

    x264_predict_16x16_h_lsx( p_dec );
    *p_cost++ = x264_pixel_sad_16x16_lsx( p_dec, FDEC_STRIDE,
                                          p_enc, FENC_STRIDE );

    x264_predict_16x16_dc_lsx( p_dec );
    *p_cost = x264_pixel_sad_16x16_lsx( p_dec, FDEC_STRIDE,
                                        p_enc, FENC_STRIDE );
}
/* Score three 8x8 intra predictions with the LSX 8x8 SAD kernel.
 *
 * This variant takes the edge array p_edge instead of a decode buffer: each
 * predictor is called with a local scratch array of 8 * FDEC_STRIDE bytes and
 * p_edge, and the scratch array is then compared against p_enc.
 *   p_sad_array[0] <- after x264_predict_8x8_v_lsx
 *   p_sad_array[1] <- after x264_predict_8x8_h_lsx
 *   p_sad_array[2] <- after x264_predict_8x8_dc_lsx */
void x264_intra_sad_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36],
                                int32_t p_sad_array[3] )
{
    ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] );
    /* The SAD kernel takes intptr_t strides; name them once up front. */
    const intptr_t i_pix_stride = FDEC_STRIDE;
    const intptr_t i_enc_stride = FENC_STRIDE;

    x264_predict_8x8_v_lsx( pix, p_edge );
    p_sad_array[0] = x264_pixel_sad_8x8_lsx( pix, i_pix_stride,
                                             p_enc, i_enc_stride );

    x264_predict_8x8_h_lsx( pix, p_edge );
    p_sad_array[1] = x264_pixel_sad_8x8_lsx( pix, i_pix_stride,
                                             p_enc, i_enc_stride );

    x264_predict_8x8_dc_lsx( pix, p_edge );
    p_sad_array[2] = x264_pixel_sad_8x8_lsx( pix, i_pix_stride,
                                             p_enc, i_enc_stride );
}
/* Score three 8x8 chroma intra predictions with the LSX 8x8 SAD kernel.
 *
 * The call order is DC, H, V (same as the SATD chroma variant in this file):
 *   p_sad_array[0] <- after x264_predict_8x8c_dc_lsx
 *   p_sad_array[1] <- after x264_predict_8x8c_h_lsx
 *   p_sad_array[2] <- after x264_predict_8x8c_v_lsx */
void x264_intra_sad_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec,
                                 int32_t p_sad_array[3] )
{
    /* Output slots, numbered in call order (0, 1, 2). */
    enum { X3_SLOT_DC, X3_SLOT_H, X3_SLOT_V };

    x264_predict_8x8c_dc_lsx( p_dec );
    p_sad_array[X3_SLOT_DC] =
        x264_pixel_sad_8x8_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );

    x264_predict_8x8c_h_lsx( p_dec );
    p_sad_array[X3_SLOT_H] =
        x264_pixel_sad_8x8_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );

    x264_predict_8x8c_v_lsx( p_dec );
    p_sad_array[X3_SLOT_V] =
        x264_pixel_sad_8x8_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
}
#endif

335
common/loongarch/pixel.h Normal file
View File

@@ -0,0 +1,335 @@
/*****************************************************************************
* pixel.h: loongarch pixel metrics
*****************************************************************************
* Copyright (C) 2023-2025 x264 project
*
* Authors: Lu Wang <wanglu@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_LOONGARCH_PIXEL_H
#define X264_LOONGARCH_PIXEL_H
/* pixel_satd_WxH prototypes.
 * LSX: 4x4, 4x8, 4x16, 8x4, 8x8, 8x16, 16x8, 16x16.
 * LASX: 4x8, 4x16, 8x4, 8x8, 8x16, 16x8, 16x16 (no 4x4 LASX prototype here).
 * Every function takes two pixel pointers, each with its own stride, and
 * returns int32_t.  Each name is first remapped through x264_template(),
 * which is defined outside this header. */
#define x264_pixel_satd_4x4_lsx x264_template(pixel_satd_4x4_lsx)
int32_t x264_pixel_satd_4x4_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_4x8_lsx x264_template(pixel_satd_4x8_lsx)
int32_t x264_pixel_satd_4x8_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_4x16_lsx x264_template(pixel_satd_4x16_lsx)
int32_t x264_pixel_satd_4x16_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_8x4_lsx x264_template(pixel_satd_8x4_lsx)
int32_t x264_pixel_satd_8x4_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_8x8_lsx x264_template(pixel_satd_8x8_lsx)
int32_t x264_pixel_satd_8x8_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_8x16_lsx x264_template(pixel_satd_8x16_lsx)
int32_t x264_pixel_satd_8x16_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_16x8_lsx x264_template(pixel_satd_16x8_lsx)
int32_t x264_pixel_satd_16x8_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_16x16_lsx x264_template(pixel_satd_16x16_lsx)
int32_t x264_pixel_satd_16x16_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
/* LASX variants. */
#define x264_pixel_satd_4x8_lasx x264_template(pixel_satd_4x8_lasx)
int32_t x264_pixel_satd_4x8_lasx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_4x16_lasx x264_template(pixel_satd_4x16_lasx)
int32_t x264_pixel_satd_4x16_lasx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_8x4_lasx x264_template(pixel_satd_8x4_lasx)
int32_t x264_pixel_satd_8x4_lasx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_8x8_lasx x264_template(pixel_satd_8x8_lasx)
int32_t x264_pixel_satd_8x8_lasx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_8x16_lasx x264_template(pixel_satd_8x16_lasx)
int32_t x264_pixel_satd_8x16_lasx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_16x8_lasx x264_template(pixel_satd_16x8_lasx)
int32_t x264_pixel_satd_16x8_lasx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_16x16_lasx x264_template(pixel_satd_16x16_lasx)
int32_t x264_pixel_satd_16x16_lasx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
/* pixel_sad_x4_WxH prototypes.
 * LSX: 16x16, 16x8, 8x16, 8x8, 8x4, 4x8, 4x4.  LASX: 16x16, 16x8, 8x8, 8x4.
 * Every function takes p_src, four reference pointers that share a single
 * i_ref_stride, and an int32_t p_sad_array[4]; nothing is returned by value.
 * NOTE(review): p_sad_array is presumably the output (one entry per
 * reference) and p_src carries no stride argument -- confirm both against
 * the assembly implementation. */
#define x264_pixel_sad_x4_16x16_lsx x264_template(pixel_sad_x4_16x16_lsx)
void x264_pixel_sad_x4_16x16_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_16x8_lsx x264_template(pixel_sad_x4_16x8_lsx)
void x264_pixel_sad_x4_16x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_8x16_lsx x264_template(pixel_sad_x4_8x16_lsx)
void x264_pixel_sad_x4_8x16_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_8x8_lsx x264_template(pixel_sad_x4_8x8_lsx)
void x264_pixel_sad_x4_8x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_8x4_lsx x264_template(pixel_sad_x4_8x4_lsx)
void x264_pixel_sad_x4_8x4_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_4x8_lsx x264_template(pixel_sad_x4_4x8_lsx)
void x264_pixel_sad_x4_4x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
/* LASX variants. */
#define x264_pixel_sad_x4_16x16_lasx x264_template(pixel_sad_x4_16x16_lasx)
void x264_pixel_sad_x4_16x16_lasx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_16x8_lasx x264_template(pixel_sad_x4_16x8_lasx)
void x264_pixel_sad_x4_16x8_lasx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_8x8_lasx x264_template(pixel_sad_x4_8x8_lasx)
void x264_pixel_sad_x4_8x8_lasx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_8x4_lasx x264_template(pixel_sad_x4_8x4_lasx)
void x264_pixel_sad_x4_8x4_lasx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
/* The 4x4 LSX prototype sits after the LASX ones in this header. */
#define x264_pixel_sad_x4_4x4_lsx x264_template(pixel_sad_x4_4x4_lsx)
void x264_pixel_sad_x4_4x4_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
/* pixel_sad_x3_WxH prototypes.
 * LSX: 16x16, 16x8, 8x16, 8x8, 8x4, 4x4, 4x8.  LASX: 16x16, 16x8.
 * Same shape as the x4 family but with three reference pointers and an
 * int32_t p_sad_array[3] parameter. */
#define x264_pixel_sad_x3_16x16_lsx x264_template(pixel_sad_x3_16x16_lsx)
void x264_pixel_sad_x3_16x16_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_16x8_lsx x264_template(pixel_sad_x3_16x8_lsx)
void x264_pixel_sad_x3_16x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_8x16_lsx x264_template(pixel_sad_x3_8x16_lsx)
void x264_pixel_sad_x3_8x16_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_8x8_lsx x264_template(pixel_sad_x3_8x8_lsx)
void x264_pixel_sad_x3_8x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_8x4_lsx x264_template(pixel_sad_x3_8x4_lsx)
void x264_pixel_sad_x3_8x4_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_4x4_lsx x264_template(pixel_sad_x3_4x4_lsx)
void x264_pixel_sad_x3_4x4_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_4x8_lsx x264_template(pixel_sad_x3_4x8_lsx)
void x264_pixel_sad_x3_4x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
/* LASX variants. */
#define x264_pixel_sad_x3_16x16_lasx x264_template(pixel_sad_x3_16x16_lasx)
void x264_pixel_sad_x3_16x16_lasx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_16x8_lasx x264_template(pixel_sad_x3_16x8_lasx)
void x264_pixel_sad_x3_16x8_lasx( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
/* pixel_sad_WxH prototypes (single reference).
 * LSX: 16x16, 16x8, 8x16, 8x8, 8x4, 4x16, 4x8, 4x4.  LASX: 8x4 only.
 * Every function takes a source pointer and a reference pointer, each with
 * its own stride, and returns int32_t. */
#define x264_pixel_sad_16x16_lsx x264_template(pixel_sad_16x16_lsx)
int32_t x264_pixel_sad_16x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_16x8_lsx x264_template(pixel_sad_16x8_lsx)
int32_t x264_pixel_sad_16x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_8x16_lsx x264_template(pixel_sad_8x16_lsx)
int32_t x264_pixel_sad_8x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_8x8_lsx x264_template(pixel_sad_8x8_lsx)
int32_t x264_pixel_sad_8x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_8x4_lsx x264_template(pixel_sad_8x4_lsx)
int32_t x264_pixel_sad_8x4_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_4x16_lsx x264_template(pixel_sad_4x16_lsx)
int32_t x264_pixel_sad_4x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_4x8_lsx x264_template(pixel_sad_4x8_lsx)
int32_t x264_pixel_sad_4x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_4x4_lsx x264_template(pixel_sad_4x4_lsx)
int32_t x264_pixel_sad_4x4_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
/* LASX variant. */
#define x264_pixel_sad_8x4_lasx x264_template(pixel_sad_8x4_lasx)
int32_t x264_pixel_sad_8x4_lasx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
/* hadamard_ac prototypes, LSX then LASX.
 * For each extension there is one un-prefixed hadamard_ac_8x8 symbol plus
 * pixel_hadamard_ac_{8x8,8x16,16x8,16x16}.  All take a single pixel pointer
 * with its stride and return uint64_t.
 * NOTE(review): the un-prefixed 8x8 symbol is presumably a helper shared by
 * the pixel_ entry points -- confirm against the assembly. */
#define x264_hadamard_ac_8x8_lsx x264_template(hadamard_ac_8x8_lsx)
uint64_t x264_hadamard_ac_8x8_lsx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_hadamard_ac_8x8_lsx x264_template(pixel_hadamard_ac_8x8_lsx)
uint64_t x264_pixel_hadamard_ac_8x8_lsx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_hadamard_ac_8x16_lsx x264_template(pixel_hadamard_ac_8x16_lsx)
uint64_t x264_pixel_hadamard_ac_8x16_lsx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_hadamard_ac_16x8_lsx x264_template(pixel_hadamard_ac_16x8_lsx)
uint64_t x264_pixel_hadamard_ac_16x8_lsx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_hadamard_ac_16x16_lsx x264_template(pixel_hadamard_ac_16x16_lsx)
uint64_t x264_pixel_hadamard_ac_16x16_lsx( uint8_t *p_pix, intptr_t i_stride );
/* LASX variants. */
#define x264_hadamard_ac_8x8_lasx x264_template(hadamard_ac_8x8_lasx)
uint64_t x264_hadamard_ac_8x8_lasx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_hadamard_ac_8x8_lasx x264_template(pixel_hadamard_ac_8x8_lasx)
uint64_t x264_pixel_hadamard_ac_8x8_lasx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_hadamard_ac_8x16_lasx x264_template(pixel_hadamard_ac_8x16_lasx)
uint64_t x264_pixel_hadamard_ac_8x16_lasx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_hadamard_ac_16x8_lasx x264_template(pixel_hadamard_ac_16x8_lasx)
uint64_t x264_pixel_hadamard_ac_16x8_lasx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_hadamard_ac_16x16_lasx x264_template(pixel_hadamard_ac_16x16_lasx)
uint64_t x264_pixel_hadamard_ac_16x16_lasx( uint8_t *p_pix, intptr_t i_stride );
/* intra_satd_x3 prototypes: 16x16, 8x8c and 4x4 for LSX, plus 16x16 for LASX.
 * Each takes an encode pointer, a decode pointer and an
 * int32_t p_sad_array[3] that receives three scores. */
#define x264_intra_satd_x3_16x16_lsx x264_template(intra_satd_x3_16x16_lsx)
void x264_intra_satd_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
#define x264_intra_satd_x3_8x8c_lsx x264_template(intra_satd_x3_8x8c_lsx)
void x264_intra_satd_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
#define x264_intra_satd_x3_4x4_lsx x264_template(intra_satd_x3_4x4_lsx)
void x264_intra_satd_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
#define x264_intra_satd_x3_16x16_lasx x264_template(intra_satd_x3_16x16_lasx)
void x264_intra_satd_x3_16x16_lasx( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
/* pixel_ssd_WxH prototypes.
 * LSX: 16x16, 16x8, 8x16, 8x8, 8x4, 4x16, 4x8, 4x4.
 * LASX: 16x16, 16x8, 8x16, 8x8.
 * Same parameter shape as pixel_sad_WxH: source and reference pointers with
 * their strides, int32_t result. */
#define x264_pixel_ssd_16x16_lsx x264_template(pixel_ssd_16x16_lsx)
int32_t x264_pixel_ssd_16x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_16x8_lsx x264_template(pixel_ssd_16x8_lsx)
int32_t x264_pixel_ssd_16x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_8x16_lsx x264_template(pixel_ssd_8x16_lsx)
int32_t x264_pixel_ssd_8x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_8x8_lsx x264_template(pixel_ssd_8x8_lsx)
int32_t x264_pixel_ssd_8x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_8x4_lsx x264_template(pixel_ssd_8x4_lsx)
int32_t x264_pixel_ssd_8x4_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_4x16_lsx x264_template(pixel_ssd_4x16_lsx)
int32_t x264_pixel_ssd_4x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_4x8_lsx x264_template(pixel_ssd_4x8_lsx)
int32_t x264_pixel_ssd_4x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_4x4_lsx x264_template(pixel_ssd_4x4_lsx)
int32_t x264_pixel_ssd_4x4_lsx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
/* LASX variants. */
#define x264_pixel_ssd_16x16_lasx x264_template(pixel_ssd_16x16_lasx)
int32_t x264_pixel_ssd_16x16_lasx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_16x8_lasx x264_template(pixel_ssd_16x8_lasx)
int32_t x264_pixel_ssd_16x8_lasx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_8x16_lasx x264_template(pixel_ssd_8x16_lasx)
int32_t x264_pixel_ssd_8x16_lasx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_8x8_lasx x264_template(pixel_ssd_8x8_lasx)
int32_t x264_pixel_ssd_8x8_lasx( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
/* pixel_var / pixel_var2 prototypes.
 * pixel_var2_{8x16,8x8} (LSX and LASX) take two pixel pointers with no stride
 * argument plus an int32_t ssd[2] parameter, and return int32_t.
 * pixel_var_{16x16,8x16,8x8} (LSX only) take one pixel pointer with its
 * stride and return uint64_t.
 * NOTE(review): ssd[2] is presumably an output and the strides of the var2
 * inputs are presumably fixed by the implementation -- confirm. */
#define x264_pixel_var2_8x16_lsx x264_template(pixel_var2_8x16_lsx)
int32_t x264_pixel_var2_8x16_lsx( uint8_t *p_pix1, uint8_t *p_pix2,
int32_t ssd[2] );
#define x264_pixel_var2_8x8_lsx x264_template(pixel_var2_8x8_lsx)
int32_t x264_pixel_var2_8x8_lsx( uint8_t *p_pix1, uint8_t *p_pix2,
int32_t ssd[2] );
#define x264_pixel_var_16x16_lsx x264_template(pixel_var_16x16_lsx)
uint64_t x264_pixel_var_16x16_lsx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_var_8x16_lsx x264_template(pixel_var_8x16_lsx)
uint64_t x264_pixel_var_8x16_lsx( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_var_8x8_lsx x264_template(pixel_var_8x8_lsx)
uint64_t x264_pixel_var_8x8_lsx( uint8_t *p_pix, intptr_t i_stride );
/* LASX variants (var2 only). */
#define x264_pixel_var2_8x16_lasx x264_template(pixel_var2_8x16_lasx)
int32_t x264_pixel_var2_8x16_lasx( uint8_t *p_pix1, uint8_t *p_pix2,
int32_t ssd[2] );
#define x264_pixel_var2_8x8_lasx x264_template(pixel_var2_8x8_lasx)
int32_t x264_pixel_var2_8x8_lasx( uint8_t *p_pix1, uint8_t *p_pix2,
int32_t ssd[2] );
/* sa8d prototypes.
 * pixel_sa8d_{8x8,16x16} exist for both LSX and LASX and share the
 * two-pointer / two-stride / int32_t-result shape of pixel_satd_WxH.
 * intra_sa8d_x3_8x8 (LSX and LASX) takes the encode pointer, a 36-entry
 * edge array and an int32_t p_sad_array[3]. */
#define x264_pixel_sa8d_8x8_lsx x264_template(pixel_sa8d_8x8_lsx)
int32_t x264_pixel_sa8d_8x8_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_sa8d_16x16_lsx x264_template(pixel_sa8d_16x16_lsx)
int32_t x264_pixel_sa8d_16x16_lsx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_intra_sa8d_x3_8x8_lsx x264_template(intra_sa8d_x3_8x8_lsx)
void x264_intra_sa8d_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36],
int32_t p_sad_array[3] );
/* LASX variants. */
#define x264_intra_sa8d_x3_8x8_lasx x264_template(intra_sa8d_x3_8x8_lasx)
void x264_intra_sa8d_x3_8x8_lasx( uint8_t *p_enc, uint8_t p_edge[36],
int32_t p_sad_array[3] );
#define x264_pixel_sa8d_8x8_lasx x264_template(pixel_sa8d_8x8_lasx)
int32_t x264_pixel_sa8d_8x8_lasx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_sa8d_16x16_lasx x264_template(pixel_sa8d_16x16_lasx)
int32_t x264_pixel_sa8d_16x16_lasx( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
/* intra_sad_x3 prototypes (LSX only): 16x16, 8x8, 8x8c, 4x4.
 * The 8x8 variant takes a 36-entry edge array as its second argument; the
 * other three take a decode pointer.  All receive an
 * int32_t p_sad_array[3] for the three scores. */
#define x264_intra_sad_x3_16x16_lsx x264_template(intra_sad_x3_16x16_lsx)
void x264_intra_sad_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
#define x264_intra_sad_x3_8x8_lsx x264_template(intra_sad_x3_8x8_lsx)
void x264_intra_sad_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36],
int32_t p_sad_array[3] );
#define x264_intra_sad_x3_8x8c_lsx x264_template(intra_sad_x3_8x8c_lsx)
void x264_intra_sad_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
#define x264_intra_sad_x3_4x4_lsx x264_template(intra_sad_x3_4x4_lsx)
void x264_intra_sad_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
#endif

1383
common/loongarch/predict-a.S Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,106 @@
/*****************************************************************************
* predict-c.c: loongarch intra prediction
*****************************************************************************
* Copyright (C) 2023-2025 x264 project
*
* Authors: Xiwei Gu <guxiwei-hf@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "predict.h"
/* Install the LoongArch 16x16 intra predictors into pf[].
 *
 * Does nothing when built with HIGH_BIT_DEPTH.  Otherwise:
 *   - X264_CPU_LSX set: the V, H, DC, DC_LEFT, DC_TOP and DC_128 slots get
 *     their LSX routines.
 *   - the P slot gets the LASX routine when X264_CPU_LASX is set, else the
 *     LSX routine when X264_CPU_LSX is set.
 * Slots whose flag is not set are left untouched.  The LASX test does not
 * depend on the LSX one. */
void x264_predict_16x16_init_loongarch( int cpu, x264_predict_t pf[7] )
{
#if !HIGH_BIT_DEPTH
    const int b_lsx  = !!( cpu & X264_CPU_LSX );
    const int b_lasx = !!( cpu & X264_CPU_LASX );

    if( b_lsx )
    {
        pf[I_PRED_16x16_V ]      = x264_predict_16x16_v_lsx;
        pf[I_PRED_16x16_H ]      = x264_predict_16x16_h_lsx;
        pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_lsx;
        pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_lsx;
        pf[I_PRED_16x16_DC_TOP ] = x264_predict_16x16_dc_top_lsx;
        pf[I_PRED_16x16_DC_128 ] = x264_predict_16x16_dc_128_lsx;
    }

    /* P slot: the wider LASX routine takes precedence over the LSX one. */
    if( b_lasx )
        pf[I_PRED_16x16_P ] = x264_predict_16x16_p_lasx;
    else if( b_lsx )
        pf[I_PRED_16x16_P ] = x264_predict_16x16_p_lsx;
#endif
}
/* Install the LoongArch 8x8 chroma intra predictors into pf[].
 *
 * Does nothing when built with HIGH_BIT_DEPTH or when X264_CPU_LSX is not
 * set in cpu.  Otherwise the seven I_PRED_CHROMA_* slots named below are
 * pointed at their LSX routines.  There is no LASX override for chroma. */
void x264_predict_8x8c_init_loongarch( int cpu, x264_predict_t pf[7] )
{
#if !HIGH_BIT_DEPTH
    if( !( cpu & X264_CPU_LSX ) )
        return;

    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_lsx;
    pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_lsx;
    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_lsx;
    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_lsx;
    pf[I_PRED_CHROMA_DC_128]  = x264_predict_8x8c_dc_128_lsx;
    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_lsx;
    pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_lsx;
#endif
}
/* Install the LoongArch 8x8 luma intra predictors into pf[].
 *
 * Does nothing when built with HIGH_BIT_DEPTH.  Otherwise:
 *   - X264_CPU_LSX set: the V, DC, DC_LEFT, DC_TOP and DC_128 slots get their
 *     LSX routines (no LASX versions of these are installed).
 *   - the H, DDL, DDR, VR and VL slots get the LASX routines when
 *     X264_CPU_LASX is set, else the LSX routines when X264_CPU_LSX is set.
 * predict_filter is accepted but never written by this function. */
void x264_predict_8x8_init_loongarch( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
{
#if !HIGH_BIT_DEPTH
    const int b_lsx  = !!( cpu & X264_CPU_LSX );
    const int b_lasx = !!( cpu & X264_CPU_LASX );

    /* Modes that only exist as LSX code. */
    if( b_lsx )
    {
        pf[I_PRED_8x8_V]       = x264_predict_8x8_v_lsx;
        pf[I_PRED_8x8_DC]      = x264_predict_8x8_dc_lsx;
        pf[I_PRED_8x8_DC_LEFT] = x264_predict_8x8_dc_left_lsx;
        pf[I_PRED_8x8_DC_TOP]  = x264_predict_8x8_dc_top_lsx;
        pf[I_PRED_8x8_DC_128]  = x264_predict_8x8_dc_128_lsx;
    }

    /* Modes available in both flavours: LASX takes precedence. */
    if( b_lasx )
    {
        pf[I_PRED_8x8_H]   = x264_predict_8x8_h_lasx;
        pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_lasx;
        pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_lasx;
        pf[I_PRED_8x8_VR]  = x264_predict_8x8_vr_lasx;
        pf[I_PRED_8x8_VL]  = x264_predict_8x8_vl_lasx;
    }
    else if( b_lsx )
    {
        pf[I_PRED_8x8_H]   = x264_predict_8x8_h_lsx;
        pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_lsx;
        pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_lsx;
        pf[I_PRED_8x8_VR]  = x264_predict_8x8_vr_lsx;
        pf[I_PRED_8x8_VL]  = x264_predict_8x8_vl_lsx;
    }
#endif
}
/* Install the LoongArch 4x4 intra predictors into pf[].
 *
 * Does nothing when built with HIGH_BIT_DEPTH or when X264_CPU_LSX is not
 * set in cpu.  Otherwise the V, H, DC, DDL, DC_LEFT, DC_TOP and DC_128 slots
 * are pointed at their LSX routines; the remaining entries of pf[12] are not
 * touched here. */
void x264_predict_4x4_init_loongarch( int cpu, x264_predict_t pf[12] )
{
#if !HIGH_BIT_DEPTH
    if( !( cpu & X264_CPU_LSX ) )
        return;

    pf[I_PRED_4x4_V]       = x264_predict_4x4_v_lsx;
    pf[I_PRED_4x4_H]       = x264_predict_4x4_h_lsx;
    pf[I_PRED_4x4_DC]      = x264_predict_4x4_dc_lsx;
    pf[I_PRED_4x4_DDL]     = x264_predict_4x4_ddl_lsx;
    pf[I_PRED_4x4_DC_LEFT] = x264_predict_4x4_dc_left_lsx;
    pf[I_PRED_4x4_DC_TOP]  = x264_predict_4x4_dc_top_lsx;
    pf[I_PRED_4x4_DC_128]  = x264_predict_4x4_dc_128_lsx;
#endif
}

150
common/loongarch/predict.h Normal file
View File

@@ -0,0 +1,150 @@
/*****************************************************************************
* predict.h: loongarch intra prediction
*****************************************************************************
* Copyright (C) 2023-2025 x264 project
*
* Authors: Xiwei Gu <guxiwei-hf@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_LOONGARCH_PREDICT_H
#define X264_LOONGARCH_PREDICT_H
/* 8x8 chroma predictors (LSX only): P, V, H, DC, DC_128, DC_TOP, DC_LEFT.
 * Each takes a single pointer and returns nothing.
 * NOTE(review): the P/V/H prototypes spell the argument as uint8_t * while
 * the DC ones use pixel *; these are presumably the same type in the builds
 * where this header is used -- confirm. */
#define x264_predict_8x8c_p_lsx x264_template(predict_8x8c_p_lsx)
void x264_predict_8x8c_p_lsx(uint8_t *p_src);
#define x264_predict_8x8c_v_lsx x264_template(predict_8x8c_v_lsx)
void x264_predict_8x8c_v_lsx(uint8_t *p_src);
#define x264_predict_8x8c_h_lsx x264_template(predict_8x8c_h_lsx)
void x264_predict_8x8c_h_lsx(uint8_t *p_src);
#define x264_predict_8x8c_dc_lsx x264_template(predict_8x8c_dc_lsx)
void x264_predict_8x8c_dc_lsx(pixel *src);
#define x264_predict_8x8c_dc_128_lsx x264_template(predict_8x8c_dc_128_lsx)
void x264_predict_8x8c_dc_128_lsx(pixel *src);
#define x264_predict_8x8c_dc_top_lsx x264_template(predict_8x8c_dc_top_lsx)
void x264_predict_8x8c_dc_top_lsx(pixel *src);
#define x264_predict_8x8c_dc_left_lsx x264_template(predict_8x8c_dc_left_lsx)
void x264_predict_8x8c_dc_left_lsx(pixel *src);
/* 16x16 predictors: DC, DC_LEFT, DC_TOP, DC_128, H, V and P for LSX, plus a
 * LASX version of P only.  Each takes a single pixel pointer. */
#define x264_predict_16x16_dc_lsx x264_template(predict_16x16_dc_lsx)
void x264_predict_16x16_dc_lsx( pixel *src );
#define x264_predict_16x16_dc_left_lsx x264_template(predict_16x16_dc_left_lsx)
void x264_predict_16x16_dc_left_lsx( pixel *src );
#define x264_predict_16x16_dc_top_lsx x264_template(predict_16x16_dc_top_lsx)
void x264_predict_16x16_dc_top_lsx( pixel *src );
#define x264_predict_16x16_dc_128_lsx x264_template(predict_16x16_dc_128_lsx)
void x264_predict_16x16_dc_128_lsx( pixel *src );
#define x264_predict_16x16_h_lsx x264_template(predict_16x16_h_lsx)
void x264_predict_16x16_h_lsx( pixel *src );
#define x264_predict_16x16_v_lsx x264_template(predict_16x16_v_lsx)
void x264_predict_16x16_v_lsx( pixel *src );
#define x264_predict_16x16_p_lasx x264_template(predict_16x16_p_lasx)
void x264_predict_16x16_p_lasx( pixel *src );
#define x264_predict_16x16_p_lsx x264_template(predict_16x16_p_lsx)
void x264_predict_16x16_p_lsx( pixel *src );
/* 8x8 luma predictors.  Unlike the other block sizes these take a second
 * argument, a 36-entry edge array.
 * LSX only: V, DC, DC_LEFT, DC_TOP, DC_128.
 * LSX and LASX: H, DDL, DDR, VR, VL. */
#define x264_predict_8x8_v_lsx x264_template(predict_8x8_v_lsx)
void x264_predict_8x8_v_lsx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_h_lasx x264_template(predict_8x8_h_lasx)
void x264_predict_8x8_h_lasx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_h_lsx x264_template(predict_8x8_h_lsx)
void x264_predict_8x8_h_lsx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_dc_lsx x264_template(predict_8x8_dc_lsx)
void x264_predict_8x8_dc_lsx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_dc_left_lsx x264_template(predict_8x8_dc_left_lsx)
void x264_predict_8x8_dc_left_lsx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_dc_top_lsx x264_template(predict_8x8_dc_top_lsx)
void x264_predict_8x8_dc_top_lsx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_dc_128_lsx x264_template(predict_8x8_dc_128_lsx)
void x264_predict_8x8_dc_128_lsx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_ddl_lasx x264_template(predict_8x8_ddl_lasx)
void x264_predict_8x8_ddl_lasx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_ddl_lsx x264_template(predict_8x8_ddl_lsx)
void x264_predict_8x8_ddl_lsx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_ddr_lasx x264_template(predict_8x8_ddr_lasx)
void x264_predict_8x8_ddr_lasx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_ddr_lsx x264_template(predict_8x8_ddr_lsx)
void x264_predict_8x8_ddr_lsx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_vr_lasx x264_template(predict_8x8_vr_lasx)
void x264_predict_8x8_vr_lasx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_vr_lsx x264_template(predict_8x8_vr_lsx)
void x264_predict_8x8_vr_lsx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_vl_lasx x264_template(predict_8x8_vl_lasx)
void x264_predict_8x8_vl_lasx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_vl_lsx x264_template(predict_8x8_vl_lsx)
void x264_predict_8x8_vl_lsx( pixel *src, pixel edge[36] );
/* 4x4 predictors (LSX only): V, H, DC, DDL, DC_TOP, DC_LEFT, DC_128.
 * Each takes a single pixel pointer. */
#define x264_predict_4x4_v_lsx x264_template(predict_4x4_v_lsx)
void x264_predict_4x4_v_lsx( pixel *p_src );
#define x264_predict_4x4_h_lsx x264_template(predict_4x4_h_lsx)
void x264_predict_4x4_h_lsx( pixel *p_src );
#define x264_predict_4x4_dc_lsx x264_template(predict_4x4_dc_lsx)
void x264_predict_4x4_dc_lsx( pixel *p_src );
#define x264_predict_4x4_ddl_lsx x264_template(predict_4x4_ddl_lsx)
void x264_predict_4x4_ddl_lsx( pixel *p_src );
#define x264_predict_4x4_dc_top_lsx x264_template(predict_4x4_dc_top_lsx)
void x264_predict_4x4_dc_top_lsx( pixel *p_src );
#define x264_predict_4x4_dc_left_lsx x264_template(predict_4x4_dc_left_lsx)
void x264_predict_4x4_dc_left_lsx( pixel *p_src );
#define x264_predict_4x4_dc_128_lsx x264_template(predict_4x4_dc_128_lsx)
void x264_predict_4x4_dc_128_lsx( pixel *p_src );
/* Table initialisers, one per block size.  Each receives the CPU flag word
 * and the predictor table to fill; the 8x8 variant additionally receives a
 * pointer to an x264_predict_8x8_filter_t. */
#define x264_predict_4x4_init_loongarch x264_template(predict_4x4_init_loongarch)
void x264_predict_4x4_init_loongarch( int cpu, x264_predict_t pf[12] );
#define x264_predict_8x8_init_loongarch x264_template(predict_8x8_init_loongarch)
void x264_predict_8x8_init_loongarch( int cpu, x264_predict8x8_t pf[12],
x264_predict_8x8_filter_t *predict_filter );
#define x264_predict_8x8c_init_loongarch x264_template(predict_8x8c_init_loongarch)
void x264_predict_8x8c_init_loongarch( int cpu, x264_predict_t pf[7] );
#define x264_predict_16x16_init_loongarch x264_template(predict_16x16_init_loongarch)
void x264_predict_16x16_init_loongarch( int cpu, x264_predict_t pf[7] );
#endif

1231
common/loongarch/quant-a.S Normal file

File diff suppressed because it is too large Load Diff

96
common/loongarch/quant.h Normal file
View File

@@ -0,0 +1,96 @@
/*****************************************************************************
* quant.h: loongarch quantization and level-run
*****************************************************************************
* Copyright (C) 2023-2025 x264 project
*
* Authors: Shiyou Yin <yinshiyou-hf@loongson.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_LOONGARCH_QUANT_H
#define X264_LOONGARCH_QUANT_H
/* coeff_last{64,16,15,8,4} for LSX.  Each takes a pointer to int16_t
 * coefficients and returns int32_t. */
#define x264_coeff_last64_lsx x264_template(coeff_last64_lsx)
int32_t x264_coeff_last64_lsx( int16_t *p_src );
#define x264_coeff_last16_lsx x264_template(coeff_last16_lsx)
int32_t x264_coeff_last16_lsx( int16_t *p_src );
#define x264_coeff_last15_lsx x264_template(coeff_last15_lsx)
int32_t x264_coeff_last15_lsx( int16_t *p_src );
#define x264_coeff_last8_lsx x264_template(coeff_last8_lsx)
int32_t x264_coeff_last8_lsx( int16_t *p_src );
#define x264_coeff_last4_lsx x264_template(coeff_last4_lsx)
int32_t x264_coeff_last4_lsx( int16_t *p_src );
/* Quantisation entry points for LSX: 4x4, 4x4x4, 8x8, 4x4_dc, 2x2_dc.
 * The block variants take per-coefficient uint16_t mf/bias tables; the two
 * DC variants take scalar int32_t mf and bias.  All return int32_t.
 * NOTE(review): the block variants are declared with int16_t while the DC
 * variants use dctcoef; these are presumably the same type in the builds
 * where this header is used -- confirm. */
#define x264_quant_4x4_lsx x264_template(quant_4x4_lsx)
int32_t x264_quant_4x4_lsx( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias );
#define x264_quant_4x4x4_lsx x264_template(quant_4x4x4_lsx)
int32_t x264_quant_4x4x4_lsx( int16_t p_dct[4][16],
uint16_t pu_mf[16], uint16_t pu_bias[16] );
#define x264_quant_8x8_lsx x264_template(quant_8x8_lsx)
int32_t x264_quant_8x8_lsx( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias );
#define x264_quant_4x4_dc_lsx x264_template(quant_4x4_dc_lsx)
int32_t x264_quant_4x4_dc_lsx( dctcoef dct[16], int32_t mf, int32_t bias );
#define x264_quant_2x2_dc_lsx x264_template(quant_2x2_dc_lsx)
int32_t x264_quant_2x2_dc_lsx( dctcoef dct[4], int32_t mf, int32_t bias );
/* Dequantisation entry points for LSX: 4x4, 8x8, 4x4_dc.  Each takes the
 * coefficient array, a six-row dequant_mf table and i_qp, and returns
 * nothing. */
#define x264_dequant_4x4_lsx x264_template(dequant_4x4_lsx)
void x264_dequant_4x4_lsx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_8x8_lsx x264_template(dequant_8x8_lsx)
void x264_dequant_8x8_lsx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
#define x264_dequant_4x4_dc_lsx x264_template(dequant_4x4_dc_lsx)
void x264_dequant_4x4_dc_lsx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
#define x264_decimate_score15_lsx x264_template(decimate_score15_lsx)
int x264_decimate_score15_lsx( dctcoef *dct );
#define x264_decimate_score16_lsx x264_template(decimate_score16_lsx)
int x264_decimate_score16_lsx( dctcoef *dct );
#define x264_decimate_score64_lsx x264_template(decimate_score64_lsx)
int x264_decimate_score64_lsx( dctcoef *dct );
#define x264_coeff_last64_lasx x264_template(coeff_last64_lasx)
int32_t x264_coeff_last64_lasx( int16_t *p_src );
#define x264_coeff_last16_lasx x264_template(coeff_last16_lasx)
int32_t x264_coeff_last16_lasx( int16_t *p_src );
#define x264_coeff_last15_lasx x264_template(coeff_last15_lasx)
int32_t x264_coeff_last15_lasx( int16_t *p_src );
#define x264_quant_4x4x4_lasx x264_template(quant_4x4x4_lasx)
int32_t x264_quant_4x4x4_lasx( int16_t p_dct[4][16],
uint16_t pu_mf[16], uint16_t pu_bias[16] );
#define x264_dequant_4x4_lasx x264_template(dequant_4x4_lasx)
void x264_dequant_4x4_lasx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_8x8_lasx x264_template(dequant_8x8_lasx)
void x264_dequant_8x8_lasx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
#define x264_dequant_4x4_dc_lasx x264_template(dequant_4x4_dc_lasx)
void x264_dequant_4x4_dc_lasx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
#define x264_coeff_level_run16_lasx x264_template(coeff_level_run16_lasx)
int x264_coeff_level_run16_lasx( dctcoef *, x264_run_level_t * );
#define x264_coeff_level_run15_lasx x264_template(coeff_level_run15_lasx)
int x264_coeff_level_run15_lasx( dctcoef *, x264_run_level_t * );
#define x264_coeff_level_run16_lsx x264_template(coeff_level_run16_lsx)
int x264_coeff_level_run16_lsx( dctcoef *, x264_run_level_t * );
#define x264_coeff_level_run15_lsx x264_template(coeff_level_run15_lsx)
int x264_coeff_level_run15_lsx( dctcoef *, x264_run_level_t * );
#define x264_coeff_level_run8_lsx x264_template(coeff_level_run8_lsx)
int x264_coeff_level_run8_lsx( dctcoef *, x264_run_level_t * );
#endif/* X264_LOONGARCH_QUANT_H */

2585
common/loongarch/sad-a.S Normal file

File diff suppressed because it is too large Load Diff

1926
common/macroblock.c Normal file

File diff suppressed because it is too large Load Diff

463
common/macroblock.h Normal file
View File

@@ -0,0 +1,463 @@
/*****************************************************************************
* macroblock.h: macroblock common functions
*****************************************************************************
* Copyright (C) 2005-2025 x264 project
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
* Laurent Aimar <fenrir@via.ecp.fr>
* Fiona Glaser <fiona@x264.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_MACROBLOCK_H
#define X264_MACROBLOCK_H
/* Bit flags naming neighbour positions (left, top, top-right, top-left) plus a
 * separate private flag. Each flag occupies its own bit so they can be OR-ed. */
enum macroblock_position_e
{
    MB_LEFT       = 1 << 0,
    MB_TOP        = 1 << 1,
    MB_TOPRIGHT   = 1 << 2,
    MB_TOPLEFT    = 1 << 3,
    MB_PRIVATE    = 1 << 4,
    /* union of the four neighbour flags; MB_PRIVATE is not part of it */
    ALL_NEIGHBORS = MB_LEFT | MB_TOP | MB_TOPRIGHT | MB_TOPLEFT,
};
/* Neighbour flags (macroblock_position_e) listed per 4x4 intra prediction
 * mode; the mode each row belongs to is named in the trailing comment. */
static const uint8_t x264_pred_i4x4_neighbors[12] =
{
    MB_TOP,                         /* I_PRED_4x4_V       */
    MB_LEFT,                        /* I_PRED_4x4_H       */
    MB_LEFT | MB_TOP,               /* I_PRED_4x4_DC      */
    MB_TOP  | MB_TOPRIGHT,          /* I_PRED_4x4_DDL     */
    MB_LEFT | MB_TOPLEFT | MB_TOP,  /* I_PRED_4x4_DDR     */
    MB_LEFT | MB_TOPLEFT | MB_TOP,  /* I_PRED_4x4_VR      */
    MB_LEFT | MB_TOPLEFT | MB_TOP,  /* I_PRED_4x4_HD      */
    MB_TOP  | MB_TOPRIGHT,          /* I_PRED_4x4_VL      */
    MB_LEFT,                        /* I_PRED_4x4_HU      */
    MB_LEFT,                        /* I_PRED_4x4_DC_LEFT */
    MB_TOP,                         /* I_PRED_4x4_DC_TOP  */
    0                               /* I_PRED_4x4_DC_128  */
};
/* XXX mb_type isn't the one written in the bitstream -> only internal usage */
/* Macroblock-type predicates: each compares its argument against mb_class_e
 * enumerators. IS_INTRA and IS_SKIP expand the argument more than once, so
 * do not pass an expression with side effects. */
#define IS_INTRA(type) ( (type) == I_4x4 || (type) == I_8x8 || (type) == I_16x16 || (type) == I_PCM )
#define IS_SKIP(type) ( (type) == P_SKIP || (type) == B_SKIP )
#define IS_DIRECT(type) ( (type) == B_DIRECT )
/* Internal macroblock type codes. The values are consecutive from 0 and are
 * used as indices into the X264_MBTYPE_MAX-sized tables in this header. */
enum mb_class_e
{
    /* intra */
    I_4x4           = 0,
    I_8x8           = 1,
    I_16x16         = 2,
    I_PCM           = 3,

    /* P */
    P_L0            = 4,
    P_8x8           = 5,
    P_SKIP          = 6,

    /* B */
    B_DIRECT        = 7,
    B_L0_L0         = 8,
    B_L0_L1         = 9,
    B_L0_BI         = 10,
    B_L1_L0         = 11,
    B_L1_L1         = 12,
    B_L1_BI         = 13,
    B_BI_L0         = 14,
    B_BI_L1         = 15,
    B_BI_BI         = 16,
    B_8x8           = 17,
    B_SKIP          = 18,

    /* number of types, not a type itself */
    X264_MBTYPE_MAX = 19
};
/* Type remapping table indexed by mb_class_e. Every entry maps to itself
 * except the I_8x8 slot, which holds I_4x4. */
static const uint8_t x264_mb_type_fix[X264_MBTYPE_MAX] =
{
    /* intra: note the second slot (I_8x8) is I_4x4 */
    I_4x4, I_4x4, I_16x16, I_PCM,
    /* P */
    P_L0, P_8x8, P_SKIP,
    /* B */
    B_DIRECT,
    B_L0_L0, B_L0_L1, B_L0_BI,
    B_L1_L0, B_L1_L1, B_L1_BI,
    B_BI_L0, B_BI_L1, B_BI_BI,
    B_8x8, B_SKIP
};
/* Per-macroblock-type 2x2 flag table, indexed first by mb_class_e.
 * NOTE(review): judging by the B_<a>_<b> rows, the middle index appears to
 * select the reference list and the last index the partition - confirm at the
 * use sites. */
static const uint8_t x264_mb_type_list_table[X264_MBTYPE_MAX][2][2] =
{
    /* INTRA (I_4x4, I_8x8, I_16x16, I_PCM) */
    {{0,0},{0,0}},
    {{0,0},{0,0}},
    {{0,0},{0,0}},
    {{0,0},{0,0}},
    {{1,1},{0,0}},                                  /* P_L0 */
    {{0,0},{0,0}},                                  /* P_8x8 */
    {{1,1},{0,0}},                                  /* P_SKIP */
    {{0,0},{0,0}},                                  /* B_DIRECT */
    {{1,1},{0,0}}, {{1,0},{0,1}}, {{1,1},{0,1}},    /* B_L0_* */
    {{0,1},{1,0}}, {{0,0},{1,1}}, {{0,1},{1,1}},    /* B_L1_* */
    {{1,1},{1,0}}, {{1,0},{1,1}}, {{1,1},{1,1}},    /* B_BI_* */
    {{0,0},{0,0}},                                  /* B_8x8 */
    {{0,0},{0,0}}                                   /* B_SKIP */
};
/* Sub-partition shape predicates: true when `type` is one of the
 * mb_partition_e sub-partition codes of the named shape (L0, L1 or BI;
 * IS_SUB8x8 also accepts D_DIRECT_8x8).
 * The argument is parenthesized at every use so that compound expressions
 * (e.g. a conditional expression) compare as a whole, matching IS_INTRA and
 * friends. It is still expanded several times, so avoid side effects. */
#define IS_SUB4x4(type) ( ((type) == D_L0_4x4)||((type) == D_L1_4x4)||((type) == D_BI_4x4) )
#define IS_SUB4x8(type) ( ((type) == D_L0_4x8)||((type) == D_L1_4x8)||((type) == D_BI_4x8) )
#define IS_SUB8x4(type) ( ((type) == D_L0_8x4)||((type) == D_L1_8x4)||((type) == D_BI_8x4) )
#define IS_SUB8x8(type) ( ((type) == D_L0_8x8)||((type) == D_L1_8x8)||((type) == D_BI_8x8)||((type) == D_DIRECT_8x8) )
/* Partition and sub-partition codes. Values are consecutive from 0 and index
 * the 17-entry partition tables in this header. */
enum mb_partition_e
{
    /* sub partition type for P_8x8 and B_8x8 */
    D_L0_4x4          = 0,
    D_L0_8x4          = 1,
    D_L0_4x8          = 2,
    D_L0_8x8          = 3,

    /* sub partition type for B_8x8 only */
    D_L1_4x4          = 4,
    D_L1_8x4          = 5,
    D_L1_4x8          = 6,
    D_L1_8x8          = 7,

    D_BI_4x4          = 8,
    D_BI_8x4          = 9,
    D_BI_4x8          = 10,
    D_BI_8x8          = 11,
    D_DIRECT_8x8      = 12,

    /* partition */
    D_8x8             = 13,
    D_16x8            = 14,
    D_8x16            = 15,
    D_16x16           = 16,

    /* number of codes, not a code itself */
    X264_PARTTYPE_MAX = 17,
};
/* Two rows of per-partition flags, indexed [row][mb_partition_e].
 * Row 0 is set for the D_L0_* and D_BI_* codes, row 1 for D_L1_* and D_BI_*;
 * D_DIRECT_8x8 and the whole-macroblock partitions are 0 in both rows. */
static const uint8_t x264_mb_partition_listX_table[2][17] =
{
    {
        1, 1, 1, 1,     /* D_L0_* */
        0, 0, 0, 0,     /* D_L1_* */
        1, 1, 1, 1,     /* D_BI_* */
        0,              /* D_DIRECT_8x8 */
        0, 0, 0, 0      /* 8x8 .. 16x16 */
    },
    {
        0, 0, 0, 0,     /* D_L0_* */
        1, 1, 1, 1,     /* D_L1_* */
        1, 1, 1, 1,     /* D_BI_* */
        0,              /* D_DIRECT_8x8 */
        0, 0, 0, 0      /* 8x8 .. 16x16 */
    }
};
/* A count per mb_partition_e code: 4/2/2/1 for each 4x4, 8x4, 4x8, 8x8 group
 * and again for 8x8, 16x8, 8x16, 16x16; the direct entry is 1. */
static const uint8_t x264_mb_partition_count_table[17] =
{
    4, 2, 2, 1,     /* sub L0 */
    4, 2, 2, 1,     /* sub L1 */
    4, 2, 2, 1,     /* sub BI */
    1,              /* Direct */
    4, 2, 2, 1      /* Partition */
};
/* PIXEL_* size code for each mb_partition_e value. */
static const uint8_t x264_mb_partition_pixel_table[17] =
{
    /* D_L0_* */
    PIXEL_4x4, PIXEL_8x4, PIXEL_4x8, PIXEL_8x8,
    /* D_L1_* */
    PIXEL_4x4, PIXEL_8x4, PIXEL_4x8, PIXEL_8x8,
    /* D_BI_* */
    PIXEL_4x4, PIXEL_8x4, PIXEL_4x8, PIXEL_8x8,
    /* D_DIRECT_8x8 */
    PIXEL_8x8,
    /* 8x8 .. 16x16 */
    PIXEL_8x8, PIXEL_16x8, PIXEL_8x16, PIXEL_16x16,
};
/* zigzags are transposed with respect to the tables in the standard */
/* 4x4 scan order: row 0 is the frame scan, row 1 the field scan. */
static const uint8_t x264_zigzag_scan4[2][16] =
{
    {   /* frame */
         0,  4,  1,  2,  5,  8, 12,  9,
         6,  3,  7, 10, 13, 14, 11, 15
    },
    {   /* field */
         0,  1,  4,  2,  3,  5,  6,  7,
         8,  9, 10, 11, 12, 13, 14, 15
    }
};
/* 8x8 scan order, two variants of 64 indices each.
 * NOTE(review): the two rows presumably are frame and field scans, in the
 * same order as x264_zigzag_scan4 - confirm. */
static const uint8_t x264_zigzag_scan8[2][64] =
{
    {
         0,  8,  1,  2,  9, 16, 24, 17, 10,  3,  4, 11, 18, 25, 32, 40,
        33, 26, 19, 12,  5,  6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35,
        28, 21, 14,  7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30,
        23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63
    },
    {
         0,  1,  2,  8,  9,  3,  4, 10, 16, 11,  5,  6,  7, 12, 17, 24,
        18, 13, 14, 15, 19, 25, 32, 26, 20, 21, 22, 23, 27, 33, 40, 34,
        28, 29, 30, 31, 35, 41, 48, 42, 36, 37, 38, 39, 43, 49, 50, 44,
        45, 46, 47, 51, 56, 57, 52, 53, 54, 55, 58, 59, 60, 61, 62, 63
    }
};
/* x coordinate (0..3) associated with each of the 16 block indices. */
static const uint8_t block_idx_x[16] =
{
    0, 1, 0, 1,
    2, 3, 2, 3,
    0, 1, 0, 1,
    2, 3, 2, 3
};

/* y coordinate (0..3) associated with each of the 16 block indices. */
static const uint8_t block_idx_y[16] =
{
    0, 0, 1, 1,
    0, 0, 1, 1,
    2, 2, 3, 3,
    2, 2, 3, 3
};

/* Inverse of the two tables above: block_idx_xy[x][y] is the block index i
 * for which block_idx_x[i] == x and block_idx_y[i] == y. */
static const uint8_t block_idx_xy[4][4] =
{
    { 0, 2,  8, 10 },
    { 1, 3,  9, 11 },
    { 4, 6, 12, 14 },
    { 5, 7, 13, 15 }
};

/* Block index for each raster position x + 4*y. */
static const uint8_t block_idx_xy_1d[16] =
{
     0,  1,  4,  5,
     2,  3,  6,  7,
     8,  9, 12, 13,
    10, 11, 14, 15
};

/* Block index for each column-major position y + 4*x. */
static const uint8_t block_idx_yx_1d[16] =
{
     0,  4,  1,  5,
     8, 12,  9, 13,
     2,  6,  3,  7,
    10, 14, 11, 15
};
/* Offset of each of the 16 blocks inside a buffer with row pitch FENC_STRIDE:
 * 4 elements per block column plus 4 rows per block row. The (bx,by) order is
 * the same as block_idx_x[] / block_idx_y[]. */
#define BLOCK_OFS_FENC(bx,by) ((bx)*4 + (by)*4*FENC_STRIDE)
static const uint8_t block_idx_xy_fenc[16] =
{
    BLOCK_OFS_FENC(0,0), BLOCK_OFS_FENC(1,0),
    BLOCK_OFS_FENC(0,1), BLOCK_OFS_FENC(1,1),
    BLOCK_OFS_FENC(2,0), BLOCK_OFS_FENC(3,0),
    BLOCK_OFS_FENC(2,1), BLOCK_OFS_FENC(3,1),
    BLOCK_OFS_FENC(0,2), BLOCK_OFS_FENC(1,2),
    BLOCK_OFS_FENC(0,3), BLOCK_OFS_FENC(1,3),
    BLOCK_OFS_FENC(2,2), BLOCK_OFS_FENC(3,2),
    BLOCK_OFS_FENC(2,3), BLOCK_OFS_FENC(3,3)
};
#undef BLOCK_OFS_FENC
/* Offset of each of the 16 blocks inside a buffer with row pitch FDEC_STRIDE:
 * 4 elements per block column plus 4 rows per block row. The (bx,by) order is
 * the same as block_idx_x[] / block_idx_y[]. Stored as uint16_t. */
#define BLOCK_OFS_FDEC(bx,by) ((bx)*4 + (by)*4*FDEC_STRIDE)
static const uint16_t block_idx_xy_fdec[16] =
{
    BLOCK_OFS_FDEC(0,0), BLOCK_OFS_FDEC(1,0),
    BLOCK_OFS_FDEC(0,1), BLOCK_OFS_FDEC(1,1),
    BLOCK_OFS_FDEC(2,0), BLOCK_OFS_FDEC(3,0),
    BLOCK_OFS_FDEC(2,1), BLOCK_OFS_FDEC(3,1),
    BLOCK_OFS_FDEC(0,2), BLOCK_OFS_FDEC(1,2),
    BLOCK_OFS_FDEC(0,3), BLOCK_OFS_FDEC(1,3),
    BLOCK_OFS_FDEC(2,2), BLOCK_OFS_FDEC(3,2),
    BLOCK_OFS_FDEC(2,3), BLOCK_OFS_FDEC(3,3)
};
#undef BLOCK_OFS_FDEC
/* Chroma QP lookup table. QP(n) is n shifted by QP_BD_OFFSET.
 * Layout: 12 leading zero entries, then (depending on BIT_DEPTH) the negative
 * inputs QP(-12)..QP(-1), then the mapping for inputs 0..51 which saturates
 * at QP(39), then 12 trailing QP(39) entries. */
#define QP(qP) ( (qP)+QP_BD_OFFSET )
static const uint8_t i_chroma_qp_table[QP_MAX+1+12*2] =
{
    /* leading padding */
          0,       0,       0,       0,       0,       0,
          0,       0,       0,       0,       0,       0,
#if BIT_DEPTH > 9
    QP(-12), QP(-11), QP(-10),  QP(-9),  QP(-8),  QP(-7),
#endif
#if BIT_DEPTH > 8
     QP(-6),  QP(-5),  QP(-4),  QP(-3),  QP(-2),  QP(-1),
#endif
    /* identity up to 29 */
      QP(0),   QP(1),   QP(2),   QP(3),   QP(4),   QP(5),
      QP(6),   QP(7),   QP(8),   QP(9),  QP(10),  QP(11),
     QP(12),  QP(13),  QP(14),  QP(15),  QP(16),  QP(17),
     QP(18),  QP(19),  QP(20),  QP(21),  QP(22),  QP(23),
     QP(24),  QP(25),  QP(26),  QP(27),  QP(28),  QP(29),
    /* compressed range */
     QP(29),  QP(30),  QP(31),  QP(32),  QP(32),  QP(33),
     QP(34),  QP(34),  QP(35),  QP(35),  QP(36),  QP(36),
     QP(37),  QP(37),  QP(37),  QP(38),  QP(38),  QP(38),
     QP(39),  QP(39),  QP(39),  QP(39),
    /* trailing padding */
     QP(39),  QP(39),  QP(39),  QP(39),  QP(39),  QP(39),
     QP(39),  QP(39),  QP(39),  QP(39),  QP(39),  QP(39),
};
#undef QP
/* Block category codes for luma and chroma (combined, U and V) DC/AC/4x4/8x8
 * blocks. Values are consecutive from 0. */
enum cabac_ctx_block_cat_e
{
    /* luma and combined chroma */
    DCT_LUMA_DC     = 0,
    DCT_LUMA_AC     = 1,
    DCT_LUMA_4x4    = 2,
    DCT_CHROMA_DC   = 3,
    DCT_CHROMA_AC   = 4,
    DCT_LUMA_8x8    = 5,

    /* U plane */
    DCT_CHROMAU_DC  = 6,
    DCT_CHROMAU_AC  = 7,
    DCT_CHROMAU_4x4 = 8,
    DCT_CHROMAU_8x8 = 9,

    /* V plane */
    DCT_CHROMAV_DC  = 10,
    DCT_CHROMAV_AC  = 11,
    DCT_CHROMAV_4x4 = 12,
    DCT_CHROMAV_8x8 = 13,
};
/* For a luma-side category (row index: DCT_LUMA_DC, DCT_LUMA_AC,
 * DCT_LUMA_4x4, DCT_LUMA_8x8) gives the category to use in each of three
 * planes: column 0 is the luma code itself, columns 1 and 2 are the CHROMAU
 * and CHROMAV counterparts. Rows 3 and 4 (the DCT_CHROMA_* indices) are zero. */
static const uint8_t ctx_cat_plane[6][3] =
{
    { DCT_LUMA_DC,  DCT_CHROMAU_DC,  DCT_CHROMAV_DC  },
    { DCT_LUMA_AC,  DCT_CHROMAU_AC,  DCT_CHROMAV_AC  },
    { DCT_LUMA_4x4, DCT_CHROMAU_4x4, DCT_CHROMAV_4x4 },
    { 0 },
    { 0 },
    { DCT_LUMA_8x8, DCT_CHROMAU_8x8, DCT_CHROMAV_8x8 }
};
/* Allocation and initialisation entry points. As elsewhere in this header,
 * every name is #defined through x264_template() right before its prototype so
 * the prototype declares the templated symbol. */
/* Per-frame allocation: is allocated per-thread only in frame-threads mode. */
#define x264_macroblock_cache_allocate x264_template(macroblock_cache_allocate)
int x264_macroblock_cache_allocate( x264_t *h );
#define x264_macroblock_cache_free x264_template(macroblock_cache_free)
void x264_macroblock_cache_free( x264_t *h );
/* Per-thread allocation: is allocated per-thread even in sliced-threads mode. */
#define x264_macroblock_thread_allocate x264_template(macroblock_thread_allocate)
int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead );
#define x264_macroblock_thread_free x264_template(macroblock_thread_free)
void x264_macroblock_thread_free( x264_t *h, int b_lookahead );
#define x264_macroblock_slice_init x264_template(macroblock_slice_init)
void x264_macroblock_slice_init( x264_t *h );
#define x264_macroblock_thread_init x264_template(macroblock_thread_init)
void x264_macroblock_thread_init( x264_t *h );
/* Macroblock cache loaders for the macroblock at (mb_x, mb_y), one variant for
 * progressive and one for interlaced content.
 * Each #define must come immediately before the prototype of the SAME name:
 * a prototype written before its macro exists declares the untemplated symbol,
 * leaving the templated one (which every later use expands to) undeclared. */
#define x264_macroblock_cache_load_progressive x264_template(macroblock_cache_load_progressive)
void x264_macroblock_cache_load_progressive( x264_t *h, int mb_x, int mb_y );
#define x264_macroblock_cache_load_interlaced x264_template(macroblock_cache_load_interlaced)
void x264_macroblock_cache_load_interlaced( x264_t *h, int mb_x, int mb_y );
/* Remaining macroblock-level entry points; each name is templated via
 * x264_template() immediately before its prototype. */
#define x264_macroblock_deblock_strength x264_template(macroblock_deblock_strength)
void x264_macroblock_deblock_strength( x264_t *h );
#define x264_macroblock_cache_save x264_template(macroblock_cache_save)
void x264_macroblock_cache_save( x264_t *h );
#define x264_macroblock_bipred_init x264_template(macroblock_bipred_init)
void x264_macroblock_bipred_init( x264_t *h );
#define x264_prefetch_fenc x264_template(prefetch_fenc)
void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y );
#define x264_copy_column8 x264_template(copy_column8)
void x264_copy_column8( pixel *dst, pixel *src );
/* x264_mb_predict_mv_16x16:
* set mvp with predicted mv for D_16x16 block
* h->mb. need only valid values from other blocks */
#define x264_mb_predict_mv_16x16 x264_template(mb_predict_mv_16x16)
void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2] );
/* x264_mb_predict_mv_pskip:
* set mvp with predicted mv for P_SKIP
* h->mb. need only valid values from other blocks */
#define x264_mb_predict_mv_pskip x264_template(mb_predict_mv_pskip)
void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] );
/* x264_mb_predict_mv:
* set mvp with predicted mv for all blocks except SKIP and DIRECT
* h->mb. need valid ref/partition/sub of current block to be valid
* and valid mv/ref from other blocks. */
#define x264_mb_predict_mv x264_template(mb_predict_mv)
void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] );
/* x264_mb_predict_mv_direct16x16:
* set h->mb.cache.mv and h->mb.cache.ref for B_SKIP or B_DIRECT
* h->mb. need only valid values from other blocks.
* return 1 on success, 0 on failure.
* if b_changed != NULL, set it to whether refs or mvs differ from
* before this function call. */
#define x264_mb_predict_mv_direct16x16 x264_template(mb_predict_mv_direct16x16)
int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed );
/* x264_mb_predict_mv_ref16x16:
* set mvc with D_16x16 prediction.
* uses all neighbors, even those that didn't end up using this ref.
* h->mb. need only valid values from other blocks */
#define x264_mb_predict_mv_ref16x16 x264_template(mb_predict_mv_ref16x16)
void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t (*mvc)[2], int *i_mvc );
/* x264_mb_mc / x264_mb_mc_8x8: the second form takes an extra index i8.
 * NOTE(review): presumably motion compensation for the whole macroblock and
 * for one 8x8 block respectively - confirm in macroblock.c. */
#define x264_mb_mc x264_template(mb_mc)
void x264_mb_mc( x264_t *h );
#define x264_mb_mc_8x8 x264_template(mb_mc_8x8)
void x264_mb_mc_8x8( x264_t *h, int i8 );
/* Pack two 16-bit quantities into one 32-bit word.
 * `a` goes into the low half on little-endian builds and into the high half
 * when WORDS_BIGENDIAN is set. The inputs are not masked, so bits above 16 in
 * the low operand carry into the upper half. */
static ALWAYS_INLINE uint32_t pack16to32( uint32_t a, uint32_t b )
{
#if WORDS_BIGENDIAN
    const uint32_t upper = a, lower = b;
#else
    const uint32_t upper = b, lower = a;
#endif
    return lower + (upper<<16);
}
/* Pack two 8-bit quantities into the low 16 bits of the result.
 * `a` is the low byte on little-endian builds and the high byte when
 * WORDS_BIGENDIAN is set. Inputs are not masked. */
static ALWAYS_INLINE uint32_t pack8to16( uint32_t a, uint32_t b )
{
#if WORDS_BIGENDIAN
    const uint32_t upper = a, lower = b;
#else
    const uint32_t upper = b, lower = a;
#endif
    return lower + (upper<<8);
}
/* Pack four 8-bit quantities into one 32-bit word.
 * On little-endian builds a,b,c,d occupy bytes 0..3 (least significant first);
 * with WORDS_BIGENDIAN the order is reversed. Inputs are not masked. */
static ALWAYS_INLINE uint32_t pack8to32( uint32_t a, uint32_t b, uint32_t c, uint32_t d )
{
#if WORDS_BIGENDIAN
    const uint32_t byte0 = d, byte1 = c, byte2 = b, byte3 = a;
#else
    const uint32_t byte0 = a, byte1 = b, byte2 = c, byte3 = d;
#endif
    return byte0 + (byte1<<8) + (byte2<<16) + (byte3<<24);
}
/* Like pack16to32 but for signed inputs: the operand placed in the low half is
 * masked to 16 bits so a negative value cannot spill into the upper half; the
 * other operand is converted to uint32_t before being shifted up. */
static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
{
#if WORDS_BIGENDIAN
    const int upper = a, lower = b;
#else
    const int upper = b, lower = a;
#endif
    return (lower&0xFFFF) + ((uint32_t)upper<<16);
}
/* Pack two 32-bit values into one 64-bit word.
 * `a` is the low word on little-endian builds and the high word when
 * WORDS_BIGENDIAN is set. */
static ALWAYS_INLINE uint64_t pack32to64( uint32_t a, uint32_t b )
{
#if WORDS_BIGENDIAN
    const uint32_t upper = a, lower = b;
#else
    const uint32_t upper = b, lower = a;
#endif
    return lower + ((uint64_t)upper<<32);
}
/* Pixel-packing aliases: pick the pack helper one size up when HIGH_BIT_DEPTH
 * is set (16->32 and 32->64) and the 8->16 / 16->32 helpers otherwise.
 * NOTE(review): this presumably tracks sizeof(pixel) - confirm where pixel is
 * defined. */
#if HIGH_BIT_DEPTH
# define pack_pixel_1to2 pack16to32
# define pack_pixel_2to4 pack32to64
#else
# define pack_pixel_1to2 pack8to16
# define pack_pixel_2to4 pack16to32
#endif
/* Predict the intra 4x4 mode of block idx from two cached entries: the ones
 * 1 and 8 slots before x264_scan8[idx] in h->mb.cache.intra4x4_pred_mode.
 * NOTE(review): those offsets presumably address the left and top neighbours -
 * confirm against the x264_scan8 layout.
 * Returns the smaller of the two fixed-up modes, or I_PRED_4x4_DC when that
 * minimum is negative. */
static ALWAYS_INLINE int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
{
    const int pos    = x264_scan8[idx];
    const int mode_a = h->mb.cache.intra4x4_pred_mode[pos - 1];
    const int mode_b = h->mb.cache.intra4x4_pred_mode[pos - 8];
    const int lowest = X264_MIN( x264_mb_pred_mode4x4_fix(mode_a),
                                 x264_mb_pred_mode4x4_fix(mode_b) );

    return lowest < 0 ? I_PRED_4x4_DC : lowest;
}
/* Predict a non-zero count for block idx from the cached counts 1 and 8 slots
 * before x264_scan8[idx]. When the sum is below 0x80 it is replaced by the
 * rounded-up average of the two; in every case the result is masked to 7 bits. */
static ALWAYS_INLINE int x264_mb_predict_non_zero_code( x264_t *h, int idx )
{
    const int pos = x264_scan8[idx];
    int total = h->mb.cache.non_zero_count[pos - 1]
              + h->mb.cache.non_zero_count[pos - 8];

    if( total < 0x80 )
        total = ( total + 1 ) >> 1;

    return total & 0x7f;
}
/* intra and skip are disallowed, p8x8 is conditional. */
/* Indexed by mb_class_e: 0 = not allowed, 1 = allowed, 2 = conditional. */
static const uint8_t x264_transform_allowed[X264_MBTYPE_MAX] =
{
    0, 0, 0, 0,                 /* I_4x4, I_8x8, I_16x16, I_PCM */
    1,                          /* P_L0 */
    2,                          /* P_8x8 */
    0,                          /* P_SKIP */
    1,                          /* B_DIRECT */
    1, 1, 1, 1, 1, 1, 1, 1, 1,  /* B_L0_L0 .. B_BI_BI */
    1,                          /* B_8x8 */
    0                           /* B_SKIP */
};
/* x264_mb_transform_8x8_allowed:
 *      check whether any partition is smaller than 8x8 (or at least
 *      might be, according to just partition type.)
 *      doesn't check for cbp
 * Returns 0 when the PPS has b_transform_8x8_mode off. For P_8x8 the answer
 * is whether all four sub-partition bytes equal D_L0_8x8; for every other
 * type it is the x264_transform_allowed[] entry. */
static ALWAYS_INLINE int x264_mb_transform_8x8_allowed( x264_t *h )
{
    if( !h->pps->b_transform_8x8_mode )
        return 0;

    if( h->mb.i_type == P_8x8 )
        /* compare the four packed sub-partition bytes in one 32-bit load */
        return M32( h->mb.i_sub_partition ) == D_L0_8x8*0x01010101;

    return x264_transform_allowed[h->mb.i_type];
}
#endif

784
common/mc.c Normal file
View File

@@ -0,0 +1,784 @@
/*****************************************************************************
* mc.c: motion compensation
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common.h"
#if HAVE_MMX
#include "x86/mc.h"
#endif
#if HAVE_ALTIVEC
#include "ppc/mc.h"
#endif
#if HAVE_ARMV6
#include "arm/mc.h"
#endif
#if HAVE_AARCH64
#include "aarch64/mc.h"
#endif
#if HAVE_MSA
#include "mips/mc.h"
#endif
#if HAVE_LSX
# include "loongarch/mc.h"
#endif
/* Write the rounded-up average of src1 and src2 into dst for an
 * i_width x i_height block; each buffer advances by its own stride per row. */
static inline void pixel_avg( pixel *dst,  intptr_t i_dst_stride,
                              pixel *src1, intptr_t i_src1_stride,
                              pixel *src2, intptr_t i_src2_stride, int i_width, int i_height )
{
    for( int row = 0; row < i_height; row++, dst += i_dst_stride,
                                             src1 += i_src1_stride,
                                             src2 += i_src2_stride )
        for( int col = 0; col < i_width; col++ )
            dst[col] = ( src1[col] + src2[col] + 1 ) >> 1;
}
/* Unweighted width x height average of two sources: dst = (src1+src2+1)>>1. */
static inline void pixel_avg_wxh( pixel *dst,  intptr_t i_dst,
                                  pixel *src1, intptr_t i_src1,
                                  pixel *src2, intptr_t i_src2, int width, int height )
{
    for( int row = 0; row < height; row++, src1 += i_src1, src2 += i_src2, dst += i_dst )
        for( int col = 0; col < width; col++ )
            dst[col] = ( src1[col] + src2[col] + 1 ) >> 1;
}
/* Implicit weighted bipred only:
 * assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */
/* dst = clip( (src1*i_weight1 + src2*(64-i_weight1) + 32) >> 6 ) per pixel. */
static inline void pixel_avg_weight_wxh( pixel *dst,  intptr_t i_dst,
                                         pixel *src1, intptr_t i_src1,
                                         pixel *src2, intptr_t i_src2, int width, int height, int i_weight1 )
{
    const int i_weight2 = 64 - i_weight1;

    for( int row = 0; row < height; row++ )
    {
        for( int col = 0; col < width; col++ )
        {
            const int blended = src1[col]*i_weight1 + src2[col]*i_weight2 + (1<<5);
            dst[col] = x264_clip_pixel( blended >> 6 );
        }
        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src2;
    }
}
#undef op_scale2

/* Generates a fixed-size averaging function `name` for a width x height
 * block. A weight of 32 selects the plain rounded average; any other weight
 * goes through the weighted path with `weight` as the first source's weight. */
#define PIXEL_AVG_C( name, width, height ) \
static void name( pixel *pix1, intptr_t i_stride_pix1, \
                  pixel *pix2, intptr_t i_stride_pix2, \
                  pixel *pix3, intptr_t i_stride_pix3, int weight ) \
{ \
    if( weight != 32 ) \
        pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, weight ); \
    else \
        pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
}

/* one instance per supported block size */
PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
PIXEL_AVG_C( pixel_avg_16x8,  16,  8 )
PIXEL_AVG_C( pixel_avg_8x16,   8, 16 )
PIXEL_AVG_C( pixel_avg_8x8,    8,  8 )
PIXEL_AVG_C( pixel_avg_8x4,    8,  4 )
PIXEL_AVG_C( pixel_avg_4x16,   4, 16 )
PIXEL_AVG_C( pixel_avg_4x8,    4,  8 )
PIXEL_AVG_C( pixel_avg_4x4,    4,  4 )
PIXEL_AVG_C( pixel_avg_4x2,    4,  2 )
PIXEL_AVG_C( pixel_avg_2x8,    2,  8 )
PIXEL_AVG_C( pixel_avg_2x4,    2,  4 )
PIXEL_AVG_C( pixel_avg_2x2,    2,  2 )
/* Point the weight descriptor's function table at the one currently installed
 * in h->mc.weight. Nothing else in *w is touched. */
static void weight_cache( x264_t *h, x264_weight_t *w )
{
w->weightfn = h->mc.weight;
}
/* Per-pixel weighting expressions; both rely on dst, src, scale, denom and
 * offset being in scope at the point of use. */
#define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * scale + (1<<(denom - 1))) >> denom) + offset )
#define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * scale + offset )

/* Apply explicit weighted prediction to an i_width x i_height block:
 * dst = clip( round(src*scale / 2^denom) + offset ), with the rounding shift
 * skipped when denom is below 1. The offset from the weight descriptor is
 * scaled by 2^(BIT_DEPTH-8). */
static void mc_weight( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
                       const x264_weight_t *weight, int i_width, int i_height )
{
    const int scale  = weight->i_scale;
    const int denom  = weight->i_denom;
    const int offset = weight->i_offset * (1 << (BIT_DEPTH-8));

    if( denom < 1 )
    {
        /* no denominator: plain multiply-add */
        for( int row = 0; row < i_height; row++ )
        {
            for( int col = 0; col < i_width; col++ )
                opscale_noden( col );
            dst += i_dst_stride;
            src += i_src_stride;
        }
        return;
    }

    for( int row = 0; row < i_height; row++ )
    {
        for( int col = 0; col < i_width; col++ )
            opscale( col );
        dst += i_dst_stride;
        src += i_src_stride;
    }
}
/* Generates a fixed-width wrapper `name` around mc_weight(); the height stays
 * a run-time argument. */
#define MC_WEIGHT_C( name, width ) \
static void name( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, const x264_weight_t *weight, int height ) \
{ \
mc_weight( dst, i_dst_stride, src, i_src_stride, weight, width, height );\
}
/* one wrapper per supported width */
MC_WEIGHT_C( mc_weight_w20, 20 )
MC_WEIGHT_C( mc_weight_w16, 16 )
MC_WEIGHT_C( mc_weight_w12, 12 )
MC_WEIGHT_C( mc_weight_w8, 8 )
MC_WEIGHT_C( mc_weight_w4, 4 )
MC_WEIGHT_C( mc_weight_w2, 2 )
/* Wrapper table ordered by increasing width: 2, 4, 8, 12, 16, 20. */
static weight_fn_t mc_weight_wtab[6] =
{
mc_weight_w2,
mc_weight_w4,
mc_weight_w8,
mc_weight_w12,
mc_weight_w16,
mc_weight_w20,
};
/* Copy an i_width x i_height block row by row. Note the argument order:
 * source first, destination second. */
static void mc_copy( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, int i_width, int i_height )
{
    for( int row = 0; row < i_height; row++, src += i_src_stride, dst += i_dst_stride )
        memcpy( dst, src, i_width * SIZEOF_PIXEL );
}
/* 6-tap filter (1, -5, 20, 20, -5, 1) around position x with tap spacing d.
 * Relies on a variable named x being in scope at the point of use. */
#define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d]))

/* Build three filtered planes from src, one row at a time:
 *   dstv - vertical filter, dsth - horizontal filter,
 *   dstc - horizontal filter applied to the unrounded vertical results.
 * buf holds the unrounded vertical results for columns -2 .. width+2 of the
 * current row (buf[x+2] for column x). */
static void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
                         intptr_t stride, int width, int height, int16_t *buf )
{
    /* bias that keeps the intermediate inside int16_t at higher bit depths */
    const int pad = (BIT_DEPTH > 9) ? (-10 * PIXEL_MAX) : 0;

    for( int row = 0; row < height; row++, dsth += stride, dstv += stride,
                                           dstc += stride, src += stride )
    {
        /* vertical pass, widened so the centre pass has its neighbours */
        for( int x = -2; x < width+3; x++ )
        {
            const int v = TAPFILTER(src,stride);
            dstv[x] = x264_clip_pixel( (v + 16) >> 5 );
            /* transform v for storage in a 16-bit integer */
            buf[x+2] = v + pad;
        }

        /* centre pass: undo the bias (the taps sum to 32, hence 32*pad) */
        for( int x = 0; x < width; x++ )
            dstc[x] = x264_clip_pixel( (TAPFILTER(buf+2,1) - 32*pad + 512) >> 10 );

        /* horizontal pass */
        for( int x = 0; x < width; x++ )
            dsth[x] = x264_clip_pixel( (TAPFILTER(src,1) + 16) >> 5 );
    }
}
/* Luma motion compensation into dst.
 * The low two bits of mvx/mvy select one of 16 sub-positions (qpel_idx); the
 * remaining bits give the whole-pixel offset into the source planes.
 * x264_hpel_ref0/1 pick which of the four src planes to read. When bits 0 or
 * 2 of qpel_idx are set, two planes are averaged; otherwise one plane is
 * either weighted or copied. Weighting applies whenever weight->weightfn is
 * set. */
static void mc_luma( pixel *dst,    intptr_t i_dst_stride,
                     pixel *src[4], intptr_t i_src_stride,
                     int mvx, int mvy,
                     int i_width, int i_height, const x264_weight_t *weight )
{
    const int frac_x   = mvx&3;
    const int frac_y   = mvy&3;
    const int qpel_idx = (frac_y<<2) + frac_x;
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + (frac_y == 3) * i_src_stride;

    if( !(qpel_idx & 5) )
    {
        /* single plane is enough */
        if( weight->weightfn )
            mc_weight( dst, i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
        else
            mc_copy( src1, i_src_stride, dst, i_dst_stride, i_width, i_height );
        return;
    }

    /* qpel interpolation needed */
    pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
    pixel_avg( dst, i_dst_stride, src1, i_src_stride,
               src2, i_src_stride, i_width, i_height );
    if( weight->weightfn )
        mc_weight( dst, i_dst_stride, dst, i_dst_stride, weight, i_width, i_height );
}
/* Like mc_luma, but avoids the copy when no processing is needed.
 * Returns dst (filled, stride unchanged) when averaging and/or weighting was
 * performed; otherwise returns a pointer straight into the source plane and
 * overwrites *i_dst_stride with i_src_stride. */
static pixel *get_ref( pixel *dst,   intptr_t *i_dst_stride,
                       pixel *src[4], intptr_t i_src_stride,
                       int mvx, int mvy,
                       int i_width, int i_height, const x264_weight_t *weight )
{
    const int frac_x   = mvx&3;
    const int frac_y   = mvy&3;
    const int qpel_idx = (frac_y<<2) + frac_x;
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + (frac_y == 3) * i_src_stride;

    if( qpel_idx & 5 ) /* qpel interpolation needed */
    {
        pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
        pixel_avg( dst, *i_dst_stride, src1, i_src_stride,
                   src2, i_src_stride, i_width, i_height );
        if( weight->weightfn )
            mc_weight( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_width, i_height );
        return dst;
    }

    if( weight->weightfn )
    {
        mc_weight( dst, *i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
        return dst;
    }

    /* nothing to compute: hand back the reference itself */
    *i_dst_stride = i_src_stride;
    return src1;
}
/* full chroma mc (ie until 1/8 pixel)*/
/* Bilinear interpolation of an interleaved two-component source into two
 * separate planes. The low three bits of mvx/mvy are the 1/8 fractions, the
 * rest the whole-sample offset. Even source samples feed dstu, odd samples
 * feed dstv, which is why horizontal steps are 2 source elements wide. */
static void mc_chroma( pixel *dstu, pixel *dstv, intptr_t i_dst_stride,
                       pixel *src, intptr_t i_src_stride,
                       int mvx, int mvy,
                       int i_width, int i_height )
{
    const int d8x = mvx&0x07;
    const int d8y = mvy&0x07;
    /* weights of the four surrounding samples; they sum to 64 */
    const int cA = (8-d8x)*(8-d8y);
    const int cB = d8x    *(8-d8y);
    const int cC = (8-d8x)*d8y;
    const int cD = d8x    *d8y;

    pixel *row0 = src + ((mvy >> 3) * i_src_stride + (mvx >> 3)*2);
    pixel *row1 = &row0[i_src_stride];

    for( int y = 0; y < i_height; y++ )
    {
        for( int x = 0; x < i_width; x++ )
        {
            dstu[x] = ( cA*row0[2*x]   + cB*row0[2*x+2] +
                        cC*row1[2*x]   + cD*row1[2*x+2] + 32 ) >> 6;
            dstv[x] = ( cA*row0[2*x+1] + cB*row0[2*x+3] +
                        cC*row1[2*x+1] + cD*row1[2*x+3] + 32 ) >> 6;
        }
        dstu += i_dst_stride;
        dstv += i_dst_stride;
        /* slide the two-row window down by one source row */
        row0 = row1;
        row1 += i_src_stride;
    }
}
/* Generates mc_copy_w<W>: a fixed-width copy taking (dst, src) in that order
 * and forwarding to mc_copy(), whose own argument order is (src, dst). */
#define MC_COPY(W) \
static void mc_copy_w##W( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int i_height ) \
{ \
mc_copy( src, i_src, dst, i_dst, W, i_height ); \
}
/* instantiated for widths 16, 8 and 4 */
MC_COPY( 16 )
MC_COPY( 8 )
MC_COPY( 4 )
/* Copy h rows of w pixels from src to dst; each pointer advances by its own
 * stride after every row. */
void x264_plane_copy_c( pixel *dst, intptr_t i_dst,
                        pixel *src, intptr_t i_src, int w, int h )
{
    for( int rows_left = h; rows_left != 0; rows_left-- )
    {
        memcpy( dst, src, w * SIZEOF_PIXEL );
        dst += i_dst;
        src += i_src;
    }
}
/* Copy h rows of w pixel pairs, exchanging the two pixels of every pair
 * (w counts pairs, so 2*w pixels are written per row). */
void x264_plane_copy_swap_c( pixel *dst, intptr_t i_dst,
                             pixel *src, intptr_t i_src, int w, int h )
{
    for( int row = 0; row < h; row++ )
    {
        for( int pair = 0; pair < w; pair++ )
        {
            dst[2*pair]   = src[2*pair+1];
            dst[2*pair+1] = src[2*pair];
        }
        dst += i_dst;
        src += i_src;
    }
}
/* Merge two w x h planes into one interleaved plane: srcu supplies the even
 * output samples and srcv the odd ones. */
void x264_plane_copy_interleave_c( pixel *dst,  intptr_t i_dst,
                                   pixel *srcu, intptr_t i_srcu,
                                   pixel *srcv, intptr_t i_srcv, int w, int h )
{
    for( int row = 0; row < h; row++ )
    {
        pixel *out = dst;
        for( int col = 0; col < w; col++ )
        {
            *out++ = srcu[col];
            *out++ = srcv[col];
        }
        dst  += i_dst;
        srcu += i_srcu;
        srcv += i_srcv;
    }
}
/* Split an interleaved plane into two w x h planes: even source samples go to
 * dsta, odd source samples to dstb. */
void x264_plane_copy_deinterleave_c( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
                                     pixel *src,  intptr_t i_src, int w, int h )
{
    for( int row = 0; row < h; row++ )
    {
        const pixel *in = src;
        for( int col = 0; col < w; col++, in += 2 )
        {
            dsta[col] = in[0];
            dstb[col] = in[1];
        }
        dsta += i_dsta;
        dstb += i_dstb;
        src  += i_src;
    }
}
/* Split packed pixels into three w x h planes. pw is the distance, in
 * samples, between consecutive packed pixels; the first three samples of each
 * go to dsta, dstb and dstc respectively. */
static void plane_copy_deinterleave_rgb_c( pixel *dsta, intptr_t i_dsta,
                                           pixel *dstb, intptr_t i_dstb,
                                           pixel *dstc, intptr_t i_dstc,
                                           pixel *src,  intptr_t i_src, int pw, int w, int h )
{
    for( int row = 0; row < h; row++ )
    {
        for( int col = 0; col < w; col++ )
        {
            const int base = col*pw;
            dsta[col] = src[base];
            dstb[col] = src[base+1];
            dstc[col] = src[base+2];
        }
        dsta += i_dsta;
        dstb += i_dstb;
        dstc += i_dstc;
        src  += i_src;
    }
}
/* v210_endian_fix32: identity when WORDS_BIGENDIAN is not set; otherwise a
 * 32-bit byte reversal, so the v210 unpacking code always sees the same bit
 * layout in each word. */
#if !WORDS_BIGENDIAN
#define v210_endian_fix32(x) (x)
#else
static ALWAYS_INLINE uint32_t v210_endian_fix32( uint32_t x )
{
    const uint32_t byte0_up   = x<<24;
    const uint32_t byte1_up   = (x<<8)&0xff0000;
    const uint32_t byte2_down = (x>>8)&0xff00;
    const uint32_t byte3_down = x>>24;
    return byte0_up + byte1_up + byte2_down + byte3_down;
}
#endif
/* Unpack 32-bit words holding three 10-bit samples each into two planes.
 * Words are consumed in pairs; the six samples of a pair go, in order, to
 * dstc, dsty, dstc, dsty, dstc, dsty, so each pair yields 3 dsty and 3 dstc
 * samples. The inner loop advances n by 3 per pair until n reaches w. */
static void plane_copy_deinterleave_v210_c( pixel *dsty, intptr_t i_dsty,
                                            pixel *dstc, intptr_t i_dstc,
                                            uint32_t *src, intptr_t i_src, int w, int h )
{
    for( int row = 0; row < h; row++, dsty += i_dsty, dstc += i_dstc, src += i_src )
    {
        pixel *out_y = dsty;
        pixel *out_c = dstc;
        uint32_t *in = src;

        for( int n = 0; n < w; n += 3, in += 2, out_y += 3, out_c += 3 )
        {
            uint32_t word = v210_endian_fix32( in[0] );
            out_c[0] = word & 0x03FF;
            out_y[0] = (word >> 10) & 0x03FF;
            out_c[1] = (word >> 20) & 0x03FF;

            word = v210_endian_fix32( in[1] );
            out_y[1] = word & 0x03FF;
            out_c[2] = (word >> 10) & 0x03FF;
            out_y[2] = (word >> 20) & 0x03FF;
        }
    }
}
/* Interleave two 8-wide blocks (both with row pitch FDEC_STRIDE) into dst:
 * srcu supplies the even output samples, srcv the odd ones. */
static void store_interleave_chroma( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height )
{
    for( int row = 0; row < height; row++ )
    {
        pixel *out = dst;
        for( int col = 0; col < 8; col++ )
        {
            *out++ = srcu[col];
            *out++ = srcv[col];
        }
        dst  += i_dst;
        srcu += FDEC_STRIDE;
        srcv += FDEC_STRIDE;
    }
}
/* Deinterleave an 8-wide block into a buffer with row pitch FENC_STRIDE: the
 * even samples land at dst and the odd samples half a row (FENC_STRIDE/2)
 * further along the same rows. */
static void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
{
    pixel *even_plane = dst;
    pixel *odd_plane  = dst+FENC_STRIDE/2;
    x264_plane_copy_deinterleave_c( even_plane, FENC_STRIDE, odd_plane, FENC_STRIDE,
                                    src, i_src, 8, height );
}
/* Deinterleave an 8-wide block into a buffer with row pitch FDEC_STRIDE: the
 * even samples land at dst and the odd samples half a row (FDEC_STRIDE/2)
 * further along the same rows. */
static void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
{
    pixel *even_plane = dst;
    pixel *odd_plane  = dst+FDEC_STRIDE/2;
    x264_plane_copy_deinterleave_c( even_plane, FDEC_STRIDE, odd_plane, FDEC_STRIDE,
                                    src, i_src, 8, height );
}
/* No-op prefetch implementations: both bodies are intentionally empty and
 * ignore every argument.
 * NOTE(review): presumably the defaults installed where no platform-specific
 * prefetch exists - confirm where the function table is filled in. */
static void prefetch_fenc_null( pixel *pix_y, intptr_t stride_y,
pixel *pix_uv, intptr_t stride_uv, int mb_x )
{}
static void prefetch_ref_null( pixel *pix, intptr_t stride, int parity )
{}
/* Zero n bytes at dst. This C version is a plain memset and does not itself
 * depend on any alignment of dst or n, despite the name. */
static void memzero_aligned( void * dst, size_t n )
{
memset( dst, 0, n );
}
/* One row of a running-sum image with 4-wide horizontal windows:
 * sum[x] = (pix[x] + .. + pix[x+3]) + sum[x-stride], truncated to 16 bits,
 * for x in [0, stride-4). The row one stride before sum must already be
 * filled. */
static void integral_init4h( uint16_t *sum, pixel *pix, intptr_t stride )
{
    const uint16_t *above = sum - stride;
    int window = pix[0]+pix[1]+pix[2]+pix[3];

    for( int x = 0; x < stride-4; x++ )
    {
        sum[x] = (uint16_t)(window + above[x]);
        /* slide the window one pixel to the right */
        window += pix[x+4] - pix[x];
    }
}
/* One row of a running-sum image with 8-wide horizontal windows:
 * sum[x] = (pix[x] + .. + pix[x+7]) + sum[x-stride], truncated to 16 bits,
 * for x in [0, stride-8). The row one stride before sum must already be
 * filled. */
static void integral_init8h( uint16_t *sum, pixel *pix, intptr_t stride )
{
    const uint16_t *above = sum - stride;
    int window = 0;
    for( int k = 0; k < 8; k++ )
        window += pix[k];

    for( int x = 0; x < stride-8; x++ )
    {
        sum[x] = (uint16_t)(window + above[x]);
        /* slide the window one pixel to the right */
        window += pix[x+8] - pix[x];
    }
}
/* Turn running column sums into window sums for x in [0, stride-8):
 *   sum4[x] = sum8 four rows down minus sum8 at the current row;
 *   sum8[x] = (eight rows down at x and x+4) minus (current row at x and x+4).
 * All results are truncated to 16 bits. The sum4 pass runs to completion
 * before sum8 is overwritten. */
static void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
{
    const intptr_t count = stride-8;

    for( int x = 0; x < count; x++ )
        sum4[x] = (uint16_t)(sum8[x+4*stride] - sum8[x]);

    for( int x = 0; x < count; x++ )
    {
        const int below = sum8[x+8*stride] + sum8[x+8*stride+4];
        sum8[x] = (uint16_t)(below - sum8[x] - sum8[x+4]);
    }
}
/* Turn running column sums into 8-row window sums in place:
 * sum8[x] = sum8 eight rows down minus sum8 at the current row, truncated to
 * 16 bits, for x in [0, stride-8). */
static void integral_init8v( uint16_t *sum8, intptr_t stride )
{
    const uint16_t *below = sum8 + 8*stride;
    const intptr_t count = stride-8;

    for( int x = 0; x < count; x++ )
        sum8[x] = (uint16_t)(below[x] - sum8[x]);
}
/* Prepare the low-resolution planes and per-frame lookahead state:
 *  1. pad plane 0 by one column and one row (duplicates of the last ones),
 *  2. run the installed frame_init_lowres_core and expand the lowres borders,
 *  3. reset i_cost_est to -1 bytes, the first i_row_satds entry of each
 *     (y,x) pair to -1 and the first lowres_mvs component to 0x7FFF. */
void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
{
    pixel *src = frame->plane[0];
    int i_stride = frame->i_stride[0];
    int i_height = frame->i_lines[0];
    int i_width = frame->i_width[0];
    const int i_bframe = h->param.i_bframe;

    // duplicate last row and column so that their interpolation doesn't have to be special-cased
    for( int y = 0; y < i_height; y++ )
    {
        pixel *row = src + y*i_stride;
        row[i_width] = row[i_width-1];
    }
    pixel *pad_row = src+i_stride*i_height;
    memcpy( pad_row, src+i_stride*(i_height-1), (i_width+1) * SIZEOF_PIXEL );

    h->mc.frame_init_lowres_core( src, frame->lowres[0], frame->lowres[1], frame->lowres[2], frame->lowres[3],
                                  i_stride, frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres );
    x264_frame_expand_border_lowres( frame );

    memset( frame->i_cost_est, -1, sizeof(frame->i_cost_est) );

    for( int y = 0; y < i_bframe + 2; y++ )
        for( int x = 0; x < i_bframe + 2; x++ )
            frame->i_row_satds[y][x][0] = -1;

    /* y covers index 0 only, or 0 and 1 when B-frames are enabled */
    for( int y = 0; y <= !!i_bframe; y++ )
        for( int x = 0; x <= i_bframe; x++ )
            frame->lowres_mvs[y][x][0][0] = 0x7FFF;
}
/* Downsample src0 by two in each direction into four planes. Each output
 * sample averages a 2x2 source neighbourhood:
 *   dst0 - anchored at (2x, 2y),     dsth - shifted one source column right,
 *   dstv - shifted one source row down, dstc - shifted by both.
 * Three consecutive source rows are read per output row. */
static void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
                                    intptr_t src_stride, intptr_t dst_stride, int width, int height )
{
    // slower than naive bilinear, but matches asm
#define FILTER(a,b,c,d) ((((a+b+1)>>1)+((c+d+1)>>1)+1)>>1)
    for( int row = 0; row < height; row++ )
    {
        pixel *src1 = src0+src_stride;
        pixel *src2 = src1+src_stride;

        for( int col = 0; col < width; col++ )
        {
            const int s = 2*col;
            dst0[col] = FILTER(src0[s  ], src1[s  ], src0[s+1], src1[s+1]);
            dsth[col] = FILTER(src0[s+1], src1[s+1], src0[s+2], src1[s+2]);
            dstv[col] = FILTER(src1[s  ], src2[s  ], src1[s+1], src2[s+1]);
            dstc[col] = FILTER(src1[s+1], src2[s+1], src1[s+2], src2[s+2]);
        }

        src0 += src_stride*2;
        dst0 += dst_stride;
        dsth += dst_stride;
        dstv += dst_stride;
        dstc += dst_stride;
    }
#undef FILTER
}
/* Estimate the total amount of influence on future quality that could be had if we
 * were to improve the reference samples used to inter predict any given macroblock. */
/* Per element i:
 *   amount = propagate_in + intra_cost * inv_qscale * fps
 *   dst    = min( round( amount * (intra_cost - inter_cost) / intra_cost ), 32767 )
 * where inter_cost is masked with LOWRES_COST_MASK and capped at intra_cost.
 * The arithmetic is single precision. */
static void mbtree_propagate_cost( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                   uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
{
    const float fps = *fps_factor;

    for( int i = 0; i < len; i++ )
    {
        const int intra_cost = intra_costs[i];
        const int inter_cost = X264_MIN(intra_costs[i], inter_costs[i] & LOWRES_COST_MASK);

        float propagate_intra  = intra_cost * inv_qscales[i];
        float propagate_amount = propagate_in[i] + propagate_intra*fps;
        float propagate_num    = intra_cost - inter_cost;
        float propagate_denom  = intra_cost;

        const int rounded = (int)(propagate_amount * propagate_num / propagate_denom + 0.5f);
        dst[i] = X264_MIN(rounded, 32767);
    }
}
/* Distribute the propagate amounts of one macroblock row into ref_costs for the
 * given reference list.
 * Each macroblock's amount is split between up to four neighbouring macroblocks
 * of the reference, weighted bilinearly by the low 5 bits of its motion vector;
 * targets that fall outside the macroblock grid are dropped.
 *   mvs / propagate_amount / lowres_costs: per-macroblock inputs for row mb_y (len entries)
 *   bipred_weight: 6-bit fixed-point weight applied when both lists are used
 *   list:          which list bit of lowres_costs selects the entries to process */
static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
                                   int16_t *propagate_amount, uint16_t *lowres_costs,
                                   int bipred_weight, int mb_y, int len, int list )
{
    const unsigned mb_stride = h->mb.i_mb_stride;
    const unsigned mb_width  = h->mb.i_mb_width;
    const unsigned mb_height = h->mb.i_mb_height;
    const int list_bit = 1 << list;

    for( int i = 0; i < len; i++ )
    {
        const int used = lowres_costs[i]>>LOWRES_COST_SHIFT;
        if( !(used & list_bit) )
            continue;

        int amount = propagate_amount[i];
        /* Apply bipred weighting. */
        if( used == 3 )
            amount = (amount * bipred_weight + 32) >> 6;

        /* Early termination for simple case of mv0. */
        if( !M32( mvs[i] ) )
        {
            MC_CLIP_ADD( ref_costs[mb_y*mb_stride + i], amount );
            continue;
        }

        const int mvx = mvs[i][0];
        const int mvy = mvs[i][1];

        /* Integer macroblock position the vector points at (wraps when negative). */
        const unsigned mbx = (unsigned)((mvx>>5)+i);
        const unsigned mby = (unsigned)((mvy>>5)+mb_y);
        const unsigned top = mbx + mby * mb_stride;
        const unsigned bot = top + mb_stride;

        /* Fractional part of the vector selects the bilinear split. */
        const int fx = mvx & 31;
        const int fy = mvy & 31;
        const int w_tl = ((32-fy)*(32-fx) * amount + 512) >> 10;
        const int w_tr = ((32-fy)*fx      * amount + 512) >> 10;
        const int w_bl = (fy*(32-fx)      * amount + 512) >> 10;
        const int w_br = (fy*fx           * amount + 512) >> 10;

        if( mbx < mb_width-1 && mby < mb_height-1 )
        {
            /* All four targets are inside the grid. */
            MC_CLIP_ADD( ref_costs[top+0], w_tl );
            MC_CLIP_ADD( ref_costs[top+1], w_tr );
            MC_CLIP_ADD( ref_costs[bot+0], w_bl );
            MC_CLIP_ADD( ref_costs[bot+1], w_br );
            continue;
        }

        /* Note: this takes advantage of unsigned representation to
         * catch negative mbx/mby. */
        const int left_in  = mbx < mb_width;
        const int right_in = mbx+1 < mb_width;

        if( mby < mb_height )
        {
            if( left_in )
            {
                MC_CLIP_ADD( ref_costs[top+0], w_tl );
            }
            if( right_in )
            {
                MC_CLIP_ADD( ref_costs[top+1], w_tr );
            }
        }
        if( mby+1 < mb_height )
        {
            if( left_in )
            {
                MC_CLIP_ADD( ref_costs[bot+0], w_bl );
            }
            if( right_in )
            {
                MC_CLIP_ADD( ref_costs[bot+1], w_br );
            }
        }
    }
}
/* Conversion between float and Q8.8 fixed point (big-endian) for storage */

/* Pack count floats from src into dst: each value is scaled by 256, truncated
 * to a signed 16-bit integer and passed through endian_fix16(). */
static void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
{
    for( int n = 0; n < count; n++ )
    {
        int16_t fixed = (int16_t)(src[n] * 256.0f);
        dst[n] = endian_fix16( fixed );
    }
}
/* Inverse of the Q8.8 packing: each stored word goes through endian_fix16(),
 * is reinterpreted as a signed 16-bit value and scaled by 1/256 into dst. */
static void mbtree_fix8_unpack( float *dst, uint16_t *src, int count )
{
    for( int n = 0; n < count; n++ )
    {
        int16_t fixed = (int16_t)endian_fix16( src[n] );
        dst[n] = fixed * (1.0f/256.0f);
    }
}
/* Populate the motion-compensation function table *pf.
 *
 * The portable C implementations are installed first. The architecture-specific
 * initializers that follow (compiled in through the HAVE_* guards) are handed pf
 * and may replace entries, so statement order matters here. When cpu_independent
 * is non-zero the C versions of the two mbtree propagation routines are
 * re-installed at the very end (presumably so results do not depend on the host
 * CPU -- confirm with callers).
 *
 *   cpu:             CPU capability flags forwarded to the arch initializers
 *   pf:              table to fill in
 *   cpu_independent: non-zero to force the C mbtree propagation routines */
void x264_mc_init( uint32_t cpu, x264_mc_functions_t *pf, int cpu_independent )
{
/* Core luma/chroma motion compensation. */
pf->mc_luma = mc_luma;
pf->get_ref = get_ref;
pf->mc_chroma = mc_chroma;
/* Pixel averaging, indexed by partition size. */
pf->avg[PIXEL_16x16]= pixel_avg_16x16;
pf->avg[PIXEL_16x8] = pixel_avg_16x8;
pf->avg[PIXEL_8x16] = pixel_avg_8x16;
pf->avg[PIXEL_8x8] = pixel_avg_8x8;
pf->avg[PIXEL_8x4] = pixel_avg_8x4;
pf->avg[PIXEL_4x16] = pixel_avg_4x16;
pf->avg[PIXEL_4x8] = pixel_avg_4x8;
pf->avg[PIXEL_4x4] = pixel_avg_4x4;
pf->avg[PIXEL_4x2] = pixel_avg_4x2;
pf->avg[PIXEL_2x8] = pixel_avg_2x8;
pf->avg[PIXEL_2x4] = pixel_avg_2x4;
pf->avg[PIXEL_2x2] = pixel_avg_2x2;
/* Weighted prediction: the C build uses one table for all three variants. */
pf->weight = mc_weight_wtab;
pf->offsetadd = mc_weight_wtab;
pf->offsetsub = mc_weight_wtab;
pf->weight_cache = weight_cache;
/* Block copies; only the 16x16, 8x8 and 4x4 slots are filled. */
pf->copy_16x16_unaligned = mc_copy_w16;
pf->copy[PIXEL_16x16] = mc_copy_w16;
pf->copy[PIXEL_8x8] = mc_copy_w8;
pf->copy[PIXEL_4x4] = mc_copy_w4;
/* Chroma (de)interleaving helpers. */
pf->store_interleave_chroma = store_interleave_chroma;
pf->load_deinterleave_chroma_fenc = load_deinterleave_chroma_fenc;
pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec;
/* Whole-plane copies and conversions. The YUYV entry reuses the generic
 * deinterleave routine in the C build. */
pf->plane_copy = x264_plane_copy_c;
pf->plane_copy_swap = x264_plane_copy_swap_c;
pf->plane_copy_interleave = x264_plane_copy_interleave_c;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c;
pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_c;
pf->plane_copy_deinterleave_rgb = plane_copy_deinterleave_rgb_c;
pf->plane_copy_deinterleave_v210 = plane_copy_deinterleave_v210_c;
pf->hpel_filter = hpel_filter;
/* Prefetch hooks default to the *_null implementations in the C build. */
pf->prefetch_fenc_400 = prefetch_fenc_null;
pf->prefetch_fenc_420 = prefetch_fenc_null;
pf->prefetch_fenc_422 = prefetch_fenc_null;
pf->prefetch_ref = prefetch_ref_null;
pf->memcpy_aligned = memcpy;
pf->memzero_aligned = memzero_aligned;
pf->frame_init_lowres_core = frame_init_lowres_core;
/* Integral-image helpers. */
pf->integral_init4h = integral_init4h;
pf->integral_init8h = integral_init8h;
pf->integral_init4v = integral_init4v;
pf->integral_init8v = integral_init8v;
/* Macroblock-tree helpers. */
pf->mbtree_propagate_cost = mbtree_propagate_cost;
pf->mbtree_propagate_list = mbtree_propagate_list;
pf->mbtree_fix8_pack = mbtree_fix8_pack;
pf->mbtree_fix8_unpack = mbtree_fix8_unpack;
/* Architecture-specific initializers; each may overwrite the defaults above. */
#if HAVE_MMX
x264_mc_init_mmx( cpu, pf );
#endif
#if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC )
x264_mc_init_altivec( pf );
#endif
#if HAVE_ARMV6
x264_mc_init_arm( cpu, pf );
#endif
#if HAVE_AARCH64
x264_mc_init_aarch64( cpu, pf );
#endif
#if HAVE_MSA
if( cpu&X264_CPU_MSA )
x264_mc_init_mips( cpu, pf );
#endif
#if HAVE_LSX
x264_mc_init_loongarch( cpu, pf );
#endif
/* Undo any arch-specific replacement of the mbtree propagation routines. */
if( cpu_independent )
{
pf->mbtree_propagate_cost = mbtree_propagate_cost;
pf->mbtree_propagate_list = mbtree_propagate_list;
}
}
/* Run the half-pel filter over a band of rows of `frame` and, when the frame
 * has an integral image, extend that image over the same band.
 *
 *   mb_y:  macroblock row that determines the band (rows start at mb_y*16 - 8)
 *   b_end: non-zero for the last call on a frame; the band then runs to the
 *          bottom of the frame instead of stopping at the current row
 *
 * NOTE(review): `start` and `height` are shared between the hpel section and the
 * integral section, and `start` is reassigned inside the plane loop when
 * b_interlaced is set. The statement order below is therefore significant. */
void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
{
const int b_interlaced = PARAM_INTERLACED;
int start = mb_y*16 - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
int height = (b_end ? frame->i_lines[0] + 16*PARAM_INTERLACED : (mb_y+b_interlaced)*16) + 8;
/* With b_interlaced set, calls on odd macroblock rows do nothing. */
if( mb_y & b_interlaced )
return;
/* Filter plane 0 only, or all three planes when CHROMA444 is set. */
for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
{
int stride = frame->i_stride[p];
const int width = frame->i_width[p];
int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd
/* Frame-based filtering: skipped only for interlaced input without adaptive MBAFF. */
if( !b_interlaced || h->mb.b_adaptive_mbaff )
h->mc.hpel_filter(
frame->filtered[p][1] + offs,
frame->filtered[p][2] + offs,
frame->filtered[p][3] + offs,
frame->plane[p] + offs,
stride, width + 16, height - start,
h->scratch_buffer );
if( b_interlaced )
{
/* MC must happen between pixels in the same field. */
stride = frame->i_stride[p] << 1;
/* This overwrites the function-level `start`; later iterations and the
 * integral section below see the field-based value. */
start = (mb_y*16 >> 1) - 8;
int height_fld = ((b_end ? frame->i_lines[p] : mb_y*16) >> 1) + 8;
offs = start*stride - 8;
/* Two passes, one per field: the second starts one frame row further down. */
for( int i = 0; i < 2; i++, offs += frame->i_stride[p] )
{
h->mc.hpel_filter(
frame->filtered_fld[p][1] + offs,
frame->filtered_fld[p][2] + offs,
frame->filtered_fld[p][3] + offs,
frame->plane_fld[p] + offs,
stride, width + 16, height_fld - start,
h->scratch_buffer );
}
}
}
/* generate integral image:
* frame->integral contains 2 planes. in the upper plane, each element is
* the sum of an 8x8 pixel region with top-left corner on that point.
* in the lower plane, 4x4 sums (needed only with --partitions p4x4). */
if( frame->integral )
{
int stride = frame->i_stride[0];
/* First band: zero one row of sums at -PADV and start from the top padding. */
if( start < 0 )
{
memset( frame->integral - PADV * stride - PADH_ALIGN, 0, stride * sizeof(uint16_t) );
start = -PADV;
}
/* Last band: extend the range further down by PADV-9 rows. */
if( b_end )
height += PADV-9;
for( int y = start; y < height; y++ )
{
pixel *pix = frame->plane[0] + y * stride - PADH_ALIGN;
uint16_t *sum8 = frame->integral + (y+1) * stride - PADH_ALIGN;
uint16_t *sum4;
if( h->frames.b_have_sub8x8_esa )
{
/* Horizontal pass for this row, then the vertical pass on the row 8 above;
 * sum4 addresses the second plane, (i_lines[0] + 2*PADV) rows further on. */
h->mc.integral_init4h( sum8, pix, stride );
sum8 -= 8*stride;
sum4 = sum8 + stride * (frame->i_lines[0] + PADV*2);
if( y >= 8-PADV )
h->mc.integral_init4v( sum8, sum4, stride );
}
else
{
/* 8x8 sums only: horizontal pass here, vertical pass on the row 8 above. */
h->mc.integral_init8h( sum8, pix, stride );
if( y >= 8-PADV )
h->mc.integral_init8v( sum8-8*stride, stride );
}
}
}
}

345
common/mc.h Normal file
View File

@@ -0,0 +1,345 @@
/*****************************************************************************
* mc.h: motion compensation
*****************************************************************************
* Copyright (C) 2004-2025 x264 project
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_MC_H
#define X264_MC_H
/* Saturating accumulate: s = min( s + x, 32767 ).
 * Expands to an assignment expression; s is evaluated more than once, so it
 * must not have side effects. */
#define MC_CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
/* Applies MC_CLIP_ADD to elements [0] and [1] of s, using elements [0] and [1] of x. */
#define MC_CLIP_ADD2(s,x)\
do\
{\
MC_CLIP_ADD((s)[0], (x)[0]);\
MC_CLIP_ADD((s)[1], (x)[1]);\
} while( 0 )
#define x264_mbtree_propagate_list_internal_neon x264_template(mbtree_propagate_list_internal_neon)
/* PROPAGATE_LIST(cpu): declares x264_mbtree_propagate_list_internal_<cpu>() and
 * defines a static mbtree_propagate_list_<cpu>() wrapper with the
 * mbtree_propagate_list signature.
 *
 * The wrapper lets the internal routine fill h->scratch_buffer2 and then applies
 * the results to ref_costs with bounds checking. As consumed here, the buffer is
 * organised in groups of 8 macroblocks, 48 int16_t per full group:
 *   [0..15]  mbx,mby pairs (2 values per macroblock)
 *   [16..31] weights added at idx0 and idx0+1 (2 values per macroblock)
 *   [32..47] weights added at idx2 and idx2+1 (2 values per macroblock)
 * `current` advances by 2 per macroblock and by a further 32 after each group.
 * Entries whose list bit is not set in lowres_costs are skipped. */
#define PROPAGATE_LIST(cpu)\
void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2], int16_t *propagate_amount,\
uint16_t *lowres_costs, int16_t *output,\
int bipred_weight, int mb_y, int len );\
\
static void mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],\
int16_t *propagate_amount, uint16_t *lowres_costs,\
int bipred_weight, int mb_y, int len, int list )\
{\
int16_t *current = h->scratch_buffer2;\
\
x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount, lowres_costs,\
current, bipred_weight, mb_y, len );\
\
unsigned stride = h->mb.i_mb_stride;\
unsigned width = h->mb.i_mb_width;\
unsigned height = h->mb.i_mb_height;\
\
for( int i = 0; i < len; current += 32 )\
{\
int end = X264_MIN( i+8, len );\
for( ; i < end; i++, current += 2 )\
{\
if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )\
continue;\
\
unsigned mbx = (unsigned)current[0];\
unsigned mby = (unsigned)current[1];\
unsigned idx0 = mbx + mby * stride;\
unsigned idx2 = idx0 + stride;\
\
/* Shortcut for the simple/common case of zero MV */\
if( !M32( mvs[i] ) )\
{\
MC_CLIP_ADD( ref_costs[idx0], current[16] );\
continue;\
}\
\
if( mbx < width-1 && mby < height-1 )\
{\
MC_CLIP_ADD2( ref_costs+idx0, current+16 );\
MC_CLIP_ADD2( ref_costs+idx2, current+32 );\
}\
else\
{\
/* Note: this takes advantage of unsigned representation to\
* catch negative mbx/mby. */\
if( mby < height )\
{\
if( mbx < width )\
MC_CLIP_ADD( ref_costs[idx0+0], current[16] );\
if( mbx+1 < width )\
MC_CLIP_ADD( ref_costs[idx0+1], current[17] );\
}\
if( mby+1 < height )\
{\
if( mbx < width )\
MC_CLIP_ADD( ref_costs[idx2+0], current[32] );\
if( mbx+1 < width )\
MC_CLIP_ADD( ref_costs[idx2+1], current[33] );\
}\
}\
}\
}\
}
#define x264_plane_copy_c x264_template(plane_copy_c)
void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
/* PLANE_COPY(align, cpu): defines static plane_copy_<cpu>(), a wrapper around
 * x264_plane_copy_core_<cpu>().
 *   - w < 256: falls back to x264_plane_copy_c().
 *   - w a multiple of align/SIZEOF_PIXEL: the core routine copies every row.
 *   - otherwise: the core routine copies all rows except the last one in memory
 *     order with the width rounded up to the alignment, and that last row is
 *     copied with memcpy() at the exact width so src is never overread.
 * For a non-positive i_src the first row passed in is the last one in memory,
 * which is why the core call then starts one row further on. */
#define PLANE_COPY(align, cpu)\
static void plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
{\
int c_w = (align) / SIZEOF_PIXEL - 1;\
if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\
x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\
else if( !(w&c_w) )\
x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\
else\
{\
if( --h > 0 )\
{\
if( i_src > 0 )\
{\
x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
dst += i_dst * h;\
src += i_src * h;\
}\
else\
x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
}\
/* use plain memcpy on the last line (in memory order) to avoid overreading src. */\
memcpy( dst, src, w*SIZEOF_PIXEL );\
}\
}
#define x264_plane_copy_swap_c x264_template(plane_copy_swap_c)
void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
/* PLANE_COPY_SWAP(align, cpu): defines static plane_copy_swap_<cpu>(), a wrapper
 * around x264_plane_copy_swap_core_<cpu>() that swaps adjacent elements of each
 * pair while copying (w counts pairs, as the scalar tail loop below shows).
 *   - w a multiple of (align/2)/SIZEOF_PIXEL: the core routine handles every row.
 *   - w larger than that granularity: the core routine handles all rows except
 *     the last one in memory order with the width rounded up; on the last row it
 *     handles only the aligned part and a scalar loop swaps the remaining pairs,
 *     so src is never overread.
 *   - otherwise: falls back to x264_plane_copy_swap_c().
 * The align argument is parenthesized so that an expression argument cannot
 * change the meaning of the shift. */
#define PLANE_COPY_SWAP(align, cpu)\
static void plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
{\
    int c_w = ((align)>>1) / SIZEOF_PIXEL - 1;\
    if( !(w&c_w) )\
        x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\
    else if( w > c_w )\
    {\
        if( --h > 0 )\
        {\
            if( i_src > 0 )\
            {\
                x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
                dst += i_dst * h;\
                src += i_src * h;\
            }\
            else\
                x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
        }\
        x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\
        for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\
        {\
            dst[x] = src[x+1];\
            dst[x+1] = src[x];\
        }\
    }\
    else\
        x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\
}
#define x264_plane_copy_deinterleave_c x264_template(plane_copy_deinterleave_c)
void x264_plane_copy_deinterleave_c( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
                                     pixel *src, intptr_t i_src, int w, int h );
/* We can utilize existing plane_copy_deinterleave() functions for YUYV/UYVY
 * input with the additional constraint that we cannot overread src.
 *
 * PLANE_COPY_YUYV(align, cpu): defines static plane_copy_deinterleave_yuyv_<cpu>().
 *   - w a multiple of (align/2)/SIZEOF_PIXEL: x264_plane_copy_deinterleave_<cpu>()
 *     handles every row.
 *   - w larger than that granularity: the <cpu> routine handles all rows except
 *     the last one in memory order, and x264_plane_copy_deinterleave_c() handles
 *     that last row.
 *   - otherwise: x264_plane_copy_deinterleave_c() handles the whole plane.
 * The align argument is parenthesized so that an expression argument cannot
 * change the meaning of the shift. */
#define PLANE_COPY_YUYV(align, cpu)\
static void plane_copy_deinterleave_yuyv_##cpu( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,\
                                                pixel *src, intptr_t i_src, int w, int h )\
{\
    int c_w = ((align)>>1) / SIZEOF_PIXEL - 1;\
    if( !(w&c_w) )\
        x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
    else if( w > c_w )\
    {\
        if( --h > 0 )\
        {\
            if( i_src > 0 )\
            {\
                x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
                dsta += i_dsta * h;\
                dstb += i_dstb * h;\
                src += i_src * h;\
            }\
            else\
                x264_plane_copy_deinterleave_##cpu( dsta+i_dsta, i_dsta, dstb+i_dstb, i_dstb,\
                                                    src+i_src, i_src, w, h );\
        }\
        x264_plane_copy_deinterleave_c( dsta, 0, dstb, 0, src, 0, w, 1 );\
    }\
    else\
        x264_plane_copy_deinterleave_c( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
}
#define x264_plane_copy_interleave_c x264_template(plane_copy_interleave_c)
void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
/* PLANE_INTERLEAVE(cpu): defines static plane_copy_interleave_<cpu>(), a wrapper
 * around x264_plane_copy_interleave_core_<cpu>() with a fixed granularity of
 * 16/SIZEOF_PIXEL.
 *   - w a multiple of the granularity: the core routine handles every row.
 *   - w larger than the granularity and both source strides of the same sign:
 *     the core routine handles all rows except the last one in memory order
 *     with the width rounded up, and x264_plane_copy_interleave_c() handles
 *     that last row at the exact width.
 *   - otherwise: x264_plane_copy_interleave_c() handles the whole plane. */
#define PLANE_INTERLEAVE(cpu) \
static void plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\
pixel *srcu, intptr_t i_srcu,\
pixel *srcv, intptr_t i_srcv, int w, int h )\
{\
int c_w = 16 / SIZEOF_PIXEL - 1;\
if( !(w&c_w) )\
x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\
{\
if( --h > 0 )\
{\
if( i_srcu > 0 )\
{\
x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\
dst += i_dst * h;\
srcu += i_srcu * h;\
srcv += i_srcv * h;\
}\
else\
x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\
}\
x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\
}\
else\
x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
}
/* Forward declaration so weight_fn_t can refer to the struct before its definition. */
struct x264_weight_t;
/* Signature of a weighted-prediction routine. The parameter names are not given
 * here; presumably (dst, dst_stride, src, src_stride, weight, height) -- confirm
 * against the implementations. */
typedef void (* weight_fn_t)( pixel *, intptr_t, pixel *,intptr_t, const struct x264_weight_t *, int );
/* Weighted-prediction parameters for one reference. */
typedef struct x264_weight_t
{
/* aligning the first member is a gcc hack to force the struct to be
* 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */
ALIGNED_16( int16_t cachea[8] );
/* cachea/cacheb: scratch for the weight implementations; presumably filled by
 * the weight_cache callback -- confirm. */
int16_t cacheb[8];
/* Denominator, scale and offset as stored by SET_WEIGHT. */
int32_t i_denom;
int32_t i_scale;
int32_t i_offset;
/* Pointer to weight routines; SET_WEIGHT stores NULL here when weighting is off. */
weight_fn_t *weightfn;
} ALIGNED_16( x264_weight_t );
/* A "no weighting" instance: x264_zero reinterpreted as a weight struct. */
#define x264_weight_none ((const x264_weight_t*)x264_zero)
/* SET_WEIGHT( w, b, s, d, o ): store scale s, denominator d and offset o in the
 * x264_weight_t lvalue w. When b is non-zero, h->mc.weight_cache() is called on
 * w; otherwise w's weightfn is set to NULL.
 * Requires a variable named h (x264_t *) in the calling scope. The macro expands
 * to a brace block, not a single statement.
 * Every use of w is parenthesized so an expression argument such as *p binds
 * as intended. */
#define SET_WEIGHT( w, b, s, d, o )\
{\
    (w).i_scale = (s);\
    (w).i_denom = (d);\
    (w).i_offset = (o);\
    if( b )\
        h->mc.weight_cache( h, &(w) );\
    else\
        (w).weightfn = NULL;\
}
/* Table of motion-compensation and related frame-level function pointers,
 * filled in by x264_mc_init(). */
/* Do the MC
* XXX: Only width = 4, 8 or 16 are valid
* width == 4 -> height == 4 or 8
* width == 8 -> height == 4 or 8 or 16
* width == 16-> height == 8 or 16
* */
typedef struct
{
void (*mc_luma)( pixel *dst, intptr_t i_dst, pixel **src, intptr_t i_src,
int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight );
/* may round up the dimensions if they're not a power of 2 */
pixel* (*get_ref)( pixel *dst, intptr_t *i_dst, pixel **src, intptr_t i_src,
int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight );
/* mc_chroma may write up to 2 bytes of garbage to the right of dst,
* so it must be run from left to right. */
void (*mc_chroma)( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,
int mvx, int mvy, int i_width, int i_height );
/* averaging of two sources, one entry per PIXEL_* partition size */
void (*avg[12])( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
pixel *src2, intptr_t src2_stride, int i_weight );
/* only 16x16, 8x8, and 4x4 defined */
void (*copy[7])( pixel *dst, intptr_t dst_stride, pixel *src, intptr_t src_stride, int i_height );
void (*copy_16x16_unaligned)( pixel *dst, intptr_t dst_stride, pixel *src, intptr_t src_stride, int i_height );
/* chroma interleave / deinterleave between separate and interleaved layouts */
void (*store_interleave_chroma)( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void (*load_deinterleave_chroma_fenc)( pixel *dst, pixel *src, intptr_t i_src, int height );
void (*load_deinterleave_chroma_fdec)( pixel *dst, pixel *src, intptr_t i_src, int height );
/* whole-plane copies and format conversions */
void (*plane_copy)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h );
void (*plane_copy_swap)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h );
void (*plane_copy_interleave)( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
/* may write up to 15 pixels off the end of each plane */
void (*plane_copy_deinterleave)( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
void (*plane_copy_deinterleave_yuyv)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
pixel *src, intptr_t i_src, int w, int h );
void (*plane_copy_deinterleave_rgb)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h );
void (*plane_copy_deinterleave_v210)( pixel *dsty, intptr_t i_dsty,
pixel *dstc, intptr_t i_dstc,
uint32_t *src, intptr_t i_src, int w, int h );
/* half-pel interpolation: writes the dsth, dstv and dstc planes from src,
 * using buf as scratch */
void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
intptr_t i_stride, int i_width, int i_height, int16_t *buf );
/* prefetch the next few macroblocks of fenc or fdec */
/* NOTE(review): prefetch_fenc has the same signature as the _400/_420/_422
 * variants; presumably it is pointed at one of them elsewhere -- confirm. */
void (*prefetch_fenc) ( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
void (*prefetch_fenc_400)( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
void (*prefetch_fenc_420)( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
void (*prefetch_fenc_422)( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
/* prefetch the next few macroblocks of a hpel reference frame */
void (*prefetch_ref)( pixel *pix, intptr_t stride, int parity );
void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
void (*memzero_aligned)( void *dst, size_t n );
/* successive elimination prefilter */
void (*integral_init4h)( uint16_t *sum, pixel *pix, intptr_t stride );
void (*integral_init8h)( uint16_t *sum, pixel *pix, intptr_t stride );
void (*integral_init4v)( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
void (*integral_init8v)( uint16_t *sum8, intptr_t stride );
/* builds the four lowres planes (dst0, dsth, dstv, dstc) from src0 */
void (*frame_init_lowres_core)( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
intptr_t src_stride, intptr_t dst_stride, int width, int height );
/* weighted-prediction function tables and the per-weight cache setup hook */
weight_fn_t *weight;
weight_fn_t *offsetadd;
weight_fn_t *offsetsub;
void (*weight_cache)( x264_t *, x264_weight_t * );
/* macroblock-tree cost propagation and Q8.8 (de)serialization helpers */
void (*mbtree_propagate_cost)( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void (*mbtree_propagate_list)( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
int16_t *propagate_amount, uint16_t *lowres_costs,
int bipred_weight, int mb_y, int len, int list );
void (*mbtree_fix8_pack)( uint16_t *dst, float *src, int count );
void (*mbtree_fix8_unpack)( float *dst, uint16_t *src, int count );
} x264_mc_functions_t;
#define x264_mc_init x264_template(mc_init)
/* Fill *pf with motion-compensation routines for the CPU flags in cpu.
 * A non-zero cpu_independent keeps the C mbtree propagation routines. */
void x264_mc_init( uint32_t cpu, x264_mc_functions_t *pf, int cpu_independent );
#endif

526
common/mips/dct-c.c Normal file
View File

@@ -0,0 +1,526 @@
/*****************************************************************************
* dct-c.c: msa transform and zigzag
*****************************************************************************
* Copyright (C) 2015-2025 x264 project
*
* Authors: Rishikesh More <rishikesh.more@imgtec.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "macros.h"
#include "dct.h"
#if !HIGH_BIT_DEPTH
/* One 1-D pass of the 4-point inverse transform on v8i16 vectors.
 * Forms in0+in2, in0-in2, (in1>>1)-in3 and in1+(in3>>1), then combines them
 * with BUTTERFLY_4 (defined elsewhere) into out0..out3. */
#define AVC_ITRANS_H( in0, in1, in2, in3, out0, out1, out2, out3 ) \
{ \
v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
\
tmp0_m = in0 + in2; \
tmp1_m = in0 - in2; \
tmp2_m = in1 >> 1; \
tmp2_m = tmp2_m - in3; \
tmp3_m = in3 >> 1; \
tmp3_m = in1 + tmp3_m; \
\
BUTTERFLY_4( tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3 ); \
}
/* 4x4 DC transform using MSA vectors: reads four rows from p_src
 * (i_src_stride apart) and writes the result to p_dst.
 * The LD_/UNPCK_/BUTTERFLY_/TRANSPOSE/SRARI/PCK/ST_ helpers are macros defined
 * elsewhere; the step descriptions below are inferred from their names and
 * should be confirmed against macros.h. */
static void avc_dct4x4dc_msa( int16_t *p_src, int16_t *p_dst,
int32_t i_src_stride )
{
v8i16 src0, src1, src2, src3, ver_res0, ver_res1, ver_res2, ver_res3;
v4i32 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
v4i32 hor_res0, hor_res1, hor_res2, hor_res3;
v4i32 ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r;
/* Load the rows and widen them into 32-bit vectors. */
LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
UNPCK_R_SH_SW( src0, src0_r );
UNPCK_R_SH_SW( src1, src1_r );
UNPCK_R_SH_SW( src2, src2_r );
UNPCK_R_SH_SW( src3, src3_r );
/* First pass: two butterfly stages. */
BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r,
tmp0, tmp3, tmp2, tmp1 );
BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
hor_res0, hor_res3, hor_res2, hor_res1 );
TRANSPOSE4x4_SW_SW( hor_res0, hor_res1, hor_res2, hor_res3,
hor_res0, hor_res1, hor_res2, hor_res3 );
/* Second pass on the transposed data. */
BUTTERFLY_4( hor_res0, hor_res2, hor_res3, hor_res1,
tmp0, tmp3, tmp2, tmp1 );
BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
ver_res0_r, ver_res3_r, ver_res2_r, ver_res1_r );
/* Shift right by 1 (presumably with rounding), pack back to 16 bits and store. */
SRARI_W4_SW( ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r, 1 );
PCKEV_H4_SH( ver_res0_r, ver_res0_r, ver_res1_r, ver_res1_r,
ver_res2_r, ver_res2_r, ver_res3_r, ver_res3_r,
ver_res0, ver_res1, ver_res2, ver_res3 );
PCKOD_D2_SH( ver_res1, ver_res0, ver_res3, ver_res2, ver_res0, ver_res2 );
ST_SH2( ver_res0, ver_res2, p_dst, 8 );
}
/* Forward 4x4 transform of the difference between a source block (p_src, rows
 * i_src_stride apart) and a reference block (p_ref, rows i_dst_stride apart);
 * the 16 coefficients are written to p_dst.
 * Helper macros (LW4, INSERT_W4_SB, ILVRL_B2_UB, HSUB_UB2_SH, ...) are defined
 * elsewhere; their described effect is inferred from their names. */
static void avc_sub4x4_dct_msa( uint8_t *p_src, int32_t i_src_stride,
uint8_t *p_ref, int32_t i_dst_stride,
int16_t *p_dst )
{
uint32_t i_src0, i_src1, i_src2, i_src3;
uint32_t i_ref0, i_ref1, i_ref2, i_ref3;
v16i8 src = { 0 };
v16i8 ref = { 0 };
v16u8 inp0, inp1;
v8i16 diff0, diff1, diff2, diff3;
v8i16 temp0, temp1, temp2, temp3;
/* Load four 32-bit rows from each block and gather them into one vector each. */
LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 );
LW4( p_ref, i_dst_stride, i_ref0, i_ref1, i_ref2, i_ref3 );
INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src );
INSERT_W4_SB( i_ref0, i_ref1, i_ref2, i_ref3, ref );
/* Interleave source and reference bytes, then subtract pairwise into 16-bit lanes. */
ILVRL_B2_UB( src, ref, inp0, inp1 );
HSUB_UB2_SH( inp0, inp1, diff0, diff2 );
/* diff1/diff3 receive the upper 64 bits of diff0/diff2. */
diff1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff0, ( v2i64 ) diff0 );
diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff2, ( v2i64 ) diff2 );
/* First 1-D pass. */
BUTTERFLY_4( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 );
diff0 = temp0 + temp1;
diff1 = ( temp3 << 1 ) + temp2;
diff2 = temp0 - temp1;
diff3 = temp3 - ( temp2 << 1 );
/* Transpose and run the same pass in the other direction. */
TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3,
temp0, temp1, temp2, temp3 );
BUTTERFLY_4( temp0, temp1, temp2, temp3, diff0, diff1, diff2, diff3 );
temp0 = diff0 + diff1;
temp1 = ( diff3 << 1 ) + diff2;
temp2 = diff0 - diff1;
temp3 = diff3 - ( diff2 << 1 );
/* Combine the four result rows into two vectors and store them. */
ILVR_D2_UB( temp1, temp0, temp3, temp2, inp0, inp1 );
ST_UB2( inp0, inp1, p_dst, 8 );
}
/* Reorder the 16 coefficients of pi_dct into pi_level using two 8-entry index
 * tables (mask0 for the first eight outputs, mask1 for the last eight).
 * VSHF_H2_SH is defined elsewhere; as used here it writes its results back into
 * mask0/mask1, which are then stored. */
static void avc_zigzag_scan_4x4_frame_msa( int16_t pi_dct[16],
int16_t pi_level[16] )
{
v8i16 src0, src1;
/* Source indices in scan order. */
v8i16 mask0 = { 0, 4, 1, 2, 5, 8, 12, 9 };
v8i16 mask1 = { 6, 3, 7, 10, 13, 14, 11, 15 };
LD_SH2( pi_dct, 8, src0, src1 );
VSHF_H2_SH( src0, src1, src0, src1, mask0, mask1, mask0, mask1 );
ST_SH2( mask0, mask1, pi_level, 8 );
}
/* Inverse 4x4 transform of the coefficients in p_src, added to the 4x4 pixel
 * block at p_dst (rows i_dst_stride apart). The 16 coefficients in p_src are
 * zeroed afterwards.
 * LD4x4_SH, TRANSPOSE4x4_SH_SH, SRARI_H4_SH and ADDBLK_ST4x4_UB are defined
 * elsewhere; their described effect is inferred from their names. */
static void avc_idct4x4_addblk_msa( uint8_t *p_dst, int16_t *p_src,
int32_t i_dst_stride )
{
v8i16 src0, src1, src2, src3;
v8i16 hres0, hres1, hres2, hres3;
v8i16 vres0, vres1, vres2, vres3;
v8i16 zeros = { 0 };
LD4x4_SH( p_src, src0, src1, src2, src3 );
/* Two 1-D passes with a transpose in between. */
AVC_ITRANS_H( src0, src1, src2, src3, hres0, hres1, hres2, hres3 );
TRANSPOSE4x4_SH_SH( hres0, hres1, hres2, hres3,
hres0, hres1, hres2, hres3 );
AVC_ITRANS_H( hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3 );
/* Scale down by 6 bits (presumably with rounding), add to the destination and store. */
SRARI_H4_SH( vres0, vres1, vres2, vres3, 6 );
ADDBLK_ST4x4_UB( vres0, vres1, vres2, vres3, p_dst, i_dst_stride );
/* Clear the coefficient block. */
ST_SH2( zeros, zeros, p_src, 8 );
}
/* DC-only inverse transform: adds the rounded DC value (p_src[0] + 32) >> 6 to
 * every pixel of the 4x4 block at p_dst (rows i_dst_stride apart) and clears
 * p_src[0].
 * CLIP_SH2_0_255 and the load/store macros are defined elsewhere; the clamp to
 * 0..255 is inferred from the macro name. */
static void avc_idct4x4_addblk_dc_msa( uint8_t *p_dst, int16_t *p_src,
int32_t i_dst_stride )
{
int16_t i_dc;
uint32_t i_src0, i_src1, i_src2, i_src3;
v16u8 pred = { 0 };
v16i8 out;
v8i16 input_dc, pred_r, pred_l;
/* Rounded DC value, replicated across a vector. */
i_dc = ( p_src[0] + 32 ) >> 6;
input_dc = __msa_fill_h( i_dc );
p_src[ 0 ] = 0;
/* Load the existing 4x4 pixels and widen them to 16 bits. */
LW4( p_dst, i_dst_stride, i_src0, i_src1, i_src2, i_src3 );
INSERT_W4_UB( i_src0, i_src1, i_src2, i_src3, pred );
UNPCK_UB_SH( pred, pred_r, pred_l );
pred_r += input_dc;
pred_l += input_dc;
CLIP_SH2_0_255( pred_r, pred_l );
/* Pack back to bytes and write the four rows. */
out = __msa_pckev_b( ( v16i8 ) pred_l, ( v16i8 ) pred_r );
ST4x4_UB( out, out, 0, 1, 2, 3, p_dst, i_dst_stride );
}
/* Inverse 8x8 transform of the 64 coefficients in p_src, added to the 8x8 pixel
 * block at p_dst (rows i_dst_stride apart).
 * The first 1-D pass runs on 16-bit lanes; after a transpose the second pass is
 * done on 32-bit lanes (the *_r / *_l halves), followed by a shift right by 6,
 * packing, addition to the destination and a clip.
 * Side effect: p_src[0] is modified (32 is added before the transform).
 * The LD_/UNPCK_/BUTTERFLY_/TRANSPOSE/SRA/PCKEV/ILVR/CLIP/ST macros are defined
 * elsewhere; their described effect is inferred from their names. */
static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src,
int32_t i_dst_stride )
{
v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
v8i16 vec0, vec1, vec2, vec3;
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
v16i8 zeros = { 0 };
/* Bias the DC coefficient; presumably this provides the rounding for the
 * final >> 6 -- confirm. */
p_src[ 0 ] += 32;
LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );
/* First pass, even-indexed inputs (0, 2, 4, 6). */
vec0 = src0 + src4;
vec1 = src0 - src4;
vec2 = src2 >> 1;
vec2 = vec2 - src6;
vec3 = src6 >> 1;
vec3 = src2 + vec3;
BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 );
/* First pass, odd-indexed inputs (1, 3, 5, 7). */
vec0 = src7 >> 1;
vec0 = src5 - vec0 - src3 - src7;
vec1 = src3 >> 1;
vec1 = src1 - vec1 + src7 - src3;
vec2 = src5 >> 1;
vec2 = vec2 - src1 + src7 + src5;
vec3 = src1 >> 1;
vec3 = vec3 + src3 + src5 + src1;
tmp4 = vec3 >> 2;
tmp4 += vec0;
tmp5 = vec2 >> 2;
tmp5 += vec1;
tmp6 = vec1 >> 2;
tmp6 -= vec2;
tmp7 = vec0 >> 2;
tmp7 = vec3 - tmp7;
/* Combine even and odd parts, then transpose for the second pass. */
BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
res0, res1, res2, res3, res4, res5, res6, res7 );
TRANSPOSE8x8_SH_SH( res0, res1, res2, res3, res4, res5, res6, res7,
res0, res1, res2, res3, res4, res5, res6, res7 );
/* Widen every row into two 32-bit vectors (_r and _l halves). */
UNPCK_SH_SW( res0, tmp0_r, tmp0_l );
UNPCK_SH_SW( res1, tmp1_r, tmp1_l );
UNPCK_SH_SW( res2, tmp2_r, tmp2_l );
UNPCK_SH_SW( res3, tmp3_r, tmp3_l );
UNPCK_SH_SW( res4, tmp4_r, tmp4_l );
UNPCK_SH_SW( res5, tmp5_r, tmp5_l );
UNPCK_SH_SW( res6, tmp6_r, tmp6_l );
UNPCK_SH_SW( res7, tmp7_r, tmp7_l );
/* Second pass, even-indexed rows (0, 2, 4, 6), both halves. */
BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r,
vec0_r, vec0_l, vec1_l, vec1_r );
vec2_r = tmp2_r >> 1;
vec2_l = tmp2_l >> 1;
vec2_r -= tmp6_r;
vec2_l -= tmp6_l;
vec3_r = tmp6_r >> 1;
vec3_l = tmp6_l >> 1;
vec3_r += tmp2_r;
vec3_l += tmp2_l;
BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r,
tmp0_r, tmp2_r, tmp4_r, tmp6_r );
BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l,
tmp0_l, tmp2_l, tmp4_l, tmp6_l );
/* Second pass, odd-indexed rows (1, 3, 5, 7), both halves. */
vec0_r = tmp7_r >> 1;
vec0_l = tmp7_l >> 1;
vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
vec1_r = tmp3_r >> 1;
vec1_l = tmp3_l >> 1;
vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
vec2_r = tmp5_r >> 1;
vec2_l = tmp5_l >> 1;
vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
vec3_r = tmp1_r >> 1;
vec3_l = tmp1_l >> 1;
vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
tmp1_r = vec3_r >> 2;
tmp1_l = vec3_l >> 2;
tmp1_r += vec0_r;
tmp1_l += vec0_l;
tmp3_r = vec2_r >> 2;
tmp3_l = vec2_l >> 2;
tmp3_r += vec1_r;
tmp3_l += vec1_l;
tmp5_r = vec1_r >> 2;
tmp5_l = vec1_l >> 2;
tmp5_r -= vec2_r;
tmp5_l -= vec2_l;
tmp7_r = vec0_r >> 2;
tmp7_l = vec0_l >> 2;
tmp7_r = vec3_r - tmp7_r;
tmp7_l = vec3_l - tmp7_l;
/* Combine even and odd parts into the eight output rows. */
BUTTERFLY_4( tmp0_r, tmp0_l, tmp7_l, tmp7_r,
res0_r, res0_l, res7_l, res7_r );
BUTTERFLY_4( tmp2_r, tmp2_l, tmp5_l, tmp5_r,
res1_r, res1_l, res6_l, res6_r );
BUTTERFLY_4( tmp4_r, tmp4_l, tmp3_l, tmp3_r,
res2_r, res2_l, res5_l, res5_r );
BUTTERFLY_4( tmp6_r, tmp6_l, tmp1_l, tmp1_r,
res3_r, res3_l, res4_l, res4_r );
/* Scale down by 6 bits and pack back to 16-bit rows. */
SRA_4V( res0_r, res0_l, res1_r, res1_l, 6 );
SRA_4V( res2_r, res2_l, res3_r, res3_l, 6 );
SRA_4V( res4_r, res4_l, res5_r, res5_l, 6 );
SRA_4V( res6_r, res6_l, res7_r, res7_l, 6 );
PCKEV_H4_SH( res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r,
res0, res1, res2, res3 );
PCKEV_H4_SH( res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r,
res4, res5, res6, res7 );
/* Load the destination rows, widen them with zeros and add the residual. */
LD_SB8( p_dst, i_dst_stride,
dst0, dst1, dst2, dst3,
dst4, dst5, dst6, dst7 );
ILVR_B4_SH( zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
tmp0, tmp1, tmp2, tmp3 );
ILVR_B4_SH( zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
tmp4, tmp5, tmp6, tmp7 );
ADD4( res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
res0, res1, res2, res3 );
ADD4( res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
res4, res5, res6, res7 );
/* Clip, pack to bytes and store the block four rows at a time. */
CLIP_SH4_0_255( res0, res1, res2, res3 );
CLIP_SH4_0_255( res4, res5, res6, res7 );
PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6,
dst0, dst1, dst2, dst3 );
ST8x4_UB( dst0, dst1, p_dst, i_dst_stride );
p_dst += ( 4 * i_dst_stride );
ST8x4_UB( dst2, dst3, p_dst, i_dst_stride );
}
/* Inverse 4x4 DC transform: reads four rows from p_src (i_src_stride apart),
 * runs two butterfly passes with a transpose in between, and stores the result
 * to p_dst. Unlike avc_dct4x4dc_msa there is no shift after the second pass.
 * The store passes i_dst_stride * 2 to ST8x4_UB, presumably because that macro
 * takes a byte stride while i_dst_stride counts int16_t elements -- confirm.
 * The helper macros are defined elsewhere. */
static void avc_idct4x4dc_msa( int16_t *p_src, int32_t i_src_stride,
int16_t *p_dst, int32_t i_dst_stride )
{
v8i16 src0, src1, src2, src3;
v4i32 src0_r, src1_r, src2_r, src3_r;
v4i32 hres0, hres1, hres2, hres3;
v8i16 vres0, vres1, vres2, vres3;
v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v2i64 res0, res1;
/* Load the rows and widen them into 32-bit vectors. */
LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
UNPCK_R_SH_SW( src0, src0_r );
UNPCK_R_SH_SW( src1, src1_r );
UNPCK_R_SH_SW( src2, src2_r );
UNPCK_R_SH_SW( src3, src3_r );
/* First pass. */
BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, vec0, vec3, vec2, vec1 );
BUTTERFLY_4( vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1 );
TRANSPOSE4x4_SW_SW( hres0, hres1, hres2, hres3,
hres0, hres1, hres2, hres3 );
/* Second pass on the transposed data. */
BUTTERFLY_4( hres0, hres2, hres3, hres1, vec0, vec3, vec2, vec1 );
BUTTERFLY_4( vec0, vec1, vec2, vec3, vec4, vec7, vec6, vec5 );
/* Pack back to 16 bits and store. */
PCKEV_H4_SH( vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
vres0, vres1, vres2, vres3 );
PCKOD_D2_SD( vres1, vres0, vres3, vres2, res0, res1 );
ST8x4_UB( res0, res1, p_dst, i_dst_stride * 2 );
}
/* Returns the sum of (source - prediction) over a 4x4 block, where p_src rows
 * are i_src_stride apart and pred_ptr rows are i_pred_stride apart.
 * NOTE(review): the horizontal add is stored through an int16_t before being
 * returned; this narrowing looks intentional (it would turn an unsigned lane
 * sum back into the signed total, which always fits in 16 bits for a 4x4 block
 * of 8-bit pixels) -- confirm against HADD_UH_U32 before changing the type.
 * The helper macros are defined elsewhere. */
static int32_t subtract_sum4x4_msa( uint8_t *p_src, int32_t i_src_stride,
uint8_t *pred_ptr, int32_t i_pred_stride )
{
int16_t i_sum;
uint32_t i_src0, i_src1, i_src2, i_src3;
uint32_t i_pred0, i_pred1, i_pred2, i_pred3;
v16i8 src = { 0 };
v16i8 pred = { 0 };
v16u8 src_l0, src_l1;
v8i16 diff0, diff1;
/* Gather the four 32-bit rows of each block into one vector. */
LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 );
LW4( pred_ptr, i_pred_stride, i_pred0, i_pred1, i_pred2, i_pred3 );
INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src );
INSERT_W4_SB( i_pred0, i_pred1, i_pred2, i_pred3, pred );
/* Interleave source and prediction bytes and subtract pairwise. */
ILVRL_B2_UB( src, pred, src_l0, src_l1 );
HSUB_UB2_SH( src_l0, src_l1, diff0, diff1 );
/* Reduce all 16 differences to a single value. */
i_sum = HADD_UH_U32( diff0 + diff1 );
return i_sum;
}
/* Public MSA entry point for the 4x4 DC transform.
 * Transforms the 16 coefficients of d in place by handing d to
 * avc_dct4x4dc_msa as both of its pointer arguments. */
void x264_dct4x4dc_msa( int16_t d[16] )
{
    /* NOTE(review): the helper's third argument is presumably the row stride
     * in coefficients (4 per row of the 4x4 block) - confirm against
     * avc_dct4x4dc_msa, which is defined outside this chunk. */
    const int32_t i_stride = 4;

    avc_dct4x4dc_msa( d, d, i_stride );
}
/* Public MSA entry point for the inverse 4x4 DC transform.
 * Transforms the 16 coefficients of d in place: d is both source and
 * destination, with a stride of 4 on each side. */
void x264_idct4x4dc_msa( int16_t d[16] )
{
    const int32_t i_stride = 4;

    avc_idct4x4dc_msa( d, i_stride, d, i_stride );
}
/* Public MSA entry point for one 4x4 block: forwards the 16 coefficients in
 * pi_dct and the destination p_dst (row stride FDEC_STRIDE) to
 * avc_idct4x4_addblk_msa. */
void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] )
{
    const int32_t i_dst_stride = FDEC_STRIDE;

    avc_idct4x4_addblk_msa( p_dst, pi_dct, i_dst_stride );
}
/* Public MSA entry point for an 8x8 area made of four 4x4 blocks.
 * Block i_blk uses coefficients pi_dct[i_blk]; the blocks are visited in the
 * order top-left, top-right, bottom-left, bottom-right of p_dst. */
void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] )
{
    for( int32_t i_blk = 0; i_blk < 4; i_blk++ )
    {
        /* Bit 0 of i_blk selects the column (0 or 4 pixels), bit 1 the row. */
        const int32_t i_x = ( i_blk & 1 ) * 4;
        const int32_t i_y = ( i_blk >> 1 ) * 4;
        uint8_t *p_blk = p_dst + i_y * FDEC_STRIDE + i_x;

        avc_idct4x4_addblk_msa( p_blk, pi_dct[i_blk], FDEC_STRIDE );
    }
}
/* Public MSA entry point for a 16x16 area made of four 8x8 quadrants.
 * Quadrant i_quad consumes the four 4x4 coefficient blocks starting at
 * pi_dct[4 * i_quad] and is handled by x264_add8x8_idct_msa. */
void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] )
{
    for( int32_t i_quad = 0; i_quad < 4; i_quad++ )
    {
        /* Bit 0 of i_quad selects the column (0 or 8 pixels), bit 1 the row. */
        const int32_t i_x = ( i_quad & 1 ) * 8;
        const int32_t i_y = ( i_quad >> 1 ) * 8;
        uint8_t *p_quad = p_dst + i_y * FDEC_STRIDE + i_x;

        x264_add8x8_idct_msa( p_quad, &pi_dct[4 * i_quad] );
    }
}
/* Public MSA entry point for one 8x8 block: forwards the 64 coefficients in
 * pi_dct and the destination p_dst (row stride FDEC_STRIDE) to
 * avc_idct8_addblk_msa. */
void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] )
{
    const int32_t i_dst_stride = FDEC_STRIDE;

    avc_idct8_addblk_msa( p_dst, pi_dct, i_dst_stride );
}
/* Public MSA entry point for a 16x16 area made of four 8x8 blocks.
 * Block i_quad uses coefficients pi_dct[i_quad]; the blocks are visited in
 * the order top-left, top-right, bottom-left, bottom-right of p_dst. */
void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] )
{
    for( int32_t i_quad = 0; i_quad < 4; i_quad++ )
    {
        /* Bit 0 of i_quad selects the column (0 or 8 pixels), bit 1 the row. */
        const int32_t i_x = ( i_quad & 1 ) * 8;
        const int32_t i_y = ( i_quad >> 1 ) * 8;
        uint8_t *p_quad = p_dst + i_y * FDEC_STRIDE + i_x;

        avc_idct8_addblk_msa( p_quad, pi_dct[i_quad], FDEC_STRIDE );
    }
}
/* Public MSA entry point for the DC-only case of an 8x8 area.
 * Each of the four 4x4 blocks of p_dst gets its own coefficient from pi_dct
 * (index order: top-left, top-right, bottom-left, bottom-right). */
void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] )
{
    for( int32_t i_blk = 0; i_blk < 4; i_blk++ )
    {
        /* Bit 0 of i_blk selects the column (0 or 4 pixels), bit 1 the row. */
        const int32_t i_x = ( i_blk & 1 ) * 4;
        const int32_t i_y = ( i_blk >> 1 ) * 4;
        uint8_t *p_blk = p_dst + i_y * FDEC_STRIDE + i_x;

        avc_idct4x4_addblk_dc_msa( p_blk, &pi_dct[i_blk], FDEC_STRIDE );
    }
}
/* Public MSA entry point for the DC-only case of a 16x16 area.
 * The area is a 4x4 grid of 4x4 blocks; the block in grid row i_row and grid
 * column i_col uses coefficient pi_dct[4 * i_row + i_col]. Blocks are visited
 * row by row, left to right. */
void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] )
{
    for( int32_t i_row = 0; i_row < 4; i_row++ )
    {
        uint8_t *p_row = p_dst + 4 * i_row * FDEC_STRIDE;
        int16_t *p_row_dc = pi_dct + 4 * i_row;

        for( int32_t i_col = 0; i_col < 4; i_col++ )
        {
            avc_idct4x4_addblk_dc_msa( p_row + 4 * i_col, &p_row_dc[i_col],
                                       FDEC_STRIDE );
        }
    }
}
/* Public MSA entry point for one 4x4 block: forwards p_src (row stride
 * FENC_STRIDE), p_ref (row stride FDEC_STRIDE) and the 16-coefficient output
 * p_dst to avc_sub4x4_dct_msa. */
void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src,
                          uint8_t *p_ref )
{
    const int32_t i_src_stride = FENC_STRIDE;
    const int32_t i_ref_stride = FDEC_STRIDE;

    avc_sub4x4_dct_msa( p_src, i_src_stride, p_ref, i_ref_stride, p_dst );
}
/* Public MSA entry point for an 8x8 area made of four 4x4 blocks.
 * Block i_blk (top-left, top-right, bottom-left, bottom-right) reads p_src
 * with stride FENC_STRIDE and p_ref with stride FDEC_STRIDE, and writes its
 * 16 coefficients to p_dst[i_blk]. */
void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src,
                          uint8_t *p_ref )
{
    for( int32_t i_blk = 0; i_blk < 4; i_blk++ )
    {
        /* Bit 0 of i_blk selects the column (0 or 4 pixels), bit 1 the row. */
        const int32_t i_x = ( i_blk & 1 ) * 4;
        const int32_t i_y = ( i_blk >> 1 ) * 4;
        uint8_t *p_src_blk = p_src + i_y * FENC_STRIDE + i_x;
        uint8_t *p_ref_blk = p_ref + i_y * FDEC_STRIDE + i_x;

        avc_sub4x4_dct_msa( p_src_blk, FENC_STRIDE,
                            p_ref_blk, FDEC_STRIDE, p_dst[i_blk] );
    }
}
/* Public MSA entry point for a 16x16 area made of four 8x8 quadrants.
 * Quadrant i_quad (top-left, top-right, bottom-left, bottom-right) is handled
 * by x264_sub8x8_dct_msa and fills the four coefficient blocks starting at
 * p_dst[4 * i_quad]. */
void x264_sub16x16_dct_msa( int16_t p_dst[16][16],
                            uint8_t *p_src,
                            uint8_t *p_ref )
{
    for( int32_t i_quad = 0; i_quad < 4; i_quad++ )
    {
        /* Bit 0 of i_quad selects the column (0 or 8 pixels), bit 1 the row. */
        const int32_t i_x = ( i_quad & 1 ) * 8;
        const int32_t i_y = ( i_quad >> 1 ) * 8;
        uint8_t *p_src_quad = p_src + i_y * FENC_STRIDE + i_x;
        uint8_t *p_ref_quad = p_ref + i_y * FDEC_STRIDE + i_x;

        x264_sub8x8_dct_msa( &p_dst[4 * i_quad], p_src_quad, p_ref_quad );
    }
}
/* Computes four DC values for an 8x8 area, MSA version.
 *
 * pi_dct : output, four int16_t values.
 * p_pix1 : first 8x8 block, row stride FENC_STRIDE.
 * p_pix2 : second 8x8 block, row stride FDEC_STRIDE.
 *
 * Each 4x4 sub-block is first reduced to a single sum by
 * subtract_sum4x4_msa; the four sums are then combined by two BUTTERFLY_4
 * stages.
 * NOTE(review): BUTTERFLY_4 is defined outside this chunk; it presumably
 * forms sums/differences of its first four arguments into its last four -
 * confirm, since the argument order below is significant. */
void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4],
uint8_t *p_pix1, uint8_t *p_pix2 )
{
/* 32-bit temporaries for the intermediate butterfly stage. */
int32_t d0, d1, d2, d3;
/* Per-block sums: top-left, top-right, bottom-left, bottom-right. */
pi_dct[0] = subtract_sum4x4_msa( &p_pix1[0], FENC_STRIDE,
&p_pix2[0], FDEC_STRIDE );
pi_dct[1] = subtract_sum4x4_msa( &p_pix1[4], FENC_STRIDE,
&p_pix2[4], FDEC_STRIDE );
pi_dct[2] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 0], FENC_STRIDE,
&p_pix2[4 * FDEC_STRIDE + 0],
FDEC_STRIDE );
pi_dct[3] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 4], FENC_STRIDE,
&p_pix2[4 * FDEC_STRIDE + 4],
FDEC_STRIDE );
/* First stage reads pi_dct[] and writes d0..d3; second stage writes the
 * final values back into pi_dct[] (narrowed to int16_t on assignment). */
BUTTERFLY_4( pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1], d0, d1, d3, d2 );
BUTTERFLY_4( d0, d2, d3, d1, pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1] );
}
/* Computes eight DC values for an 8-wide, 16-high area, MSA version.
 *
 * pi_dct : output, eight int16_t values.
 * p_pix1 : first block, row stride FENC_STRIDE.
 * p_pix2 : second block, row stride FDEC_STRIDE.
 *
 * The area is split into eight 4x4 sub-blocks (two columns by four rows);
 * each is reduced to one sum by subtract_sum4x4_msa, and the eight sums are
 * combined by three BUTTERFLY_8 stages.
 * NOTE(review): BUTTERFLY_8 is defined outside this chunk; the argument
 * permutations below (including the output index order of the last stage)
 * are order-critical - confirm against the macro before changing anything. */
void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8],
uint8_t *p_pix1, uint8_t *p_pix2 )
{
/* a* and b* are used alternately as input and output of the stages. */
int32_t a0, a1, a2, a3, a4, a5, a6, a7;
int32_t b0, b1, b2, b3, b4, b5, b6, b7;
/* Rows 0-3: left and right 4x4 blocks. */
a0 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 0], FENC_STRIDE,
&p_pix2[ 0 * FDEC_STRIDE + 0], FDEC_STRIDE );
a1 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 4], FENC_STRIDE,
&p_pix2[ 0 * FDEC_STRIDE + 4], FDEC_STRIDE );
/* Rows 4-7. */
a2 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 0], FENC_STRIDE,
&p_pix2[ 4 * FDEC_STRIDE + 0], FDEC_STRIDE );
a3 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 4], FENC_STRIDE,
&p_pix2[ 4 * FDEC_STRIDE + 4], FDEC_STRIDE );
/* Rows 8-11. */
a4 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 0], FENC_STRIDE,
&p_pix2[ 8 * FDEC_STRIDE + 0], FDEC_STRIDE );
a5 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 4], FENC_STRIDE,
&p_pix2[ 8 * FDEC_STRIDE + 4], FDEC_STRIDE );
/* Rows 12-15. */
a6 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 0], FENC_STRIDE,
&p_pix2[12 * FDEC_STRIDE + 0], FDEC_STRIDE );
a7 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 4], FENC_STRIDE,
&p_pix2[12 * FDEC_STRIDE + 4], FDEC_STRIDE );
/* Stage 1: a* -> b*. */
BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1,
b0, b1, b2, b3, b7, b6, b5, b4 );
/* Stage 2: b* -> a*. */
BUTTERFLY_8( b0, b2, b4, b6, b7, b5, b3, b1,
a0, a1, a2, a3, a7, a6, a5, a4 );
/* Stage 3: a* -> pi_dct[], written in a permuted index order and narrowed
 * to int16_t on assignment. */
BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1,
pi_dct[0], pi_dct[1], pi_dct[6], pi_dct[7],
pi_dct[5], pi_dct[4], pi_dct[3], pi_dct[2] );
}
/* Public MSA entry point for the 4x4 frame zigzag scan.
 * Forwards to avc_zigzag_scan_4x4_frame_msa with the two arguments swapped:
 * the helper receives pi_dct first and pi_level second.
 * NOTE(review): presumably pi_dct is the input and pi_level the output -
 * confirm against the helper, which is defined outside this chunk. */
void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] )
{
    int16_t *p_first = pi_dct;
    int16_t *p_second = pi_level;

    avc_zigzag_scan_4x4_frame_msa( p_first, p_second );
}
#endif

64
common/mips/dct.h Normal file
View File

@@ -0,0 +1,64 @@
/*****************************************************************************
* dct.h: msa transform and zigzag
*****************************************************************************
* Copyright (C) 2015-2025 x264 project
*
* Authors: Rishikesh More <rishikesh.more@imgtec.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
/* Include guard. */
#ifndef X264_MIPS_DCT_H
#define X264_MIPS_DCT_H
/* Prototypes for the MSA transform and zigzag entry points.
 * Every public name is first redefined through x264_template().
 * NOTE(review): x264_template is defined outside this header; presumably it
 * decorates the symbol for the current build configuration - confirm. */

/* 4x4 DC transform and its inverse; both operate on d in place. */
#define x264_dct4x4dc_msa x264_template(dct4x4dc_msa)
void x264_dct4x4dc_msa( int16_t d[16] );
#define x264_idct4x4dc_msa x264_template(idct4x4dc_msa)
void x264_idct4x4dc_msa( int16_t d[16] );
/* "add...idct" family: take a pixel destination and 4x4-block coefficients. */
#define x264_add4x4_idct_msa x264_template(add4x4_idct_msa)
void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] );
#define x264_add8x8_idct_msa x264_template(add8x8_idct_msa)
void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] );
#define x264_add16x16_idct_msa x264_template(add16x16_idct_msa)
void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] );
/* Same family using 8x8 (64-coefficient) blocks. */
#define x264_add8x8_idct8_msa x264_template(add8x8_idct8_msa)
void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] );
#define x264_add16x16_idct8_msa x264_template(add16x16_idct8_msa)
void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] );
/* DC-only variants: one coefficient per 4x4 block. */
#define x264_add8x8_idct_dc_msa x264_template(add8x8_idct_dc_msa)
void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] );
#define x264_add16x16_idct_dc_msa x264_template(add16x16_idct_dc_msa)
void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] );
/* "sub...dct" family: take source and reference pixels, produce coefficients. */
#define x264_sub4x4_dct_msa x264_template(sub4x4_dct_msa)
void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src, uint8_t *p_ref );
#define x264_sub8x8_dct_msa x264_template(sub8x8_dct_msa)
void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src,
uint8_t *p_ref );
#define x264_sub16x16_dct_msa x264_template(sub16x16_dct_msa)
void x264_sub16x16_dct_msa( int16_t p_dst[16][16], uint8_t *p_src,
uint8_t *p_ref );
/* DC-only variants: one output value per 4x4 block. */
#define x264_sub8x8_dct_dc_msa x264_template(sub8x8_dct_dc_msa)
void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4], uint8_t *p_pix1,
uint8_t *p_pix2 );
#define x264_sub8x16_dct_dc_msa x264_template(sub8x16_dct_dc_msa)
void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8], uint8_t *p_pix1,
uint8_t *p_pix2 );
/* 4x4 frame zigzag scan. */
#define x264_zigzag_scan_4x4_frame_msa x264_template(zigzag_scan_4x4_frame_msa)
void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] );
#endif

2011
common/mips/deblock-c.c Normal file

File diff suppressed because it is too large Load Diff

52
common/mips/deblock.h Normal file
View File

@@ -0,0 +1,52 @@
/*****************************************************************************
* deblock.h: msa deblocking
*****************************************************************************
* Copyright (C) 2017-2025 x264 project
*
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
/* Include guard. */
#ifndef X264_MIPS_DEBLOCK_H
#define X264_MIPS_DEBLOCK_H
/* Prototypes for the MSA deblocking entry points.
 * They are only declared when HIGH_BIT_DEPTH is false; all of them take
 * uint8_t pixels. Every public name is first redefined through
 * x264_template().
 * NOTE(review): x264_template is defined outside this header; presumably it
 * decorates the symbol for the current build configuration - confirm. */
#if !HIGH_BIT_DEPTH
/* Luma/chroma filters in the "v" and "h" variants, taking a tc0 table. */
#define x264_deblock_v_luma_msa x264_template(deblock_v_luma_msa)
void x264_deblock_v_luma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_luma_msa x264_template(deblock_h_luma_msa)
void x264_deblock_h_luma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_v_chroma_msa x264_template(deblock_v_chroma_msa)
void x264_deblock_v_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_msa x264_template(deblock_h_chroma_msa)
void x264_deblock_h_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
/* "intra" variants: same parameters without the tc0 table. */
#define x264_deblock_v_luma_intra_msa x264_template(deblock_v_luma_intra_msa)
void x264_deblock_v_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_luma_intra_msa x264_template(deblock_h_luma_intra_msa)
void x264_deblock_h_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_v_chroma_intra_msa x264_template(deblock_v_chroma_intra_msa)
void x264_deblock_v_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_chroma_intra_msa x264_template(deblock_h_chroma_intra_msa)
void x264_deblock_h_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
/* Strength computation; bs is the only non-scalar parameter that is not
 * named as an input table.
 * NOTE(review): presumably bs is the output - confirm in deblock-c.c. */
#define x264_deblock_strength_msa x264_template(deblock_strength_msa)
void x264_deblock_strength_msa( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit,
int bframe );
#endif
#endif

1952
common/mips/macros.h Normal file

File diff suppressed because it is too large Load Diff

3696
common/mips/mc-c.c Normal file

File diff suppressed because it is too large Load Diff

32
common/mips/mc.h Normal file
View File

@@ -0,0 +1,32 @@
/*****************************************************************************
* mc.h: msa motion compensation
*****************************************************************************
* Copyright (C) 2015-2025 x264 project
*
* Authors: Neha Rana <neha.rana@imgtec.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
/* Include guard. */
#ifndef X264_MIPS_MC_H
#define X264_MIPS_MC_H
/* Initialisation hook for the MIPS motion-compensation code: takes a cpu
 * value and a pointer to an x264_mc_functions_t.
 * NOTE(review): presumably cpu is a capability bit mask and the function
 * installs MSA implementations into *pf - confirm in mc-c.c, which is not
 * part of this chunk. */
#define x264_mc_init_mips x264_template(mc_init_mips)
void x264_mc_init_mips( uint32_t cpu, x264_mc_functions_t *pf );
#endif

1491
common/mips/pixel-c.c Normal file

File diff suppressed because it is too large Load Diff

228
common/mips/pixel.h Normal file
View File

@@ -0,0 +1,228 @@
/*****************************************************************************
* pixel.h: msa pixel metrics
*****************************************************************************
* Copyright (C) 2015-2025 x264 project
*
* Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
/* Include guard (closed at the end of this header). */
#ifndef X264_MIPS_PIXEL_H
#define X264_MIPS_PIXEL_H
/* Prototypes for the MSA pixel-metric entry points. Every public name is
 * first redefined through x264_template().
 * NOTE(review): x264_template is defined outside this header; presumably it
 * decorates the symbol for the current build configuration - confirm. */

/* pixel_sad_WxH: compare a source block with a reference block, each with
 * its own stride, and return an int32_t score.
 * NOTE(review): by name, "sad" is the sum of absolute differences and WxH
 * the block size - implementations live in pixel-c.c, not shown here. */
#define x264_pixel_sad_16x16_msa x264_template(pixel_sad_16x16_msa)
int32_t x264_pixel_sad_16x16_msa( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_16x8_msa x264_template(pixel_sad_16x8_msa)
int32_t x264_pixel_sad_16x8_msa( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_8x16_msa x264_template(pixel_sad_8x16_msa)
int32_t x264_pixel_sad_8x16_msa( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_8x8_msa x264_template(pixel_sad_8x8_msa)
int32_t x264_pixel_sad_8x8_msa( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_8x4_msa x264_template(pixel_sad_8x4_msa)
int32_t x264_pixel_sad_8x4_msa( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_4x16_msa x264_template(pixel_sad_4x16_msa)
int32_t x264_pixel_sad_4x16_msa( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_4x8_msa x264_template(pixel_sad_4x8_msa)
int32_t x264_pixel_sad_4x8_msa( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_sad_4x4_msa x264_template(pixel_sad_4x4_msa)
int32_t x264_pixel_sad_4x4_msa( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
/* pixel_sad_x4_WxH: one source block against four reference blocks that
 * share a single stride; four results are written to p_sad_array.
 * Note that no source stride is passed.
 * NOTE(review): presumably the source uses the fixed FENC_STRIDE layout -
 * confirm in pixel-c.c. */
#define x264_pixel_sad_x4_16x16_msa x264_template(pixel_sad_x4_16x16_msa)
void x264_pixel_sad_x4_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_16x8_msa x264_template(pixel_sad_x4_16x8_msa)
void x264_pixel_sad_x4_16x8_msa( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_8x16_msa x264_template(pixel_sad_x4_8x16_msa)
void x264_pixel_sad_x4_8x16_msa( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_8x8_msa x264_template(pixel_sad_x4_8x8_msa)
void x264_pixel_sad_x4_8x8_msa( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_8x4_msa x264_template(pixel_sad_x4_8x4_msa)
void x264_pixel_sad_x4_8x4_msa( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_4x8_msa x264_template(pixel_sad_x4_4x8_msa)
void x264_pixel_sad_x4_4x8_msa( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
#define x264_pixel_sad_x4_4x4_msa x264_template(pixel_sad_x4_4x4_msa)
void x264_pixel_sad_x4_4x4_msa( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
uint8_t *p_ref3, intptr_t i_ref_stride,
int32_t p_sad_array[4] );
/* pixel_sad_x3_WxH: one source block against three reference blocks that
 * share a single stride; three results are written to p_sad_array.
 * Note that no source stride is passed.
 * NOTE(review): presumably the source uses the fixed FENC_STRIDE layout -
 * confirm in pixel-c.c. */
#define x264_pixel_sad_x3_16x16_msa x264_template(pixel_sad_x3_16x16_msa)
void x264_pixel_sad_x3_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_16x8_msa x264_template(pixel_sad_x3_16x8_msa)
void x264_pixel_sad_x3_16x8_msa( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_8x16_msa x264_template(pixel_sad_x3_8x16_msa)
void x264_pixel_sad_x3_8x16_msa( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_8x8_msa x264_template(pixel_sad_x3_8x8_msa)
void x264_pixel_sad_x3_8x8_msa( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_8x4_msa x264_template(pixel_sad_x3_8x4_msa)
void x264_pixel_sad_x3_8x4_msa( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_4x8_msa x264_template(pixel_sad_x3_4x8_msa)
void x264_pixel_sad_x3_4x8_msa( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
#define x264_pixel_sad_x3_4x4_msa x264_template(pixel_sad_x3_4x4_msa)
void x264_pixel_sad_x3_4x4_msa( uint8_t *p_src, uint8_t *p_ref0,
uint8_t *p_ref1, uint8_t *p_ref2,
intptr_t i_ref_stride,
int32_t p_sad_array[3] );
/* pixel_ssd_WxH: same parameter layout as pixel_sad_WxH (source and
 * reference block, each with its own stride), returning an int32_t score.
 * NOTE(review): by name, "ssd" is the sum of squared differences -
 * implementations live in pixel-c.c, not shown here. */
#define x264_pixel_ssd_16x16_msa x264_template(pixel_ssd_16x16_msa)
int32_t x264_pixel_ssd_16x16_msa( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_16x8_msa x264_template(pixel_ssd_16x8_msa)
int32_t x264_pixel_ssd_16x8_msa( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_8x16_msa x264_template(pixel_ssd_8x16_msa)
int32_t x264_pixel_ssd_8x16_msa( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_8x8_msa x264_template(pixel_ssd_8x8_msa)
int32_t x264_pixel_ssd_8x8_msa( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_8x4_msa x264_template(pixel_ssd_8x4_msa)
int32_t x264_pixel_ssd_8x4_msa( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_4x16_msa x264_template(pixel_ssd_4x16_msa)
int32_t x264_pixel_ssd_4x16_msa( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_4x8_msa x264_template(pixel_ssd_4x8_msa)
int32_t x264_pixel_ssd_4x8_msa( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
#define x264_pixel_ssd_4x4_msa x264_template(pixel_ssd_4x4_msa)
int32_t x264_pixel_ssd_4x4_msa( uint8_t *p_src, intptr_t i_src_stride,
uint8_t *p_ref, intptr_t i_ref_stride );
/* intra_sad_x3_*: take an encode-side block and either a decode-side block
 * (p_dec) or a 36-byte edge buffer (p_edge, 8x8 only), and write three
 * results to p_sad_array.
 * NOTE(review): presumably the three results are the costs of three intra
 * prediction modes; which modes and in what order is not visible here. */
#define x264_intra_sad_x3_4x4_msa x264_template(intra_sad_x3_4x4_msa)
void x264_intra_sad_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
#define x264_intra_sad_x3_16x16_msa x264_template(intra_sad_x3_16x16_msa)
void x264_intra_sad_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
#define x264_intra_sad_x3_8x8_msa x264_template(intra_sad_x3_8x8_msa)
void x264_intra_sad_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36],
int32_t p_sad_array[3] );
#define x264_intra_sad_x3_8x8c_msa x264_template(intra_sad_x3_8x8c_msa)
void x264_intra_sad_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
/* SSIM core: reads two pixel blocks (read-only) and fills i_sums[2][4].
 * NOTE(review): by name this covers two adjacent 4x4 blocks, four partial
 * sums each - confirm in pixel-c.c. */
#define x264_ssim_4x4x2_core_msa x264_template(ssim_4x4x2_core_msa)
void x264_ssim_4x4x2_core_msa( const uint8_t *p_pix1, intptr_t i_stride1,
const uint8_t *p_pix2, intptr_t i_stride2,
int32_t i_sums[2][4] );
/* pixel_hadamard_ac_WxH: single-block metric on p_pix with stride i_stride,
 * returning a 64-bit value.
 * NOTE(review): presumably two 32-bit results are packed into the uint64_t;
 * the packing is not visible in this header - confirm in pixel-c.c. */
#define x264_pixel_hadamard_ac_8x8_msa x264_template(pixel_hadamard_ac_8x8_msa)
uint64_t x264_pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_hadamard_ac_8x16_msa x264_template(pixel_hadamard_ac_8x16_msa)
uint64_t x264_pixel_hadamard_ac_8x16_msa( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_hadamard_ac_16x8_msa x264_template(pixel_hadamard_ac_16x8_msa)
uint64_t x264_pixel_hadamard_ac_16x8_msa( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_hadamard_ac_16x16_msa x264_template(pixel_hadamard_ac_16x16_msa)
uint64_t x264_pixel_hadamard_ac_16x16_msa( uint8_t *p_pix, intptr_t i_stride );
/* pixel_satd_WxH and pixel_sa8d_WxH: two-block metrics with one stride per
 * block (i_stride for p_pix1, i_stride2 for p_pix2), returning an int32_t.
 * NOTE(review): by name, "satd" is a sum of absolute transformed differences
 * and "sa8d" its 8x8-transform variant - confirm in pixel-c.c. */
#define x264_pixel_satd_4x4_msa x264_template(pixel_satd_4x4_msa)
int32_t x264_pixel_satd_4x4_msa( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_4x8_msa x264_template(pixel_satd_4x8_msa)
int32_t x264_pixel_satd_4x8_msa( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_4x16_msa x264_template(pixel_satd_4x16_msa)
int32_t x264_pixel_satd_4x16_msa( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_8x4_msa x264_template(pixel_satd_8x4_msa)
int32_t x264_pixel_satd_8x4_msa( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_8x8_msa x264_template(pixel_satd_8x8_msa)
int32_t x264_pixel_satd_8x8_msa( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_8x16_msa x264_template(pixel_satd_8x16_msa)
int32_t x264_pixel_satd_8x16_msa( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_16x8_msa x264_template(pixel_satd_16x8_msa)
int32_t x264_pixel_satd_16x8_msa( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_satd_16x16_msa x264_template(pixel_satd_16x16_msa)
int32_t x264_pixel_satd_16x16_msa( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_sa8d_8x8_msa x264_template(pixel_sa8d_8x8_msa)
int32_t x264_pixel_sa8d_8x8_msa( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
#define x264_pixel_sa8d_16x16_msa x264_template(pixel_sa8d_16x16_msa)
int32_t x264_pixel_sa8d_16x16_msa( uint8_t *p_pix1, intptr_t i_stride,
uint8_t *p_pix2, intptr_t i_stride2 );
/* intra_satd_x3_* / intra_sa8d_x3_8x8: same parameter layout as the
 * intra_sad_x3_* prototypes (encode-side block plus decode-side block or
 * 36-byte edge buffer), writing three results to p_sad_array.
 * NOTE(review): presumably one cost per candidate intra mode; the mode order
 * is not visible in this header. */
#define x264_intra_satd_x3_4x4_msa x264_template(intra_satd_x3_4x4_msa)
void x264_intra_satd_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
#define x264_intra_satd_x3_16x16_msa x264_template(intra_satd_x3_16x16_msa)
void x264_intra_satd_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
#define x264_intra_sa8d_x3_8x8_msa x264_template(intra_sa8d_x3_8x8_msa)
void x264_intra_sa8d_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36],
int32_t p_sad_array[3] );
#define x264_intra_satd_x3_8x8c_msa x264_template(intra_satd_x3_8x8c_msa)
void x264_intra_satd_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec,
int32_t p_sad_array[3] );
/* pixel_var_WxH: single-block metric on p_pix with stride i_stride,
 * returning a 64-bit value.
 * NOTE(review): presumably the pixel sum and sum of squares packed into one
 * uint64_t - confirm in pixel-c.c. */
#define x264_pixel_var_16x16_msa x264_template(pixel_var_16x16_msa)
uint64_t x264_pixel_var_16x16_msa( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_var_8x16_msa x264_template(pixel_var_8x16_msa)
uint64_t x264_pixel_var_8x16_msa( uint8_t *p_pix, intptr_t i_stride );
#define x264_pixel_var_8x8_msa x264_template(pixel_var_8x8_msa)
uint64_t x264_pixel_var_8x8_msa( uint8_t *p_pix, intptr_t i_stride );
/* pixel_var2_8xH: two-block metric that returns an int32_t and additionally
 * writes through p_ssd. */
#define x264_pixel_var2_8x16_msa x264_template(pixel_var2_8x16_msa)
int32_t x264_pixel_var2_8x16_msa( uint8_t *p_pix1, intptr_t i_stride1,
uint8_t *p_pix2, intptr_t i_stride2,
int32_t *p_ssd );
#define x264_pixel_var2_8x8_msa x264_template(pixel_var2_8x8_msa)
int32_t x264_pixel_var2_8x8_msa( uint8_t *p_pix1, intptr_t i_stride1,
uint8_t *p_pix2, intptr_t i_stride2,
int32_t *p_ssd );
/* End of the X264_MIPS_PIXEL_H include guard. */
#endif

608
common/mips/predict-c.c Normal file
View File

@@ -0,0 +1,608 @@
/*****************************************************************************
* predict-c.c: msa intra prediction
*****************************************************************************
* Copyright (C) 2015-2025 x264 project
*
* Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "macros.h"
#include "predict.h"
#if !HIGH_BIT_DEPTH
/* Vertical intra prediction, 4x4: loads one 32-bit word (4 pixels) from
 * p_src and stores that same word to four consecutive rows of p_dst, which
 * are i_dst_stride bytes apart. */
static void intra_predict_vert_4x4_msa( uint8_t *p_src, uint8_t *p_dst,
                                        int32_t i_dst_stride )
{
    uint32_t u_top_row = LW( p_src );

    SW4( u_top_row, u_top_row, u_top_row, u_top_row, p_dst, i_dst_stride );
}
/* Vertical intra prediction, 8x8: loads one 64-bit word (8 pixels) from
 * p_src and stores that same word to eight consecutive rows of p_dst, which
 * are i_dst_stride bytes apart. */
static void intra_predict_vert_8x8_msa( uint8_t *p_src, uint8_t *p_dst,
                                        int32_t i_dst_stride )
{
    uint64_t u_top_row = LD( p_src );
    /* Start of rows 4..7; rows 0..3 start at p_dst itself. */
    uint8_t *p_lower = p_dst + ( 4 * i_dst_stride );

    SD4( u_top_row, u_top_row, u_top_row, u_top_row, p_dst, i_dst_stride );
    SD4( u_top_row, u_top_row, u_top_row, u_top_row, p_lower, i_dst_stride );
}
/* Vertical intra prediction, 16x16: loads one 16-byte vector from p_src and
 * stores that same vector to sixteen consecutive rows of p_dst, which are
 * i_dst_stride bytes apart. */
static void intra_predict_vert_16x16_msa( uint8_t *p_src, uint8_t *p_dst,
                                          int32_t i_dst_stride )
{
    v16u8 top_row = LD_UB( p_src );
    /* Start of rows 8..15; rows 0..7 start at p_dst itself. */
    uint8_t *p_lower = p_dst + ( 8 * i_dst_stride );

    ST_UB8( top_row, top_row, top_row, top_row,
            top_row, top_row, top_row, top_row, p_dst, i_dst_stride );
    ST_UB8( top_row, top_row, top_row, top_row,
            top_row, top_row, top_row, top_row, p_lower, i_dst_stride );
}
/* Horizontal intra prediction, 4x4.
 *
 * For each of the four rows, reads one pixel from p_src (rows are
 * i_src_stride bytes apart), replicates it into all four bytes of a 32-bit
 * word and stores that word to the matching row of p_dst (rows are
 * i_dst_stride bytes apart).
 *
 * The replication multiplies by an UNSIGNED constant. A uint8_t operand is
 * promoted to int, so multiplying by the plain int literal 0x01010101
 * overflows signed int for any pixel value >= 128, which is undefined
 * behaviour. With the 'u' suffix the arithmetic is done in unsigned int and
 * is well defined; the stored bit pattern is unchanged. */
static void intra_predict_horiz_4x4_msa( uint8_t *p_src, int32_t i_src_stride,
                                         uint8_t *p_dst, int32_t i_dst_stride )
{
    uint32_t u_out0, u_out1, u_out2, u_out3;

    u_out0 = p_src[0 * i_src_stride] * 0x01010101u;
    u_out1 = p_src[1 * i_src_stride] * 0x01010101u;
    u_out2 = p_src[2 * i_src_stride] * 0x01010101u;
    u_out3 = p_src[3 * i_src_stride] * 0x01010101u;

    SW4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
}
/* Horizontal intra prediction, 8x8: for each of the eight rows, reads one
 * pixel from p_src (rows are i_src_stride bytes apart), replicates it into
 * all eight bytes of a 64-bit word and stores that word to the matching row
 * of p_dst (rows are i_dst_stride bytes apart). */
static void intra_predict_horiz_8x8_msa( uint8_t *p_src, int32_t i_src_stride,
                                         uint8_t *p_dst, int32_t i_dst_stride )
{
    uint64_t u_rows[8];
    /* Start of rows 4..7; rows 0..3 start at p_dst itself. */
    uint8_t *p_lower = p_dst + ( 4 * i_dst_stride );

    for( int32_t i_row = 0; i_row < 8; i_row++ )
    {
        /* The ull constant makes the multiply 64-bit unsigned. */
        u_rows[i_row] = p_src[i_row * i_src_stride] * 0x0101010101010101ull;
    }

    SD4( u_rows[0], u_rows[1], u_rows[2], u_rows[3], p_dst, i_dst_stride );
    SD4( u_rows[4], u_rows[5], u_rows[6], u_rows[7], p_lower, i_dst_stride );
}
static void intra_predict_horiz_16x16_msa( uint8_t *p_src, int32_t i_src_stride,
uint8_t *p_dst,
int32_t i_dst_stride )
{
uint32_t u_row;
uint8_t u_inp0, u_inp1, u_inp2, u_inp3;
v16u8 src0, src1, src2, src3;
for( u_row = 4; u_row--; )
{
u_inp0 = p_src[0];
p_src += i_src_stride;
u_inp1 = p_src[0];
p_src += i_src_stride;
u_inp2 = p_src[0];
p_src += i_src_stride;
u_inp3 = p_src[0];
p_src += i_src_stride;
src0 = ( v16u8 ) __msa_fill_b( u_inp0 );
src1 = ( v16u8 ) __msa_fill_b( u_inp1 );
src2 = ( v16u8 ) __msa_fill_b( u_inp2 );
src3 = ( v16u8 ) __msa_fill_b( u_inp3 );
ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
p_dst += ( 4 * i_dst_stride );
}
}
static void intra_predict_dc_4x4_msa( uint8_t *p_src_top, uint8_t *p_src_left,
int32_t i_src_stride_left,
uint8_t *p_dst, int32_t i_dst_stride,
uint8_t is_above, uint8_t is_left )
{
uint32_t u_row;
uint32_t u_out, u_addition = 0;
v16u8 src_above, store;
v8u16 sum_above;
v4u32 sum;
if( is_left && is_above )
{
src_above = LD_UB( p_src_top );
sum_above = __msa_hadd_u_h( src_above, src_above );
sum = __msa_hadd_u_w( sum_above, sum_above );
u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 );
for( u_row = 0; u_row < 4; u_row++ )
{
u_addition += p_src_left[u_row * i_src_stride_left];
}
u_addition = ( u_addition + 4 ) >> 3;
store = ( v16u8 ) __msa_fill_b( u_addition );
}
else if( is_left )
{
for( u_row = 0; u_row < 4; u_row++ )
{
u_addition += p_src_left[u_row * i_src_stride_left];
}
u_addition = ( u_addition + 2 ) >> 2;
store = ( v16u8 ) __msa_fill_b( u_addition );
}
else if( is_above )
{
src_above = LD_UB( p_src_top );
sum_above = __msa_hadd_u_h( src_above, src_above );
sum = __msa_hadd_u_w( sum_above, sum_above );
sum = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum, 2 );
store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 );
}
else
{
store = ( v16u8 ) __msa_ldi_b( 128 );
}
u_out = __msa_copy_u_w( ( v4i32 ) store, 0 );
SW4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride );
}
static void intra_predict_dc_8x8_msa( uint8_t *p_src_top, uint8_t *p_src_left,
uint8_t *p_dst, int32_t i_dst_stride )
{
uint64_t u_val0, u_val1;
v16i8 store;
v16u8 src = { 0 };
v8u16 sum_h;
v4u32 sum_w;
v2u64 sum_d;
u_val0 = LD( p_src_top );
u_val1 = LD( p_src_left );
INSERT_D2_UB( u_val0, u_val1, src );
sum_h = __msa_hadd_u_h( src, src );
sum_w = __msa_hadd_u_w( sum_h, sum_h );
sum_d = __msa_hadd_u_d( sum_w, sum_w );
sum_w = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum_d, ( v4i32 ) sum_d );
sum_d = __msa_hadd_u_d( sum_w, sum_w );
sum_w = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum_d, 4 );
store = __msa_splati_b( ( v16i8 ) sum_w, 0 );
u_val0 = __msa_copy_u_d( ( v2i64 ) store, 0 );
SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride );
p_dst += ( 4 * i_dst_stride );
SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride );
}
static void intra_predict_dc_16x16_msa( uint8_t *p_src_top, uint8_t *p_src_left,
int32_t i_src_stride_left,
uint8_t *p_dst, int32_t i_dst_stride,
uint8_t is_above, uint8_t is_left )
{
uint32_t u_row;
uint32_t u_addition = 0;
v16u8 src_above, store;
v8u16 sum_above;
v4u32 sum_top;
v2u64 sum;
if( is_left && is_above )
{
src_above = LD_UB( p_src_top );
sum_above = __msa_hadd_u_h( src_above, src_above );
sum_top = __msa_hadd_u_w( sum_above, sum_above );
sum = __msa_hadd_u_d( sum_top, sum_top );
sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum );
sum = __msa_hadd_u_d( sum_top, sum_top );
u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 );
for( u_row = 0; u_row < 16; u_row++ )
{
u_addition += p_src_left[u_row * i_src_stride_left];
}
u_addition = ( u_addition + 16 ) >> 5;
store = ( v16u8 ) __msa_fill_b( u_addition );
}
else if( is_left )
{
for( u_row = 0; u_row < 16; u_row++ )
{
u_addition += p_src_left[u_row * i_src_stride_left];
}
u_addition = ( u_addition + 8 ) >> 4;
store = ( v16u8 ) __msa_fill_b( u_addition );
}
else if( is_above )
{
src_above = LD_UB( p_src_top );
sum_above = __msa_hadd_u_h( src_above, src_above );
sum_top = __msa_hadd_u_w( sum_above, sum_above );
sum = __msa_hadd_u_d( sum_top, sum_top );
sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum );
sum = __msa_hadd_u_d( sum_top, sum_top );
sum = ( v2u64 ) __msa_srari_d( ( v2i64 ) sum, 4 );
store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 );
}
else
{
store = ( v16u8 ) __msa_ldi_b( 128 );
}
ST_UB8( store, store, store, store, store, store, store, store, p_dst,
i_dst_stride );
p_dst += ( 8 * i_dst_stride );
ST_UB8( store, store, store, store, store, store, store, store, p_dst,
i_dst_stride );
}
/* Plane intra prediction for an 8x8 block, computed in place.
 *
 * p_src:    top-left pixel of the block to fill. Neighbours are read from the
 *           row above (starting at the corner p_src[-i_stride - 1]) and from
 *           the column to the left (p_src[k * i_stride - 1], k = -1..7).
 * i_stride: distance in bytes between consecutive rows.
 *
 * Two gradients (one from the top row, one from the left column) and a base
 * value are derived from the neighbours; every output pixel is then
 * ( base + h * x + v * y ) >> 5, clipped to 0..255. */
static void intra_predict_plane_8x8_msa( uint8_t *p_src, int32_t i_stride )
{
uint8_t u_lpcnt;
int32_t i_res, i_res0, i_res1, i_res2, i_res3;
uint64_t u_out0, u_out1;
/* Selects byte pairs (3,5) (2,6) (1,7) (0,8) of the vector loaded below;
 * byte k of that load is the top-row pixel at column k - 1. The pattern is
 * repeated in both halves of the mask. */
v16i8 shf_mask = { 3, 5, 2, 6, 1, 7, 0, 8, 3, 5, 2, 6, 1, 7, 0, 8 };
/* Weights 1..4 applied to the four pairwise differences. */
v8i16 short_multiplier = { 1, 2, 3, 4, 1, 2, 3, 4 };
/* Column indices 0..3 used to build the per-column horizontal offsets. */
v4i32 int_multiplier = { 0, 1, 2, 3 };
v16u8 p_src_top;
v8i16 vec9, vec10, vec11;
v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8;
v2i64 sum;
/* Load 16 bytes of the row above, starting at the top-left corner. */
p_src_top = LD_UB( p_src - ( i_stride + 1 ) );
p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top,
( v16i8 ) p_src_top );
/* Pairwise differences of the shuffled bytes, weighted and summed: the
 * horizontal gradient.
 * NOTE(review): assumes hsub_u_h yields (odd byte - even byte) per pair,
 * which would mirror the scalar left-column sum below - confirm against the
 * MSA reference. */
vec9 = __msa_hsub_u_h( p_src_top, p_src_top );
vec9 *= short_multiplier;
vec8 = __msa_hadd_s_w( vec9, vec9 );
sum = __msa_hadd_s_d( vec8, vec8 );
i_res0 = __msa_copy_s_w( ( v4i32 ) sum, 0 );
/* Vertical gradient from the left column: rows 4..7 minus rows 2..-1,
 * weighted 1..4. */
i_res1 = ( p_src[4 * i_stride - 1] - p_src[2 * i_stride - 1] ) +
2 * ( p_src[5 * i_stride - 1] - p_src[i_stride - 1] ) +
3 * ( p_src[6 * i_stride - 1] - p_src[-1] ) +
4 * ( p_src[7 * i_stride - 1] - p_src[-i_stride - 1] );
/* Scale both gradients: ( 17 * g + 16 ) >> 5. */
i_res0 *= 17;
i_res1 *= 17;
i_res0 = ( i_res0 + 16 ) >> 5;
i_res1 = ( i_res1 + 16 ) >> 5;
/* Base value for pixel (0,0), before the final >> 5:
 * 16 * ( left[7] + top[7] + 1 ) - 3 * ( h + v ). */
i_res3 = 3 * ( i_res0 + i_res1 );
i_res2 = 16 * ( p_src[7 * i_stride - 1] + p_src[-i_stride + 7] + 1 );
i_res = i_res2 - i_res3;
vec8 = __msa_fill_w( i_res0 );
/* vec4: running base of the current row; vec2: per-row increment (v). */
vec4 = __msa_fill_w( i_res );
vec2 = __msa_fill_w( i_res1 );
/* vec5: h * {0,1,2,3} for columns 0..3; vec3: h * 4 to reach columns 4..7. */
vec5 = vec8 * int_multiplier;
vec3 = vec8 * 4;
/* Four iterations, two output rows per iteration. */
for( u_lpcnt = 4; u_lpcnt--; )
{
/* First row of the pair: columns 0..3 in vec0, 4..7 in vec1. */
vec0 = vec5;
vec0 += vec4;
vec1 = vec0 + vec3;
/* Advance the base by one row, then build the second row in vec6/vec7. */
vec6 = vec5;
vec4 += vec2;
vec6 += vec4;
vec7 = vec6 + vec3;
/* >> 5, narrow to 16 bit, clip to 0..255, narrow to bytes. */
SRA_4V( vec0, vec1, vec6, vec7, 5 );
PCKEV_H2_SH( vec1, vec0, vec7, vec6, vec10, vec11 );
CLIP_SH2_0_255( vec10, vec11 );
PCKEV_B2_SH( vec10, vec10, vec11, vec11, vec10, vec11 );
/* The low doubleword of each packed vector holds one row of pixels. */
u_out0 = __msa_copy_s_d( ( v2i64 ) vec10, 0 );
u_out1 = __msa_copy_s_d( ( v2i64 ) vec11, 0 );
SD( u_out0, p_src );
p_src += i_stride;
SD( u_out1, p_src );
p_src += i_stride;
/* Step the base to the first row of the next pair. */
vec4 += vec2;
}
}
/* Plane intra prediction for a 16x16 block, computed in place.
 *
 * p_src:    top-left pixel of the block to fill. Neighbours are read from the
 *           row above (starting at the corner p_src[-i_stride - 1]) and from
 *           the column to the left (p_src[k * i_stride - 1], k = -1..15).
 * i_stride: distance in bytes between consecutive rows.
 *
 * Two gradients (one from the top row, one from the left column) and a base
 * value are derived from the neighbours; every output pixel is then
 * ( base + h * x + v * y ) >> 5, clipped to 0..255. */
static void intra_predict_plane_16x16_msa( uint8_t *p_src, int32_t i_stride )
{
uint8_t u_lpcnt;
int32_t i_res0, i_res1, i_res2, i_res3;
uint64_t u_load0, u_load1;
/* Pairs byte 7 with 8, 6 with 9, ... 0 with 15 of the vector built below. */
v16i8 shf_mask = { 7, 8, 6, 9, 5, 10, 4, 11, 3, 12, 2, 13, 1, 14, 0, 15 };
/* Weights 1..8 applied to the eight pairwise differences. */
v8i16 short_multiplier = { 1, 2, 3, 4, 5, 6, 7, 8 };
/* Column indices 0..3 used to build the per-column horizontal offsets. */
v4i32 int_multiplier = { 0, 1, 2, 3 };
v16u8 p_src_top = { 0 };
v8i16 vec9, vec10;
v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, res_add;
/* Two 8-byte loads from the row above: columns -1..6 and columns 8..15.
 * Column 7 is deliberately not part of the vector. */
u_load0 = LD( p_src - ( i_stride + 1 ) );
u_load1 = LD( p_src - ( i_stride + 1 ) + 9 );
INSERT_D2_UB( u_load0, u_load1, p_src_top );
p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top,
( v16i8 ) p_src_top );
/* Pairwise differences of the shuffled bytes, weighted and summed: the
 * horizontal gradient.
 * NOTE(review): assumes hsub_u_h yields (odd byte - even byte) per pair,
 * which would mirror the scalar left-column sum below - confirm against the
 * MSA reference. */
vec9 = __msa_hsub_u_h( p_src_top, p_src_top );
vec9 *= short_multiplier;
vec8 = __msa_hadd_s_w( vec9, vec9 );
res_add = ( v4i32 ) __msa_hadd_s_d( vec8, vec8 );
/* Add the low words of the two doubleword partial sums. */
i_res0 = __msa_copy_s_w( res_add, 0 ) + __msa_copy_s_w( res_add, 2 );
/* Vertical gradient from the left column: rows 8..15 minus rows 6..-1,
 * weighted 1..8. */
i_res1 = ( p_src[8 * i_stride - 1] - p_src[6 * i_stride - 1] ) +
2 * ( p_src[9 * i_stride - 1] - p_src[5 * i_stride - 1] ) +
3 * ( p_src[10 * i_stride - 1] - p_src[4 * i_stride - 1] ) +
4 * ( p_src[11 * i_stride - 1] - p_src[3 * i_stride - 1] ) +
5 * ( p_src[12 * i_stride - 1] - p_src[2 * i_stride - 1] ) +
6 * ( p_src[13 * i_stride - 1] - p_src[i_stride - 1] ) +
7 * ( p_src[14 * i_stride - 1] - p_src[-1] ) +
8 * ( p_src[15 * i_stride - 1] - p_src[-1 * i_stride - 1] );
/* Scale both gradients: ( 5 * g + 32 ) >> 6. */
i_res0 *= 5;
i_res1 *= 5;
i_res0 = ( i_res0 + 32 ) >> 6;
i_res1 = ( i_res1 + 32 ) >> 6;
/* Base value for pixel (0,0), before the final >> 5:
 * 16 * ( left[15] + top[15] + 1 ) - 7 * ( h + v ). */
i_res3 = 7 * ( i_res0 + i_res1 );
i_res2 = 16 * ( p_src[15 * i_stride - 1] + p_src[-i_stride + 15] + 1 );
i_res2 -= i_res3;
vec8 = __msa_fill_w( i_res0 );
/* vec4: running base of the current row; vec5: per-row increment (v). */
vec4 = __msa_fill_w( i_res2 );
vec5 = __msa_fill_w( i_res1 );
/* vec6: h * 4, the step between groups of four columns;
 * vec7: h * {0,1,2,3} for columns 0..3. */
vec6 = vec8 * 4;
vec7 = vec8 * int_multiplier;
/* One output row per iteration, sixteen rows in total. */
for( u_lpcnt = 16; u_lpcnt--; )
{
/* Columns 0..3, 4..7, 8..11 and 12..15 of the current row. */
vec0 = vec7;
vec0 += vec4;
vec1 = vec0 + vec6;
vec2 = vec1 + vec6;
vec3 = vec2 + vec6;
/* >> 5, narrow to 16 bit, clip to 0..255, then pack to bytes and store. */
SRA_4V( vec0, vec1, vec2, vec3, 5 );
PCKEV_H2_SH( vec1, vec0, vec3, vec2, vec9, vec10 );
CLIP_SH2_0_255( vec9, vec10 );
PCKEV_ST_SB( vec9, vec10, p_src );
p_src += i_stride;
/* Step the base to the next row. */
vec4 += vec5;
}
}
static void intra_predict_dc_4blk_8x8_msa( uint8_t *p_src, int32_t i_stride )
{
uint8_t u_lp_cnt;
uint32_t u_src0, u_src1, u_src3, u_src2 = 0;
uint32_t u_out0, u_out1, u_out2, u_out3;
v16u8 p_src_top;
v8u16 add;
v4u32 sum;
p_src_top = LD_UB( p_src - i_stride );
add = __msa_hadd_u_h( ( v16u8 ) p_src_top, ( v16u8 ) p_src_top );
sum = __msa_hadd_u_w( add, add );
u_src0 = __msa_copy_u_w( ( v4i32 ) sum, 0 );
u_src1 = __msa_copy_u_w( ( v4i32 ) sum, 1 );
for( u_lp_cnt = 0; u_lp_cnt < 4; u_lp_cnt++ )
{
u_src0 += p_src[u_lp_cnt * i_stride - 1];
u_src2 += p_src[( 4 + u_lp_cnt ) * i_stride - 1];
}
u_src0 = ( u_src0 + 4 ) >> 3;
u_src3 = ( u_src1 + u_src2 + 4 ) >> 3;
u_src1 = ( u_src1 + 2 ) >> 2;
u_src2 = ( u_src2 + 2 ) >> 2;
u_out0 = u_src0 * 0x01010101;
u_out1 = u_src1 * 0x01010101;
u_out2 = u_src2 * 0x01010101;
u_out3 = u_src3 * 0x01010101;
for( u_lp_cnt = 4; u_lp_cnt--; )
{
SW( u_out0, p_src );
SW( u_out1, ( p_src + 4 ) );
SW( u_out2, ( p_src + 4 * i_stride ) );
SW( u_out3, ( p_src + 4 * i_stride + 4 ) );
p_src += i_stride;
}
}
static void intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t *p_dst,
int32_t i_dst_stride )
{
uint8_t u_src_val = p_src[15];
uint64_t u_out0, u_out1, u_out2, u_out3;
v16u8 src, vec4, vec5, res0;
v8u16 vec0, vec1, vec2, vec3;
v2i64 res1, res2, res3;
src = LD_UB( p_src );
vec4 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 1 );
vec5 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 2 );
vec5 = ( v16u8 ) __msa_insert_b( ( v16i8 ) vec5, 14, u_src_val );
ILVR_B2_UH( vec5, src, vec4, vec4, vec0, vec1 );
ILVL_B2_UH( vec5, src, vec4, vec4, vec2, vec3 );
HADD_UB4_UH( vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3 );
vec0 += vec1;
vec2 += vec3;
vec0 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec0, 2 );
vec2 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec2, 2 );
res0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec2, ( v16i8 ) vec0 );
res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 );
res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 );
res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 );
u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 );
u_out1 = __msa_copy_u_d( res1, 0 );
u_out2 = __msa_copy_u_d( res2, 0 );
u_out3 = __msa_copy_u_d( res3, 0 );
SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
p_dst += ( 4 * i_dst_stride );
res0 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 4 );
res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 );
res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 );
res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 );
u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 );
u_out1 = __msa_copy_u_d( res1, 0 );
u_out2 = __msa_copy_u_d( res2, 0 );
u_out3 = __msa_copy_u_d( res3, 0 );
SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
}
/* Fill a 16x16 block with the constant produced by __msa_ldi_b( 128 ).
 *
 * p_dst/i_dst_stride: destination block and its row pitch in bytes. */
static void intra_predict_128dc_16x16_msa( uint8_t *p_dst,
                                           int32_t i_dst_stride )
{
    int32_t i_half;
    v16u8 fill = ( v16u8 ) __msa_ldi_b( 128 );

    /* Sixteen identical rows, written eight at a time. */
    for( i_half = 0; i_half < 2; i_half++ )
    {
        uint8_t *p_rows = p_dst + i_half * 8 * i_dst_stride;

        ST_UB8( fill, fill, fill, fill, fill, fill, fill, fill,
                p_rows, i_dst_stride );
    }
}
/* 16x16 DC prediction in the FDEC buffer, averaging both the row above and
 * the column to the left of p_src. */
void x264_intra_predict_dc_16x16_msa( uint8_t *p_src )
{
    uint8_t *p_top = p_src - FDEC_STRIDE;
    uint8_t *p_left = p_src - 1;

    intra_predict_dc_16x16_msa( p_top, p_left, FDEC_STRIDE,
                                p_src, FDEC_STRIDE, 1, 1 );
}
/* 16x16 DC prediction in the FDEC buffer using only the left column
 * (is_above = 0, is_left = 1). */
void x264_intra_predict_dc_left_16x16_msa( uint8_t *p_src )
{
    uint8_t *p_top = p_src - FDEC_STRIDE;
    uint8_t *p_left = p_src - 1;

    intra_predict_dc_16x16_msa( p_top, p_left, FDEC_STRIDE,
                                p_src, FDEC_STRIDE, 0, 1 );
}
/* 16x16 DC prediction in the FDEC buffer using only the row above
 * (is_above = 1, is_left = 0). */
void x264_intra_predict_dc_top_16x16_msa( uint8_t *p_src )
{
    uint8_t *p_top = p_src - FDEC_STRIDE;
    uint8_t *p_left = p_src - 1;

    intra_predict_dc_16x16_msa( p_top, p_left, FDEC_STRIDE,
                                p_src, FDEC_STRIDE, 1, 0 );
}
/* 16x16 constant fill in the FDEC buffer; no neighbours are read. */
void x264_intra_predict_dc_128_16x16_msa( uint8_t *p_src )
{
    const int32_t i_stride = FDEC_STRIDE;

    intra_predict_128dc_16x16_msa( p_src, i_stride );
}
/* 16x16 horizontal prediction in the FDEC buffer: forwards the left
 * neighbour column (p_src - 1, pitch FDEC_STRIDE) as the source. */
void x264_intra_predict_hor_16x16_msa( uint8_t *p_src )
{
    uint8_t *p_left = p_src - 1;

    intra_predict_horiz_16x16_msa( p_left, FDEC_STRIDE, p_src, FDEC_STRIDE );
}
/* 16x16 vertical prediction in the FDEC buffer: forwards the row above
 * (p_src - FDEC_STRIDE) as the source. */
void x264_intra_predict_vert_16x16_msa( uint8_t *p_src )
{
    uint8_t *p_top = p_src - FDEC_STRIDE;

    intra_predict_vert_16x16_msa( p_top, p_src, FDEC_STRIDE );
}
/* 16x16 plane prediction, computed in place in the FDEC buffer. */
void x264_intra_predict_plane_16x16_msa( uint8_t *p_src )
{
    const int32_t i_stride = FDEC_STRIDE;

    intra_predict_plane_16x16_msa( p_src, i_stride );
}
/* 8x8 DC prediction with one average per 4x4 quadrant, computed in place in
 * the FDEC buffer. */
void x264_intra_predict_dc_4blk_8x8_msa( uint8_t *p_src )
{
    const int32_t i_stride = FDEC_STRIDE;

    intra_predict_dc_4blk_8x8_msa( p_src, i_stride );
}
/* 8x8 horizontal prediction in the FDEC buffer: forwards the left neighbour
 * column (p_src - 1, pitch FDEC_STRIDE) as the source. */
void x264_intra_predict_hor_8x8_msa( uint8_t *p_src )
{
    uint8_t *p_left = p_src - 1;

    intra_predict_horiz_8x8_msa( p_left, FDEC_STRIDE, p_src, FDEC_STRIDE );
}
/* 8x8 vertical prediction in the FDEC buffer: forwards the row above
 * (p_src - FDEC_STRIDE) as the source. */
void x264_intra_predict_vert_8x8_msa( uint8_t *p_src )
{
    uint8_t *p_top = p_src - FDEC_STRIDE;

    intra_predict_vert_8x8_msa( p_top, p_src, FDEC_STRIDE );
}
/* 8x8 plane prediction, computed in place in the FDEC buffer. */
void x264_intra_predict_plane_8x8_msa( uint8_t *p_src )
{
    const int32_t i_stride = FDEC_STRIDE;

    intra_predict_plane_8x8_msa( p_src, i_stride );
}
/* 8x8 diagonal-down-left prediction into the FDEC buffer. The edge pixels
 * are taken from the pu_xyz array, starting at index 16. */
void x264_intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    uint8_t *p_edge = pu_xyz + 16;

    intra_predict_ddl_8x8_msa( p_edge, p_src, FDEC_STRIDE );
}
/* 8x8 DC prediction into the FDEC buffer. The edge pixels are taken from the
 * pu_xyz array: 8 bytes from index 16 as the top edge and 8 bytes from
 * index 7 as the left edge. */
void x264_intra_predict_dc_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    uint8_t *p_top = pu_xyz + 16;
    uint8_t *p_left = pu_xyz + 7;

    intra_predict_dc_8x8_msa( p_top, p_left, p_src, FDEC_STRIDE );
}
/* 8x8 horizontal prediction into the FDEC buffer from the pu_xyz edge array.
 * The source starts at index 14 and is walked with a pitch of -1, i.e.
 * towards lower indices. */
void x264_intra_predict_h_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    uint8_t *p_left = pu_xyz + 14;

    intra_predict_horiz_8x8_msa( p_left, -1, p_src, FDEC_STRIDE );
}
/* 8x8 vertical prediction into the FDEC buffer. The top edge is taken from
 * the pu_xyz array, starting at index 16. */
void x264_intra_predict_v_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    uint8_t *p_top = pu_xyz + 16;

    intra_predict_vert_8x8_msa( p_top, p_src, FDEC_STRIDE );
}
/* 4x4 DC prediction in the FDEC buffer, averaging both the row above and the
 * column to the left of p_src. */
void x264_intra_predict_dc_4x4_msa( uint8_t *p_src )
{
    uint8_t *p_top = p_src - FDEC_STRIDE;
    uint8_t *p_left = p_src - 1;

    intra_predict_dc_4x4_msa( p_top, p_left, FDEC_STRIDE,
                              p_src, FDEC_STRIDE, 1, 1 );
}
/* 4x4 horizontal prediction in the FDEC buffer: forwards the left neighbour
 * column (p_src - 1, pitch FDEC_STRIDE) as the source. */
void x264_intra_predict_hor_4x4_msa( uint8_t *p_src )
{
    uint8_t *p_left = p_src - 1;

    intra_predict_horiz_4x4_msa( p_left, FDEC_STRIDE, p_src, FDEC_STRIDE );
}
/* 4x4 vertical prediction in the FDEC buffer: forwards the row above
 * (p_src - FDEC_STRIDE) as the source. */
void x264_intra_predict_vert_4x4_msa( uint8_t *p_src )
{
    uint8_t *p_top = p_src - FDEC_STRIDE;

    intra_predict_vert_4x4_msa( p_top, p_src, FDEC_STRIDE );
}
#endif

Some files were not shown because too many files have changed in this diff Show More