x264 source for verification 2026-05-22

2026-05-22 16:45:04 +08:00
commit 4647f166e5
270 changed files with 166522 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,51 @@
 *~
 *.a
 *.d
 *.diff
 *.orig
 *.rej
 *.dll*
 *.exe
 *.def
 *.lib
 *.pdb
 *.mo
 *.o
 *.patch
 *.pc
 *.pot
 *.so*
 *.dylib
 .*.swp
 .depend
 .DS_Store
 TAGS
 config.h
 config.mak
 config.log
 x264_config.h
 x264
 checkasm
 *.264
 *.h264
 *.2pass
 *.ffindex
 *.avs
 *.mkv
 *.flv
 *.mp4
 *.y4m
 *.yuv
 *.log
 *.mbtree
 *.temp
 *.pyc
 *.pgd
 *.pgc
 .digress_x264
 dataDec.txt
 log.dec
 common/oclobj.h
 x264_lookahead.clbin
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -0,0 +1,339 @@
 # Pipeline stages, listed in the order they run.
 stages: [build, test, release]
 # Per-platform variable maps. Each hidden key (leading dot) exists only to
 # define a YAML anchor that jobs pull in through `variables: *anchor`.
 #   _TRIPLET         - used for configure's --host / --cross-prefix and, in
 #                      the cross builds, as the local install directory name
 #   _ARCH, _OS       - passed to ffmpeg's configure as --arch / --target-os
 #   _PLATFORMSUFFIX  - appended to the x264 / checkasm binary names
 #   _WRAPPER         - command placed in front of checkasm in the test jobs
 #   _XCFLAGS, _XLDFLAGS, _BIN_PATH   - consumed by the macOS build script
 #   _CLANG_TRIPLET, _ANDROID_VERSION - concatenated into the Android CC name
 .variables-debian-amd64: &variables-debian-amd64
    _TRIPLET: ""
    _PLATFORMSUFFIX: ""
    _WRAPPER: ""
 .variables-debian-aarch64: &variables-debian-aarch64
    _TRIPLET: ""
    _PLATFORMSUFFIX: ""
    _WRAPPER: ""
 # Windows targets: binaries get an .exe suffix; win32/win64 tests run via wine.
 .variables-win32: &variables-win32
    _TRIPLET: "i686-w64-mingw32"
    _ARCH: "i686"
    _OS: "mingw32"
    _PLATFORMSUFFIX: ".exe"
    _WRAPPER: "wine"
 .variables-win64: &variables-win64
    _TRIPLET: "x86_64-w64-mingw32"
    _ARCH: "x86_64"
    _OS: "mingw32"
    _PLATFORMSUFFIX: ".exe"
    _WRAPPER: "wine"
 .variables-win-armv7: &variables-win-armv7
    _TRIPLET: "armv7-w64-mingw32"
    _PLATFORMSUFFIX: ".exe"
    _WRAPPER: ""
 .variables-win-aarch64: &variables-win-aarch64
    _TRIPLET: "aarch64-w64-mingw32"
    _PLATFORMSUFFIX: ".exe"
    _WRAPPER: ""
 # macOS targets: the -arch flags select the architecture being built.
 .variables-macos-x86_64: &variables-macos-x86_64
    _TRIPLET: "x86_64-apple-darwin19"
    _ARCH: "x86_64"
    _OS: "darwin"
    _PLATFORMSUFFIX: ""
    _WRAPPER: ""
    _XCFLAGS: "-arch x86_64"
    _XLDFLAGS: "-arch x86_64"
    _BIN_PATH: /Users/videolanci/sandbox/bin
 .variables-macos-arm64: &variables-macos-arm64
    _TRIPLET: "aarch64-apple-darwin19"
    _ARCH: "aarch64"
    _OS: "darwin"
    _PLATFORMSUFFIX: ""
    _WRAPPER: ""
    _XCFLAGS: "-arch arm64"
    _XLDFLAGS: "-arch arm64"
    _BIN_PATH: /Users/videolanci/sandbox/bin
 # Android targets: the clang driver name uses _CLANG_TRIPLET, not _TRIPLET.
 .variables-android-arm: &variables-android-arm
    _TRIPLET: "arm-linux-androideabi"
    _CLANG_TRIPLET: "armv7a-linux-androideabi"
    _ANDROID_VERSION: "21"
    _PLATFORMSUFFIX: ""
    _WRAPPER: ""
 .variables-android-aarch64: &variables-android-aarch64
    _TRIPLET: "aarch64-linux-android"
    _CLANG_TRIPLET: "aarch64-linux-android"
    _ANDROID_VERSION: "21"
    _PLATFORMSUFFIX: ""
    _WRAPPER: ""
 # Base build template (native build). The script:
 #   1. clones ffmpeg (master, depth 1) and installs it into ./local_install
 #      with -static in the extra ldflags and with programs, docs, avdevice,
 #      avfilter, network, encoders and muxers disabled;
 #   2. clones l-smash (master, depth 1) and installs it into the same prefix;
 #   3. configures x264 with PKG_CONFIG_LIBDIR pointing at that prefix and
 #      builds the `x264` and `checkasm` make targets.
 # The resulting binaries and config.log are kept as artifacts for one week.
 # NOTE(review): the artifact list names checkasm8/checkasm10 while make is
 # asked for `checkasm` - presumably that target produces both; confirm.
 .build:
    stage: build
    script: |
        set -x
        LOCAL_INSTALL_DIR=`pwd`/local_install
        export PKG_CONFIG_LIBDIR=${LOCAL_INSTALL_DIR}/lib/pkgconfig
        git clone --depth 1 --branch master https://git.ffmpeg.org/ffmpeg.git ffmpeg
        cd ffmpeg
        ./configure --prefix="${LOCAL_INSTALL_DIR}" --enable-pic --disable-debug --disable-programs --disable-doc --disable-avdevice --disable-avfilter --disable-network --disable-encoders --disable-muxers --extra-ldflags="-static"
        make -j$(getconf _NPROCESSORS_ONLN)
        make -j$(getconf _NPROCESSORS_ONLN) install
        cd ..
        git clone --depth 1 --branch master https://github.com/l-smash/l-smash.git lsmash
        cd lsmash
        ./configure --prefix="${LOCAL_INSTALL_DIR}" --extra-ldflags="-static"
        make -j$(getconf _NPROCESSORS_ONLN)
        make -j$(getconf _NPROCESSORS_ONLN) install
        cd ..
        ./configure --enable-pic --enable-strip --extra-ldflags="-static"
        make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm
    artifacts:
        name: "$CI_PROJECT_PATH_SLUG-$CI_JOB_NAME-$CI_COMMIT_SHORT_SHA"
        paths:
            - x264${_PLATFORMSUFFIX}
            - checkasm8${_PLATFORMSUFFIX}
            - checkasm10${_PLATFORMSUFFIX}
            - config.log
        expire_in: 1 week
 # Native Debian builds; both run the .build script unchanged and differ only
 # in container image, runner tags and variable set.
 build-debian-amd64:
    image: registry.videolan.org/vlc-debian-unstable:20240212151604
    tags: [docker, amd64]
    variables: *variables-debian-amd64
    extends: .build
 build-debian-aarch64:
    image: registry.videolan.org/x264-debian-unstable-aarch64:20211206141032
    tags: [docker, aarch64]
    variables: *variables-debian-aarch64
    extends: .build
 # Windows cross-build template. It extends build-debian-amd64 but replaces
 # the image and the script: ffmpeg, l-smash and x264 are all configured with
 # the "${_TRIPLET}-" cross prefix, and the dependencies are installed into
 # ./${_TRIPLET}. Unlike .build, no -static ldflags are passed here.
 .build-win:
    extends: build-debian-amd64
    image: registry.videolan.org/vlc-debian-llvm-msvcrt:20240212151604
    script: |
        set -x
        LOCAL_INSTALL_DIR=`pwd`/${_TRIPLET}
        export PKGCONFIG=pkg-config
        export PKG_CONFIG_LIBDIR=${LOCAL_INSTALL_DIR}/lib/pkgconfig
        git clone --depth 1 --branch master https://git.ffmpeg.org/ffmpeg.git ffmpeg
        cd ffmpeg
        ./configure --prefix="${LOCAL_INSTALL_DIR}" --enable-cross-compile --arch="${_ARCH}" --target-os="${_OS}" --cross-prefix="${_TRIPLET}-" --enable-pic --disable-debug --disable-programs --disable-doc --disable-avdevice --disable-avfilter --disable-network --disable-encoders --disable-muxers
        make -j$(getconf _NPROCESSORS_ONLN)
        make -j$(getconf _NPROCESSORS_ONLN) install
        cd ..
        git clone --depth 1 --branch master https://github.com/l-smash/l-smash.git lsmash
        cd lsmash
        ./configure --prefix="${LOCAL_INSTALL_DIR}" --target-os="${_TRIPLET}" --cross-prefix="${_TRIPLET}-"
        make -j$(getconf _NPROCESSORS_ONLN)
        make -j$(getconf _NPROCESSORS_ONLN) install
        cd ..
        ./configure --host="${_TRIPLET}" --cross-prefix="${_TRIPLET}-" --enable-pic --enable-strip
        make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm
 # Concrete Windows jobs: same script, different variable set.
 build-win32:
    extends: .build-win
    variables: *variables-win32
 build-win64:
    extends: .build-win
    variables: *variables-win64
 # llvm-mingw cross-build template, used below for the armv7 and aarch64
 # Windows triplets. Only x264 itself is configured and built; this script
 # does not build ffmpeg or l-smash.
 .build-llvm-mingw:
    extends: .build
    image: registry.videolan.org/vlc-debian-llvm-ucrt:20240212151604
    tags:
        - docker
        - amd64
    script: |
        set -x
        PKGCONFIG=pkg-config ./configure --host="${_TRIPLET}" --cross-prefix="${_TRIPLET}-" --enable-pic --enable-strip
        make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm
 build-llvm-mingw-armv7:
    extends: .build-llvm-mingw
    variables: *variables-win-armv7
 build-llvm-mingw-aarch64:
    extends: .build-llvm-mingw
    variables: *variables-win-aarch64
 # macOS build template. Puts ${_BIN_PATH} first in PATH, builds ffmpeg and
 # l-smash with the per-architecture ${_XCFLAGS} / ${_XLDFLAGS}, installs them
 # into ./${_TRIPLET}, then configures x264 with --host="${_TRIPLET}".
 # Neither this template nor .build sets an `image:`.
 .build-macos:
    extends: .build
    script: |
        set -x
        export PATH="${_BIN_PATH}:$PATH"
        LOCAL_INSTALL_DIR=`pwd`/${_TRIPLET}
        export PKG_CONFIG_LIBDIR=${LOCAL_INSTALL_DIR}/lib/pkgconfig
        git clone --depth 1 --branch master https://git.ffmpeg.org/ffmpeg.git ffmpeg
        cd ffmpeg
        ./configure --prefix="${LOCAL_INSTALL_DIR}" --enable-cross-compile --arch="${_ARCH}" --target-os="${_OS}" --extra-cflags="${_XCFLAGS}" --extra-ldflags="${_XLDFLAGS}" --enable-pic --disable-debug --disable-programs --disable-doc --disable-avdevice --disable-avfilter --disable-network --disable-encoders --disable-muxers
        make -j$(getconf _NPROCESSORS_ONLN)
        make -j$(getconf _NPROCESSORS_ONLN) install
        cd ..
        git clone --depth 1 --branch master https://github.com/l-smash/l-smash.git lsmash
        cd lsmash
        ./configure --prefix="${LOCAL_INSTALL_DIR}" --target-os="${_TRIPLET}" --extra-cflags="${_XCFLAGS}" --extra-ldflags="${_XLDFLAGS}"
        make -j$(getconf _NPROCESSORS_ONLN)
        make -j$(getconf _NPROCESSORS_ONLN) install
        cd ..
        ./configure --host="${_TRIPLET}" --enable-pic --enable-strip
        make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm
 build-macos-x86_64:
    extends: .build-macos
    tags:
        - amd64
        - monterey
    variables: *variables-macos-x86_64
 # NOTE(review): the arm64 job carries the same amd64/monterey runner tags as
 # the x86_64 job - presumably a cross-build on an Intel runner; confirm.
 build-macos-arm64:
    extends: .build-macos
    tags:
        - amd64
        - monterey
    variables: *variables-macos-arm64
 # Android build template. Only x264 is configured and built. The compiler
 # name is ${_CLANG_TRIPLET}${_ANDROID_VERSION}-clang, and llvm-ar,
 # llvm-ranlib and llvm-strip are handed to configure through the environment.
 .build-android:
    extends: .build
    image: registry.videolan.org/vlc-debian-android:20241118101328
    tags:
        - docker
        - amd64
    script: |
        set -x
        CC=${_CLANG_TRIPLET}${_ANDROID_VERSION}-clang AR=llvm-ar RANLIB=llvm-ranlib STRIP=llvm-strip PKGCONFIG=pkg-config ./configure --host="${_TRIPLET}" --enable-pic --enable-strip
        make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm
 build-android-arm:
    extends: .build-android
    variables: *variables-android-arm
 build-android-aarch64:
    extends: .build-android
    variables: *variables-android-aarch64
 # Test template: runs checkasm8 and checkasm10, each prefixed by ${_WRAPPER}
 # (which is "wine" in the win32/win64 variable sets and empty elsewhere).
 # Every test job below merges this template with `<<: *test`, extends the
 # matching build job (the source of its image/tags), and uses `dependencies:`
 # to name the build job whose artifacts it needs.
 .test: &test
    stage: test
    script: |
        set -x
        ${_WRAPPER} ./checkasm8${_PLATFORMSUFFIX}
        ${_WRAPPER} ./checkasm10${_PLATFORMSUFFIX}
    artifacts:
        expire_in: 10 minutes
 test-debian-amd64:
    <<: *test
    extends: build-debian-amd64
    dependencies:
        - build-debian-amd64
    variables: *variables-debian-amd64
 test-debian-aarch64:
    <<: *test
    extends: build-debian-aarch64
    dependencies:
        - build-debian-aarch64
    variables: *variables-debian-aarch64
 test-win32:
    <<: *test
    extends: build-win32
    dependencies:
        - build-win32
    variables: *variables-win32
 test-win64:
    <<: *test
    extends: build-win64
    dependencies:
        - build-win64
    variables: *variables-win64
 test-macos-x86_64:
    <<: *test
    extends: build-macos-x86_64
    dependencies:
        - build-macos-x86_64
    variables: *variables-macos-x86_64
 # Emulated aarch64 run: extends build-debian-amd64 (so it keeps that job's
 # runner tags) but overrides the image and script. It takes the artifacts of
 # build-debian-aarch64 and runs both checkasm binaries under qemu-aarch64,
 # once for each of sve128, sve256, sve512, sve1024 and sve2048 switched on.
 test-aarch64-qemu:
    <<: *test
    extends: build-debian-amd64
    image: registry.videolan.org/x264-debian-unstable:20231113190916
    dependencies:
        - build-debian-aarch64
    variables: *variables-debian-amd64
    script: |
        set -x
        for size in 128 256 512 1024 2048; do
            for tool in checkasm8 checkasm10; do
                qemu-aarch64 -cpu max,sve-default-vector-length=256,sve$size=on -L /usr/aarch64-linux-gnu ./$tool
            done
        done
 # Release template: manual job, restricted to the master and stable branches
 # of videolan/x264. It derives _VERSION from the output of ./version.sh
 # (line containing "_VERSION", fields 4 onward, spaces turned into dashes,
 # first double quote removed) and renames the x264 binary to
 # x264-${_VERSION}${_PLATFORMSUFFIX}. The renamed file is kept as an artifact
 # for 10 minutes.
 # Each release job merges this template, extends its build job and depends
 # on that build job's artifacts.
 .release: &release
    stage: release
    script: |
        set -x
        _VERSION=$(./version.sh | grep _VERSION -| cut -d\  -f4-| sed 's, ,-,g' | sed 's,",,')
        mv x264${_PLATFORMSUFFIX} x264-${_VERSION}${_PLATFORMSUFFIX}
    when: manual
    only:
        - master@videolan/x264
        - stable@videolan/x264
    artifacts:
        name: "$CI_PROJECT_PATH_SLUG-$CI_JOB_NAME-$CI_COMMIT_SHORT_SHA"
        paths:
            - x264-*${_PLATFORMSUFFIX}
        expire_in: '10 minutes'
 release-debian-amd64:
    <<: *release
    extends: build-debian-amd64
    dependencies:
        - build-debian-amd64
    variables: *variables-debian-amd64
 release-debian-aarch64:
    <<: *release
    extends: build-debian-aarch64
    dependencies:
        - build-debian-aarch64
    variables: *variables-debian-aarch64
 release-win32:
    <<: *release
    extends: build-win32
    dependencies:
        - build-win32
    variables: *variables-win32
 release-win64:
    <<: *release
    extends: build-win64
    dependencies:
        - build-win64
    variables: *variables-win64
 release-macos-x86_64:
    <<: *release
    extends: build-macos-x86_64
    dependencies:
        - build-macos-x86_64
    variables: *variables-macos-x86_64
 release-macos-arm64:
    <<: *release
    extends: build-macos-arm64
    dependencies:
        - build-macos-arm64
    variables: *variables-macos-arm64
--- a/99
+++ b/99
@@ -0,0 +1,99 @@
 # Contributors to x264
 #
 # The format of this file was inspired by the Linux kernel CREDITS file.
 # Authors are listed alphabetically.
 #
 # The fields are: name (N), email (E), web-address (W), CVS account login (C),
 # PGP key ID and fingerprint (P), description (D), and snail-mail address (S).
 N: Alex Izvorski
 E: aizvorski AT gmail DOT com
 D: x86 asm (sse2)
 N: Alex Wright
 E: alexw0885 AT gmail DOT com
 D: Motion estimation (subpel and mixed refs)
 D: B-RDO
 N: bobololo
 D: Avisynth input
 D: MP4 muxing
 N: Christian Heine
 E: sennindemokrit AT gmx DOT net
 D: x86 asm
 N: David Wolstencroft
 D: Altivec optimizations
 N: Eric Petit
 E: eric.petit AT lapsus DOT org
 C: titer
 D: Altivec asm
 D: BeOS and MacOS X ports.
 S: France
 N: Fiona Glaser
 E: fiona AT x264 DOT com
 D: Maintainer
 D: All areas of encoder analysis and algorithms
 D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc
 D: x86 asm
 S: USA
 N: Gabriel Bouvigne
 E: bouvigne AT mp3-tech DOT org
 D: 2pass VBV
 N: Guillaume Poirier
 E: gpoirier CHEZ mplayerhq POINT hu
 D: Altivec optimizations
 S: Brittany, France
 N: Henrik Gramner
 E: henrik AT gramner DOT com
 D: 4:2:2 chroma subsampling, x86 asm, Windows improvements, bugfixes
 S: Sweden
 N: Laurent Aimar
 E: fenrir AT videolan DOT org
 C: fenrir
 D: Initial import, former maintainer
 D: x86 asm (mmx/mmx2)
 S: France
 N: Loren Merritt
 E: pengvado AT akuvian DOT org
 C: pengvado
 D: Maintainer
 D: All areas of encoder analysis and algorithms
 D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc
 D: Multithreading
 D: x86 asm
 S: USA
 N: Mans Rullgard
 E: mru AT mansr DOT com
 C: mru
 D: Rate control
 S: Southampton, UK
 N: Michael Niedermayer
 E: michaelni AT gmx DOT at
 D: Rate control
 N: Mike Matsnev
 E: mike AT po DOT cs DOT msu DOT su
 D: Matroska muxing
 N: Min Chen
 E: chenm001 AT 163 DOT com
 C: chenm001
 D: Win32/VC 6.0 port
 D: gcc asm to nasm conversion
 S: China
 N: Radek Czyz
 E: radoslaw AT syskin DOT cjb DOT net
 D: Cached motion compensation
--- a/340
+++ b/340
@@ -0,0 +1,340 @@
 		    GNU GENERAL PUBLIC LICENSE
 		       Version 2, June 1991
 Copyright (C) 1989, 1991 Free Software Foundation, Inc.
     59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.
 			    Preamble
  The licenses for most software are designed to take away your
 freedom to share and change it.  By contrast, the GNU General Public
 License is intended to guarantee your freedom to share and change free
 software--to make sure the software is free for all its users.  This
 General Public License applies to most of the Free Software
 Foundation's software and to any other program whose authors commit to
 using it.  (Some other Free Software Foundation software is covered by
 the GNU Library General Public License instead.)  You can apply it to
 your programs, too.
  When we speak of free software, we are referring to freedom, not
 price.  Our General Public Licenses are designed to make sure that you
 have the freedom to distribute copies of free software (and charge for
 this service if you wish), that you receive source code or can get it
 if you want it, that you can change the software or use pieces of it
 in new free programs; and that you know you can do these things.
  To protect your rights, we need to make restrictions that forbid
 anyone to deny you these rights or to ask you to surrender the rights.
 These restrictions translate to certain responsibilities for you if you
 distribute copies of the software, or if you modify it.
  For example, if you distribute copies of such a program, whether
 gratis or for a fee, you must give the recipients all the rights that
 you have.  You must make sure that they, too, receive or can get the
 source code.  And you must show them these terms so they know their
 rights.
  We protect your rights with two steps: (1) copyright the software, and
 (2) offer you this license which gives you legal permission to copy,
 distribute and/or modify the software.
  Also, for each author's protection and ours, we want to make certain
 that everyone understands that there is no warranty for this free
 software.  If the software is modified by someone else and passed on, we
 want its recipients to know that what they have is not the original, so
 that any problems introduced by others will not reflect on the original
 authors' reputations.
  Finally, any free program is threatened constantly by software
 patents.  We wish to avoid the danger that redistributors of a free
 program will individually obtain patent licenses, in effect making the
 program proprietary.  To prevent this, we have made it clear that any
 patent must be licensed for everyone's free use or not licensed at all.
  The precise terms and conditions for copying, distribution and
 modification follow.
 		    GNU GENERAL PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
  0. This License applies to any program or other work which contains
 a notice placed by the copyright holder saying it may be distributed
 under the terms of this General Public License.  The "Program", below,
 refers to any such program or work, and a "work based on the Program"
 means either the Program or any derivative work under copyright law:
 that is to say, a work containing the Program or a portion of it,
 either verbatim or with modifications and/or translated into another
 language.  (Hereinafter, translation is included without limitation in
 the term "modification".)  Each licensee is addressed as "you".
 Activities other than copying, distribution and modification are not
 covered by this License; they are outside its scope.  The act of
 running the Program is not restricted, and the output from the Program
 is covered only if its contents constitute a work based on the
 Program (independent of having been made by running the Program).
 Whether that is true depends on what the Program does.
  1. You may copy and distribute verbatim copies of the Program's
 source code as you receive it, in any medium, provided that you
 conspicuously and appropriately publish on each copy an appropriate
 copyright notice and disclaimer of warranty; keep intact all the
 notices that refer to this License and to the absence of any warranty;
 and give any other recipients of the Program a copy of this License
 along with the Program.
 You may charge a fee for the physical act of transferring a copy, and
 you may at your option offer warranty protection in exchange for a fee.
  2. You may modify your copy or copies of the Program or any portion
 of it, thus forming a work based on the Program, and copy and
 distribute such modifications or work under the terms of Section 1
 above, provided that you also meet all of these conditions:
    a) You must cause the modified files to carry prominent notices
    stating that you changed the files and the date of any change.
    b) You must cause any work that you distribute or publish, that in
    whole or in part contains or is derived from the Program or any
    part thereof, to be licensed as a whole at no charge to all third
    parties under the terms of this License.
    c) If the modified program normally reads commands interactively
    when run, you must cause it, when started running for such
    interactive use in the most ordinary way, to print or display an
    announcement including an appropriate copyright notice and a
    notice that there is no warranty (or else, saying that you provide
    a warranty) and that users may redistribute the program under
    these conditions, and telling the user how to view a copy of this
    License.  (Exception: if the Program itself is interactive but
    does not normally print such an announcement, your work based on
    the Program is not required to print an announcement.)
 These requirements apply to the modified work as a whole.  If
 identifiable sections of that work are not derived from the Program,
 and can be reasonably considered independent and separate works in
 themselves, then this License, and its terms, do not apply to those
 sections when you distribute them as separate works.  But when you
 distribute the same sections as part of a whole which is a work based
 on the Program, the distribution of the whole must be on the terms of
 this License, whose permissions for other licensees extend to the
 entire whole, and thus to each and every part regardless of who wrote it.
 Thus, it is not the intent of this section to claim rights or contest
 your rights to work written entirely by you; rather, the intent is to
 exercise the right to control the distribution of derivative or
 collective works based on the Program.
 In addition, mere aggregation of another work not based on the Program
 with the Program (or with a work based on the Program) on a volume of
 a storage or distribution medium does not bring the other work under
 the scope of this License.
  3. You may copy and distribute the Program (or a work based on it,
 under Section 2) in object code or executable form under the terms of
 Sections 1 and 2 above provided that you also do one of the following:
    a) Accompany it with the complete corresponding machine-readable
    source code, which must be distributed under the terms of Sections
    1 and 2 above on a medium customarily used for software interchange; or,
    b) Accompany it with a written offer, valid for at least three
    years, to give any third party, for a charge no more than your
    cost of physically performing source distribution, a complete
    machine-readable copy of the corresponding source code, to be
    distributed under the terms of Sections 1 and 2 above on a medium
    customarily used for software interchange; or,
    c) Accompany it with the information you received as to the offer
    to distribute corresponding source code.  (This alternative is
    allowed only for noncommercial distribution and only if you
    received the program in object code or executable form with such
    an offer, in accord with Subsection b above.)
 The source code for a work means the preferred form of the work for
 making modifications to it.  For an executable work, complete source
 code means all the source code for all modules it contains, plus any
 associated interface definition files, plus the scripts used to
 control compilation and installation of the executable.  However, as a
 special exception, the source code distributed need not include
 anything that is normally distributed (in either source or binary
 form) with the major components (compiler, kernel, and so on) of the
 operating system on which the executable runs, unless that component
 itself accompanies the executable.
 If distribution of executable or object code is made by offering
 access to copy from a designated place, then offering equivalent
 access to copy the source code from the same place counts as
 distribution of the source code, even though third parties are not
 compelled to copy the source along with the object code.
  4. You may not copy, modify, sublicense, or distribute the Program
 except as expressly provided under this License.  Any attempt
 otherwise to copy, modify, sublicense or distribute the Program is
 void, and will automatically terminate your rights under this License.
 However, parties who have received copies, or rights, from you under
 this License will not have their licenses terminated so long as such
 parties remain in full compliance.
  5. You are not required to accept this License, since you have not
 signed it.  However, nothing else grants you permission to modify or
 distribute the Program or its derivative works.  These actions are
 prohibited by law if you do not accept this License.  Therefore, by
 modifying or distributing the Program (or any work based on the
 Program), you indicate your acceptance of this License to do so, and
 all its terms and conditions for copying, distributing or modifying
 the Program or works based on it.
  6. Each time you redistribute the Program (or any work based on the
 Program), the recipient automatically receives a license from the
 original licensor to copy, distribute or modify the Program subject to
 these terms and conditions.  You may not impose any further
 restrictions on the recipients' exercise of the rights granted herein.
 You are not responsible for enforcing compliance by third parties to
 this License.
  7. If, as a consequence of a court judgment or allegation of patent
 infringement or for any other reason (not limited to patent issues),
 conditions are imposed on you (whether by court order, agreement or
 otherwise) that contradict the conditions of this License, they do not
 excuse you from the conditions of this License.  If you cannot
 distribute so as to satisfy simultaneously your obligations under this
 License and any other pertinent obligations, then as a consequence you
 may not distribute the Program at all.  For example, if a patent
 license would not permit royalty-free redistribution of the Program by
 all those who receive copies directly or indirectly through you, then
 the only way you could satisfy both it and this License would be to
 refrain entirely from distribution of the Program.
 If any portion of this section is held invalid or unenforceable under
 any particular circumstance, the balance of the section is intended to
 apply and the section as a whole is intended to apply in other
 circumstances.
 It is not the purpose of this section to induce you to infringe any
 patents or other property right claims or to contest validity of any
 such claims; this section has the sole purpose of protecting the
 integrity of the free software distribution system, which is
 implemented by public license practices.  Many people have made
 generous contributions to the wide range of software distributed
 through that system in reliance on consistent application of that
 system; it is up to the author/donor to decide if he or she is willing
 to distribute software through any other system and a licensee cannot
 impose that choice.
 This section is intended to make thoroughly clear what is believed to
 be a consequence of the rest of this License.
  8. If the distribution and/or use of the Program is restricted in
 certain countries either by patents or by copyrighted interfaces, the
 original copyright holder who places the Program under this License
 may add an explicit geographical distribution limitation excluding
 those countries, so that distribution is permitted only in or among
 countries not thus excluded.  In such case, this License incorporates
 the limitation as if written in the body of this License.
  9. The Free Software Foundation may publish revised and/or new versions
 of the General Public License from time to time.  Such new versions will
 be similar in spirit to the present version, but may differ in detail to
 address new problems or concerns.
 Each version is given a distinguishing version number.  If the Program
 specifies a version number of this License which applies to it and "any
 later version", you have the option of following the terms and conditions
 either of that version or of any later version published by the Free
 Software Foundation.  If the Program does not specify a version number of
 this License, you may choose any version ever published by the Free Software
 Foundation.
  10. If you wish to incorporate parts of the Program into other free
 programs whose distribution conditions are different, write to the author
 to ask for permission.  For software which is copyrighted by the Free
 Software Foundation, write to the Free Software Foundation; we sometimes
 make exceptions for this.  Our decision will be guided by the two goals
 of preserving the free status of all derivatives of our free software and
 of promoting the sharing and reuse of software generally.
 			    NO WARRANTY
  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
 FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
 OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
 PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
 OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
 TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
 PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
 REPAIR OR CORRECTION.
  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
 WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
 REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
 INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
 OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
 TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
 YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
 PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGES.
 		     END OF TERMS AND CONDITIONS
 	    How to Apply These Terms to Your New Programs
  If you develop a new program, and you want it to be of the greatest
 possible use to the public, the best way to achieve this is to make it
 free software which everyone can redistribute and change under these terms.
  To do so, attach the following notices to the program.  It is safest
 to attach them to the start of each source file to most effectively
 convey the exclusion of warranty; and each file should have at least
 the "copyright" line and a pointer to where the full notice is found.
    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year>  <name of author>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 Also add information on how to contact you by electronic and paper mail.
 If the program is interactive, make it output a short notice like this
 when it starts in an interactive mode:
    Gnomovision version 69, Copyright (C) year  name of author
    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
    This is free software, and you are welcome to redistribute it
    under certain conditions; type `show c' for details.
 The hypothetical commands `show w' and `show c' should show the appropriate
 parts of the General Public License.  Of course, the commands you use may
 be called something other than `show w' and `show c'; they could even be
 mouse-clicks or menu items--whatever suits your program.
 You should also get your employer (if you work as a programmer) or your
 school, if any, to sign a "copyright disclaimer" for the program, if
 necessary.  Here is a sample; alter the names:
  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
  `Gnomovision' (which makes passes at compilers) written by James Hacker.
  <signature of Ty Coon>, 1 April 1989
  Ty Coon, President of Vice
 This General Public License does not permit incorporating your program into
 proprietary programs.  If your program is a subroutine library, you may
 consider it more useful to permit linking proprietary applications with the
 library.  If this is what you want to do, use the GNU Library General
 Public License instead of this License.
--- a/482
+++ b/482
@@ -0,0 +1,482 @@
 # Makefile
 # config.mak provides the build configuration; the config.mak rule further
 # down creates it by running ./configure.
 include config.mak
 # Let make search $(SRCPATH) for these kinds of input files.
 vpath %.c $(SRCPATH)
 vpath %.h $(SRCPATH)
 vpath %.S $(SRCPATH)
 vpath %.asm $(SRCPATH)
 vpath %.rc $(SRCPATH)
 vpath %.manifest $(SRCPATH)
 # Extra flags for profile-guided builds; the fprofiled target passes
 # CFLAGSPROF/LDFLAGSPROF on the command line of its sub-makes.
 CFLAGS += $(CFLAGSPROF)
 LDFLAGS += $(LDFLAGSPROF)
 # Generated files: every object depends on them and "clean" removes them.
 GENERATED =
 # NOTE(review): "default" has no prerequisites here; presumably config.mak
 # supplies them - confirm.
 all: default
 default:
 # Library sources compiled once (plain .o objects).
 SRCS = common/osdep.c common/base.c common/cpu.c common/tables.c \
       encoder/api.c
 # Library sources compiled once per enabled bit depth (-8.o / -10.o objects).
 SRCS_X = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
         common/frame.c common/dct.c common/cabac.c \
         common/common.c common/rectangle.c \
         common/set.c common/quant.c common/deblock.c common/vlc.c \
         common/mvpred.c common/bitstream.c \
         encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
         encoder/set.c encoder/macroblock.c encoder/cabac.c \
         encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
 # Library sources compiled for the 8-bit build only.
 SRCS_8 =
 # Command-line tool sources compiled once.
 SRCCLI = x264.c autocomplete.c input/input.c input/timecode.c input/raw.c \
         input/y4m.c output/raw.c output/matroska.c output/matroska_ebml.c \
         output/flv.c output/flv_bytestream.c filters/filters.c \
         filters/video/video.c filters/video/source.c filters/video/internal.c \
         filters/video/resize.c filters/video/fix_vfr_pts.c \
         filters/video/select_every.c filters/video/crop.c
 # Command-line tool sources compiled once per enabled bit depth.
 SRCCLI_X = filters/video/cache.c filters/video/depth.c
 # Sources that are linked into the shared library only.
 SRCSO =
 # checkasm sources, compiled once per enabled bit depth.
 SRCCHK_X = tools/checkasm.c
 SRCEXAMPLE = example.c
 # Object lists; they start empty and are filled in by the sections below.
 OBJS =
 OBJASM =
 OBJSO =
 OBJCLI =
 OBJCHK =
 OBJCHK_8 =
 OBJCHK_10 =
 OBJEXAMPLE =
 # Contents of config.h, read once; features are detected further down by
 # searching this text for "HAVE_xxx 1".
 CONFIG := $(shell cat config.h)
 # Optional module sources: each one is added only when config.h contains the
 # matching "HAVE_xxx 1" definition.
 # AviSynth input
 ifneq (,$(findstring HAVE_AVS 1, $(CONFIG)))
 SRCCLI += input/avs.c
 endif
 # Threading support (library thread pool and threaded CLI input)
 ifneq (,$(findstring HAVE_THREAD 1, $(CONFIG)))
 SRCS_X   += common/threadpool.c
 SRCCLI_X += input/thread.c
 endif
 # Win32 thread wrapper
 ifneq (,$(findstring HAVE_WIN32THREAD 1, $(CONFIG)))
 SRCS += common/win32thread.c
 endif
 # libavformat input
 ifneq (,$(findstring HAVE_LAVF 1, $(CONFIG)))
 SRCCLI += input/lavf.c
 endif
 # FFMS input
 ifneq (,$(findstring HAVE_FFMS 1, $(CONFIG)))
 SRCCLI += input/ffms.c
 endif
 # MP4 output through GPAC
 ifneq (,$(findstring HAVE_GPAC 1, $(CONFIG)))
 SRCCLI += output/mp4.c
 endif
 # MP4 output through L-SMASH
 ifneq (,$(findstring HAVE_LSMASH 1, $(CONFIG)))
 SRCCLI += output/mp4_lsmash.c
 endif
 ifneq ($(AS),)
 # MMX/SSE optims
 # SRCASM_X collects assembly sources that are assembled once per enabled bit
 # depth; it starts empty here.
 SRCASM_X =
 # 32-bit x86 only sources
 ifeq ($(SYS_ARCH),X86)
 ARCH_X86 = yes
 SRCASM_X += common/x86/dct-32.asm \
            common/x86/pixel-32.asm
 endif
 # 64-bit x86 only sources
 ifeq ($(SYS_ARCH),X86_64)
 ARCH_X86 = yes
 SRCASM_X += common/x86/dct-64.asm \
            common/x86/trellis-64.asm
 endif
 # Sources shared by both x86 flavours (ARCH_X86 is set by either branch above)
 ifdef ARCH_X86
 SRCASM_X += common/x86/bitstream-a.asm \
            common/x86/const-a.asm \
            common/x86/cabac-a.asm \
            common/x86/dct-a.asm \
            common/x86/deblock-a.asm \
            common/x86/mc-a.asm \
            common/x86/mc-a2.asm \
            common/x86/pixel-a.asm \
            common/x86/predict-a.asm \
            common/x86/quant-a.asm
 SRCS_X   += common/x86/mc-c.c \
            common/x86/predict-c.c
 # cpu-a.o is built once, without a bit-depth suffix.
 OBJASM += common/x86/cpu-a.o
 # sad-a is only built for 8-bit, sad16-a only for 10-bit.
 ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
 OBJASM += $(SRCASM_X:%.asm=%-8.o) common/x86/sad-a-8.o
 endif
 ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
 OBJASM += $(SRCASM_X:%.asm=%-10.o) common/x86/sad16-a-10.o
 endif
 # Assembly helper linked into checkasm
 OBJCHK += tools/checkasm-a.o
 endif
 # AltiVec optims
 # Only C sources are added for PPC; they are compiled once per enabled bit
 # depth like the rest of SRCS_X.
 ifeq ($(SYS_ARCH),PPC)
 SRCS_X += common/ppc/dct.c \
           common/ppc/deblock.c \
           common/ppc/mc.c \
           common/ppc/pixel.c \
           common/ppc/predict.c \
           common/ppc/quant.c
 endif
 # NEON optims
 ifeq ($(SYS_ARCH),ARM)
 # Assembly sources, assembled once per enabled bit depth
 SRCASM_X  = common/arm/bitstream-a.S \
            common/arm/dct-a.S \
            common/arm/deblock-a.S \
            common/arm/mc-a.S \
            common/arm/pixel-a.S \
            common/arm/predict-a.S \
            common/arm/quant-a.S
 # C sources accompanying the assembly
 SRCS_X   += common/arm/mc-c.c \
            common/arm/predict-c.c
 # cpu-a.o is built once, without a bit-depth suffix.
 OBJASM += common/arm/cpu-a.o
 ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
 OBJASM += $(SRCASM_X:%.S=%-8.o)
 endif
 ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
 OBJASM += $(SRCASM_X:%.S=%-10.o)
 endif
 # Assembly helper linked into checkasm
 OBJCHK += tools/checkasm-arm.o
 endif
 # AArch64 NEON and SVE/SVE2 optims
 ifeq ($(SYS_ARCH),AARCH64)
 # Baseline assembly sources, assembled once per enabled bit depth
 SRCASM_X  = common/aarch64/bitstream-a.S \
            common/aarch64/cabac-a.S \
            common/aarch64/dct-a.S \
            common/aarch64/deblock-a.S \
            common/aarch64/mc-a.S \
            common/aarch64/pixel-a.S \
            common/aarch64/predict-a.S \
            common/aarch64/quant-a.S
 # Additional sources when config.h defines HAVE_SVE 1
 ifneq ($(findstring HAVE_SVE 1, $(CONFIG)),)
 SRCASM_X += common/aarch64/dct-a-sve.S \
            common/aarch64/deblock-a-sve.S \
            common/aarch64/mc-a-sve.S \
            common/aarch64/pixel-a-sve.S
 endif
 # Additional sources when config.h defines HAVE_SVE2 1
 ifneq ($(findstring HAVE_SVE2 1, $(CONFIG)),)
 SRCASM_X += common/aarch64/dct-a-sve2.S
 endif
 # C sources accompanying the assembly; asm-offsets.c holds the compile-time
 # structure layout checks.
 SRCS_X   += common/aarch64/asm-offsets.c \
            common/aarch64/mc-c.c \
            common/aarch64/predict-c.c
 # No bit-depth independent assembly object on this architecture.
 OBJASM +=
 ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
 OBJASM += $(SRCASM_X:%.S=%-8.o)
 endif
 ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
 OBJASM += $(SRCASM_X:%.S=%-10.o)
 endif
 # Assembly helper linked into checkasm
 OBJCHK += tools/checkasm-aarch64.o
 endif
 # RISCV64 RVV optims
 # All lists in this section are currently empty, so enabling RVV adds no
 # sources or objects to the build.
 ifeq ($(SYS_ARCH),RISCV64)
 ifneq ($(findstring HAVE_RVV 1, $(CONFIG)),)
 SRCASM_X =
 SRCS_X  +=
 OBJASM +=
 ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
 OBJASM += $(SRCASM_X:%.S=%-8.o)
 endif
 ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
 OBJASM += $(SRCASM_X:%.S=%-10.o)
 endif
 OBJCHK +=
 endif
 endif
 # MSA optims: C sources added on MIPS when config.h defines HAVE_MSA 1
 ifeq (MIPS,$(SYS_ARCH))
 ifneq (,$(findstring HAVE_MSA 1, $(CONFIG)))
 SRCS_X += common/mips/dct-c.c common/mips/deblock-c.c common/mips/mc-c.c \
           common/mips/pixel-c.c common/mips/predict-c.c common/mips/quant-c.c
 endif
 endif
 # LOONGARCH optimization
 # Enabled on LOONGARCH when config.h defines HAVE_LSX 1.
 ifeq ($(SYS_ARCH),LOONGARCH)
 ifneq ($(findstring HAVE_LSX 1, $(CONFIG)),)
 # Assembly sources, assembled once per enabled bit depth
 SRCASM_X += common/loongarch/deblock-a.S \
            common/loongarch/sad-a.S \
            common/loongarch/predict-a.S \
            common/loongarch/quant-a.S \
            common/loongarch/mc-a.S \
            common/loongarch/dct-a.S \
            common/loongarch/pixel-a.S
 # C sources accompanying the assembly
 SRCS_X += common/loongarch/predict-c.c \
           common/loongarch/mc-c.c \
           common/loongarch/pixel-c.c
 # No bit-depth independent assembly object on this architecture.
 OBJASM +=
 ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
 OBJASM += $(SRCASM_X:%.S=%-8.o)
 endif
 ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
 OBJASM += $(SRCASM_X:%.S=%-10.o)
 endif
 # Assembly helper linked into checkasm
 OBJCHK += tools/checkasm-loongarch.o
 endif
 endif
 endif
 # Build the bundled getopt implementation unless HAVE_GETOPT_LONG is 1.
 ifneq ($(HAVE_GETOPT_LONG),1)
 SRCCLI += extras/getopt.c
 endif
 # Windows: add resource objects when a resource compiler (RC) is configured.
 # For a shared library build (SONAME set) also add x264dll.c and the DLL
 # flavour of the resource object.
 ifeq ($(SYS),WINDOWS)
 OBJCLI += $(if $(RC), x264res.o)
 ifneq ($(SONAME),)
 SRCSO  += x264dll.c
 OBJSO  += $(if $(RC), x264res.dll.o)
 endif
 endif
 # OpenCL support: common/oclobj.h is generated by piping x264-cl.h and all
 # .cl kernel sources through tools/cltostr.sh, which receives the output file
 # name as its argument. The OpenCL C sources are built for 8-bit only.
 ifeq ($(HAVE_OPENCL),yes)
 common/oclobj.h: common/opencl/x264-cl.h $(wildcard $(SRCPATH)/common/opencl/*.cl)
 	cat $^ | $(SRCPATH)/tools/cltostr.sh $@
 GENERATED += common/oclobj.h
 SRCS_8 += common/opencl.c encoder/slicetype-cl.c
 endif
 # Derive the object lists from the source lists collected above.
 # Sources that are compiled once map to plain .o files.
 OBJS   += $(SRCS:%.c=%.o)
 OBJCLI += $(SRCCLI:%.c=%.o)
 OBJSO  += $(SRCSO:%.c=%.o)
 OBJEXAMPLE += $(SRCEXAMPLE:%.c=%.o)
 # 8-bit build: per-depth sources map to -8.o objects (SRCS_8 is 8-bit only),
 # and "checkasm" gains the 8-bit test binary as a prerequisite.
 ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),)
 OBJS      += $(SRCS_X:%.c=%-8.o) $(SRCS_8:%.c=%-8.o)
 OBJCLI    += $(SRCCLI_X:%.c=%-8.o)
 OBJCHK_8  += $(SRCCHK_X:%.c=%-8.o)
 checkasm: checkasm8$(EXE)
 endif
 # 10-bit build: per-depth sources map to -10.o objects, and "checkasm" gains
 # the 10-bit test binary as a prerequisite.
 ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),)
 OBJS      += $(SRCS_X:%.c=%-10.o)
 OBJCLI    += $(SRCCLI_X:%.c=%-10.o)
 OBJCHK_10 += $(SRCCHK_X:%.c=%-10.o)
 checkasm: checkasm10$(EXE)
 endif
 # Targets that name commands rather than files. The install-* and lib-*
 # targets are listed one by one: make takes the prerequisites of .PHONY as
 # literal target names, and a wildcard such as "install-*" is only globbed
 # against files that already exist, so it never marked install-cli and
 # friends as phony.
 .PHONY: all default fprofiled clean distclean install uninstall cli checkasm etags \
         install-cli install-lib-dev install-lib-static install-lib-shared \
         install-bashcompletion lib-static lib-shared
 # Convenience aliases for the main build products.
 cli: x264$(EXE)
 lib-static: $(LIBX264)
 lib-shared: $(SONAME)
 # Static library: the old archive is deleted first, then rebuilt from all
 # library objects and indexed when RANLIB is set.
 # NOTE(review): $(AR) and $(LD) are joined directly with $@, so they
 # presumably end with their output option - confirm in config.mak.
 $(LIBX264): $(OBJS) $(OBJASM)
 	rm -f $(LIBX264)
 	$(AR)$@ $(OBJS) $(OBJASM)
 	$(if $(RANLIB), $(RANLIB) $@)
 # Shared library: library objects plus the shared-only objects.
 $(SONAME): $(OBJS) $(OBJASM) $(OBJSO)
 	$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
 # The import library has no recipe of its own; it only depends on the shared
 # library.
 $(IMPLIBNAME): $(SONAME)
 # When executables carry a suffix, also accept the suffix-less names.
 ifneq ($(EXE),)
 .PHONY: x264 checkasm8 checkasm10 example
 x264: x264$(EXE)
 checkasm8: checkasm8$(EXE)
 checkasm10: checkasm10$(EXE)
 example: example$(EXE)
 endif
 # Executables
 x264$(EXE): $(OBJCLI) $(CLI_LIBX264)
 	$(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
 checkasm8$(EXE): $(OBJCHK) $(OBJCHK_8) $(LIBX264)
 	$(LD)$@ $(OBJCHK) $(OBJCHK_8) $(LIBX264) $(LDFLAGS)
 checkasm10$(EXE): $(OBJCHK) $(OBJCHK_10) $(LIBX264)
 	$(LD)$@ $(OBJCHK) $(OBJCHK_10) $(LIBX264) $(LDFLAGS)
 example$(EXE): $(OBJEXAMPLE) $(LIBX264)
 	$(LD)$@ $(OBJEXAMPLE) $(LIBX264) $(LDFLAGS)
 # Target-specific flags: library objects get CFLAGSSO, CLI objects CFLAGSCLI.
 $(OBJS) $(OBJSO): CFLAGS += $(CFLAGSSO)
 $(OBJCLI): CFLAGS += $(CFLAGSCLI)
 # Every object depends on the generated files, so they are created first.
 ALLOBJS = $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK) $(OBJCHK_8) $(OBJCHK_10) $(OBJEXAMPLE)
 $(ALLOBJS): $(GENERATED)
 # C sources. The -8/-10 variants compile the same source with the bit depth
 # selected through HIGH_BIT_DEPTH and BIT_DEPTH.
 %.o: %.c
 	$(DEPCMD)
 	$(CC) $(CFLAGS) -c $< $(CC_O) $(DEPFLAGS)
 %-8.o: %.c
 	$(DEPCMD)
 	$(CC) $(CFLAGS) -c $< $(CC_O) $(DEPFLAGS) -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8
 %-10.o: %.c
 	$(DEPCMD)
 	$(CC) $(CFLAGS) -c $< $(CC_O) $(DEPFLAGS) -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10
 # x86 .asm sources. All of them also depend on x86inc.asm and x86util.asm.
 # The per-depth variants set BIT_DEPTH and a depth-specific private_prefix.
 # The strip step runs only when STRIP is set; it is silenced and its errors
 # are ignored.
 %.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
 	$(AS) $(ASFLAGS) -o $@ $< -MD $(@:.o=.d)
 	-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile
 %-8.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
 	$(AS) $(ASFLAGS) -o $@ $< -MD $(@:.o=.d) -DBIT_DEPTH=8 -Dprivate_prefix=x264_8
 	-@ $(if $(STRIP), $(STRIP) -x $@)
 %-10.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
 	$(AS) $(ASFLAGS) -o $@ $< -MD $(@:.o=.d) -DBIT_DEPTH=10 -Dprivate_prefix=x264_10
 	-@ $(if $(STRIP), $(STRIP) -x $@)
 # .S sources, with the same three variants as the C rules.
 %.o: %.S
 	$(DEPCMD)
 	$(AS) $(ASFLAGS) -o $@ $< $(DEPFLAGS)
 	-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile
 %-8.o: %.S
 	$(DEPCMD)
 	$(AS) $(ASFLAGS) -o $@ $< $(DEPFLAGS) -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8
 	-@ $(if $(STRIP), $(STRIP) -x $@)
 %-10.o: %.S
 	$(DEPCMD)
 	$(AS) $(ASFLAGS) -o $@ $< $(DEPFLAGS) -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10
 	-@ $(if $(STRIP), $(STRIP) -x $@)
 # Windows resources; the .dll.o flavour is compiled with -DDLL.
 %.dll.o: %.rc x264.h
 	$(RC) $(RCFLAGS)$@ -DDLL $<
 %.o: %.rc x264.h x264res.manifest
 	$(RC) $(RCFLAGS)$@ $<
 # Create config.mak by running configure when it does not exist yet.
 config.mak:
 	./configure
 # This is kept as a no-op
 depend:
 	@echo "make depend" is handled implicitly now
 # Pull in the .d dependency files that already exist next to the objects;
 # missing ones are skipped by the wildcard and the leading "-".
 -include $(wildcard $(ALLOBJS:.o=.d))
 # Dummy rule to avoid failing, if the dependency files specify dependencies on
 # a removed .h file.
 %.h:
 	@:
 # Objects that are rebuilt for the profile-guided build.
 OBJPROF = $(OBJS) $(OBJSO) $(OBJCLI)
 # These should cover most of the important codepaths
 # (x264 command lines used for the profiling runs below).
 OPT0 = --crf 30 -b1 -m1 -r1 --me dia --no-cabac --direct temporal --ssim --no-weightb
 OPT1 = --crf 16 -b2 -m3 -r3 --me hex --no-8x8dct --direct spatial --no-dct-decimate -t0  --slice-max-mbs 50
 OPT2 = --crf 26 -b4 -m5 -r2 --me hex --cqm jvt --nr 100 --psnr --no-mixed-refs --b-adapt 2 --slice-max-size 1500
 OPT3 = --crf 18 -b3 -m9 -r5 --me umh -t1 -A all --b-pyramid normal --direct auto --no-fast-pskip --no-mbtree
 OPT4 = --crf 22 -b3 -m7 -r4 --me esa -t2 -A all --psy-rd 1.0:1.0 --slices 4
 OPT5 = --frames 50 --crf 24 -b3 -m10 -r3 --me tesa -t2
 OPT6 = --frames 50 -q0 -m9 -r2 --me hex -Aall
 OPT7 = --frames 50 -q0 -m2 -r1 --me hex --no-cabac
 # Profile-guided build. Without VIDS only a usage message is printed.
 # Otherwise: clean, build x264 with the PROF_GEN flags, encode every video
 # with each of OPT0..OPT7 (single-threaded, output discarded), drop the
 # instrumented objects or executable, rebuild with the PROF_USE flags and
 # finally delete the profiling data files.
 ifeq (,$(VIDS))
 fprofiled:
 	@echo 'usage: make fprofiled VIDS="infile1 infile2 ..."'
 	@echo 'where infiles are anything that x264 understands,'
 	@echo 'i.e. YUV with resolution in the filename, y4m, or avisynth.'
 else
 fprofiled: clean
 	$(MAKE) x264$(EXE) CFLAGSPROF="$(PROF_GEN_CC)" LDFLAGSPROF="$(PROF_GEN_LD)"
 	$(foreach V, $(VIDS), $(foreach I, 0 1 2 3 4 5 6 7, ./x264$(EXE) $(OPT$I) --threads 1 $(V) -o $(DEVNULL) ;))
 ifeq ($(COMPILER),CL)
 # Because Visual Studio timestamps the object files within the PGD, it fails to build if they change - only the executable should be deleted
 	rm -f x264$(EXE)
 else
 	rm -f $(OBJPROF)
 endif
 	$(MAKE) CFLAGSPROF="$(PROF_USE_CC)" LDFLAGSPROF="$(PROF_USE_LD)"
 	rm -f $(OBJPROF:%.o=%.gcda) $(OBJPROF:%.o=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc
 endif
 # Remove build products: objects, generated files, libraries, executables,
 # profiling data and dependency files.
 clean:
 	rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(GENERATED) TAGS
 	rm -f $(SONAME) *.a *.lib *.exp *.pdb x264$(EXE) x264_lookahead.clbin
 	rm -f checkasm8$(EXE) checkasm10$(EXE) $(OBJCHK) $(OBJCHK_8) $(OBJCHK_10)
 	rm -f example$(EXE) $(OBJEXAMPLE)
 	rm -f $(OBJPROF:%.o=%.gcda) $(OBJPROF:%.o=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc
 	rm -f $(ALLOBJS:%.o=%.d)
 # Like clean, and additionally remove the configuration files and the
 # conftest* leftovers.
 distclean: clean
 	rm -f config.mak x264_config.h config.h config.log x264.pc x264.def
 	rm -rf conftest*
 # Installation targets. All destinations are prefixed with $(DESTDIR).
 # Command-line tool
 install-cli: cli
 	$(INSTALL) -d $(DESTDIR)$(bindir)
 	$(INSTALL) x264$(EXE) $(DESTDIR)$(bindir)
 # Headers and pkg-config file, shared by the static and shared installs
 install-lib-dev:
 	$(INSTALL) -d $(DESTDIR)$(includedir)
 	$(INSTALL) -d $(DESTDIR)$(libdir)/pkgconfig
 	$(INSTALL) -m 644 $(SRCPATH)/x264.h x264_config.h $(DESTDIR)$(includedir)
 	$(INSTALL) -m 644 x264.pc $(DESTDIR)$(libdir)/pkgconfig
 # Static library; the installed copy is indexed again when RANLIB is set.
 install-lib-static: lib-static install-lib-dev
 	$(INSTALL) -d $(DESTDIR)$(libdir)
 	$(INSTALL) -m 644 $(LIBX264) $(DESTDIR)$(libdir)
 	$(if $(RANLIB), $(RANLIB) $(DESTDIR)$(libdir)/$(LIBX264))
 # Shared library. With an import library, the shared library goes to bindir
 # and the import library to libdir; otherwise the shared library goes to
 # libdir together with a libx264.$(SOSUFFIX) symlink pointing at it.
 install-lib-shared: lib-shared install-lib-dev
 	$(INSTALL) -d $(DESTDIR)$(libdir)
 ifneq ($(IMPLIBNAME),)
 	$(INSTALL) -d $(DESTDIR)$(bindir)
 	$(INSTALL) -m 755 $(SONAME) $(DESTDIR)$(bindir)
 	$(INSTALL) -m 644 $(IMPLIBNAME) $(DESTDIR)$(libdir)
 else ifneq ($(SONAME),)
 	ln -f -s $(SONAME) $(DESTDIR)$(libdir)/libx264.$(SOSUFFIX)
 	$(INSTALL) -m 755 $(SONAME) $(DESTDIR)$(libdir)
 endif
 # Bash completion script, installed only when BASHCOMPLETIONSDIR is set.
 install-bashcompletion:
 ifneq ($(BASHCOMPLETIONSDIR),)
 	$(INSTALL) -d $(DESTDIR)$(BASHCOMPLETIONSDIR)
 	$(INSTALL) -m 644 $(SRCPATH)/tools/bash-autocomplete.sh $(DESTDIR)$(BASHCOMPLETIONSDIR)/x264
 endif
 # Remove the files placed by the install-* targets: headers, static library,
 # command-line tool, pkg-config file, shared/import library and the bash
 # completion script.
 # install-lib-static installs $(LIBX264), whose name is not necessarily
 # libx264.a, so the static library is also removed under its configured
 # name. That line expands to nothing when LIBX264 is empty.
 uninstall:
 	rm -f $(DESTDIR)$(includedir)/x264.h $(DESTDIR)$(includedir)/x264_config.h $(DESTDIR)$(libdir)/libx264.a
 	$(if $(LIBX264), rm -f $(DESTDIR)$(libdir)/$(LIBX264))
 	rm -f $(DESTDIR)$(bindir)/x264$(EXE) $(DESTDIR)$(libdir)/pkgconfig/x264.pc
 ifneq ($(IMPLIBNAME),)
 	rm -f $(DESTDIR)$(bindir)/$(SONAME) $(DESTDIR)$(libdir)/$(IMPLIBNAME)
 else ifneq ($(SONAME),)
 	rm -f $(DESTDIR)$(libdir)/$(SONAME) $(DESTDIR)$(libdir)/libx264.$(SOSUFFIX)
 endif
 ifneq ($(BASHCOMPLETIONSDIR),)
 	rm -f $(DESTDIR)$(BASHCOMPLETIONSDIR)/x264
 endif
 # Run etags over the library sources (the TAGS file is removed by "clean").
 etags TAGS:
 	etags $(SRCS) $(SRCS_X) $(SRCS_8)
--- a/autocomplete.c
+++ b/autocomplete.c
@@ -0,0 +1,408 @@
 /*****************************************************************************
 * autocomplete: x264cli shell autocomplete
 *****************************************************************************
 * Copyright (C) 2018-2025 x264 project
 *
 * Authors: Henrik Gramner <henrik@gramner.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "x264cli.h"
 #include "input/input.h"
 #if HAVE_LAVF
 #undef DECLARE_ALIGNED
 #include <libavformat/avformat.h>
 #include <libavutil/pixdesc.h>
 #endif
 /* Values suggested for --level. NULL-terminated. */
 static const char * const level_names[] =
 {
    "1", "1.1", "1.2", "1.3", "1b",
    "2", "2.1", "2.2",
    "3", "3.1", "3.2",
    "4", "4.1", "4.2",
    "5", "5.1", "5.2",
    "6", "6.1", "6.2",
    NULL
 };
 /* Options requiring a value for which we provide suggestions.
  * NULL-terminated; the entries themselves are offered as completions when an
  * option name is being typed. */
 static const char * const opts_suggest[] =
 {
    "--alternative-transfer",
    "--aq-mode",
    "--asm",
    "--avcintra-class",
    "--avcintra-flavor",
    "--b-adapt",
    "--b-pyramid",
    "--colormatrix",
    "--colorprim",
    "--cqm",
    "--demuxer",
    "--direct",
    "--frame-packing",
    "--input-csp",
    "--input-fmt",
    "--input-range",
    "--level",
    "--log-level",
    "--me",
    "--muxer",
    "--nal-hrd",
    "--output-csp",
    "--overscan",
    "--pass", "-p",
    "--preset",
    "--profile",
    "--pulldown",
    "--range",
    "--subme", "-m",
    "--transfer",
    "--trellis", "-t",
    "--tune",
    "--videoformat",
    "--weightp",
    NULL
 };
 /* Options requiring a value for which we don't provide suggestions.
  * NULL-terminated. When the previous word is one of these and no dedicated
  * handler matched it, x264_cli_autocomplete() prints no suggestions. */
 static const char * const opts_nosuggest[] =
 {
    "--b-bias",
    "--bframes", "-b",
    "--deblock", "-f",
    "--bitrate", "-B",
    "--chroma-qp-offset",
    "--chromaloc",
    "--cplxblur",
    "--cqm4",
    "--cqm4i",
    "--cqm4ic",
    "--cqm4iy",
    "--cqm4p",
    "--cqm4pc",
    "--cqm4py",
    "--cqm8",
    "--cqm8i",
    "--cqm8p",
    "--crf",
    "--crf-max",
    "--crop-rect",
    "--deadzone-inter",
    "--deadzone-intra",
    "--fps",
    "--frames",
    "--input-depth",
    "--input-res",
    "--ipratio",
    "--keyint", "-I",
    "--lookahead-threads",
    "--mastering-display",
    "--cll",
    "--merange",
    "--min-keyint", "-i",
    "--mvrange",
    "--mvrange-thread",
    "--nr",
    "--opencl-device",
    "--output-depth",
    "--partitions", "-A",
    "--pbratio",
    "--psy-rd",
    "--qblur",
    "--qcomp",
    "--qp", "-q",
    "--qpmax",
    "--qpmin",
    "--qpstep",
    "--ratetol",
    "--ref", "-r",
    "--rc-lookahead",
    "--sar",
    "--scenecut",
    "--seek",
    "--slices",
    "--slices-max",
    "--slice-max-size",
    "--slice-max-mbs",
    "--slice-min-mbs",
    "--sps-id",
    "--sync-lookahead",
    "--threads",
    "--timebase",
    "--vbv-bufsize",
    "--vbv-init",
    "--vbv-maxrate",
    "--video-filter", "--vf",
    "--zones",
    NULL
 };
 /* Options requiring a filename. NULL-terminated. When the previous word is
  * one of these, x264_cli_autocomplete() returns 1 so the shell falls back to
  * its own filename completion. */
 static const char * const opts_filename[] =
 {
    "--cqmfile",
    "--dump-yuv",
    "--index",
    "--opencl-clbin",
    "--output", "-o",
    "--qpfile",
    "--stats",
    "--tcfile-in",
    "--tcfile-out",
    NULL
 };
 /* Options without an associated value. NULL-terminated; only used as a
  * source of option-name suggestions. */
 static const char * const opts_standalone[] =
 {
    "--8x8dct",
    "--aud",
    "--bff",
    "--bluray-compat",
    "--cabac",
    "--constrained-intra",
    "--cpu-independent",
    "--dts-compress",
    "--fake-interlaced",
    "--fast-pskip",
    "--filler",
    "--force-cfr",
    "--mbtree",
    "--mixed-refs",
    "--no-8x8dct",
    "--no-asm",
    "--no-cabac",
    "--no-chroma-me",
    "--no-dct-decimate",
    "--no-deblock",
    "--no-fast-pskip",
    "--no-mbtree",
    "--no-mixed-refs",
    "--no-progress",
    "--no-psy",
    "--no-scenecut",
    "--no-weightb",
    "--non-deterministic",
    "--open-gop",
    "--opencl",
    "--pic-struct",
    "--psnr",
    "--quiet",
    "--sliced-threads",
    "--slow-firstpass",
    "--ssim",
    "--stitchable",
    "--tff",
    "--thread-input",
    "--verbose", "-v",
    "--weightb",
    NULL
 };
 /* Options which shouldn't be suggested in combination with other options.
  * NULL-terminated. They are only offered while the previous word is empty,
  * and nothing is suggested after one of them. */
 static const char * const opts_special[] =
 {
    "--fullhelp",
    "--help", "-h",
    "--longhelp",
    "--version",
    NULL
 };
 /* Return 1 if the non-empty string s is equal to an entry of the
  * NULL-terminated list, 0 otherwise. An empty s never matches. */
 static int list_contains( const char * const *list, const char *s )
 {
    if( !*s )
        return 0;
    while( *list )
    {
        if( strcmp( *list, s ) == 0 )
            return 1;
        list++;
    }
    return 0;
 }
 /* Print s followed by a space if it is non-empty and its first cur_len
  * characters match cur. */
 static void suggest( const char *s, const char *cur, int cur_len )
 {
    if( !s || !*s )
        return;
    if( strncmp( s, cur, cur_len ) == 0 )
        printf( "%s ", s );
 }
 /* Like suggest(), but the prefix comparison ignores case and the suggestion
  * is printed with the letters 'A'..'Z' converted to lower case. */
 static void suggest_lower( const char *s, const char *cur, int cur_len )
 {
    if( !s || !*s || strncasecmp( s, cur, cur_len ) )
        return;
    while( *s )
    {
        int c = *s++;
        if( c >= 'A' && c <= 'Z' )
            c |= 0x20;
        putchar( c );
    }
    putchar( ' ' );
 }
 /* Suggest every integer from start to end (inclusive), formatted in decimal,
  * that matches the current prefix. */
 static void suggest_num_range( int start, int end, const char *cur, int cur_len )
 {
    char num[16];
    int i = start;
    while( i <= end )
    {
        snprintf( num, sizeof( num ), "%d", i );
        suggest( num, cur, cur_len );
        i++;
    }
 }
 #if HAVE_LAVF
 /* Suggest each token in a string separated by delimiters.
  * Tokens followed by a delimiter are printed directly when they are non-empty,
  * at least cur_len long and match the prefix; the remainder after the last
  * delimiter is handed to suggest(). */
 static void suggest_token( const char *s, int delim, const char *cur, int cur_len )
 {
    if( !s || !*s )
        return;
    const char *tok_end = strchr( s, delim );
    while( tok_end )
    {
        int tok_len = tok_end - s;
        if( tok_len && tok_len >= cur_len && !strncmp( s, cur, cur_len ) )
            printf( "%.*s ", tok_len, s );
        s = tok_end + 1;
        tok_end = strchr( s, delim );
    }
    suggest( s, cur, cur_len );
 }
 #endif
 /* Helpers for x264_cli_autocomplete(). They rely on the locals prev, cur and
  * cur_len being in scope at the point of use. */
 /* OPT/OPT2 open an "else if" branch that is taken when prev equals the given
  * option name (or either of two names). */
 #define OPT( opt ) else if( !strcmp( prev, opt ) )
 #define OPT2( opt1, opt2 ) else if( !strcmp( prev, opt1 ) || !strcmp( prev, opt2 ) )
 /* True if prev is listed in the opts_<type> table. */
 #define OPT_TYPE( type ) list_contains( opts_##type, prev )
 /* From here on the suggest helpers are shadowed by macros that pass cur and
  * cur_len implicitly. */
 #define suggest( s ) suggest( s, cur, cur_len )
 #define suggest_lower( s ) suggest_lower( s, cur, cur_len )
 /* Suggest every entry of a NULL-terminated string list. */
 #define suggest_list( list ) for( const char * const *s = list; *s; s++ ) suggest( *s )
 #define suggest_num_range( start, end ) suggest_num_range( start, end, cur, cur_len )
 #define suggest_token( s, delim ) suggest_token( s, delim, cur, cur_len )
 /* Print completion candidates for the word being typed.
  *
  * prev is compared against option names to decide what kind of value is
  * expected; cur is the prefix that every printed candidate must match.
  * Candidates are written to stdout separated by spaces and terminated by a
  * newline.
  *
  * Returns 0 after printing (possibly nothing but the newline), or 1 without
  * printing anything to request the default shell filename completion. */
 int x264_cli_autocomplete( const char *prev, const char *cur )
 {
    int cur_len = strlen( cur );
    /* Dummy head of the chain so that every OPT() below can start with "else if". */
    if( 0 );
    OPT( "--alternative-transfer" )
        suggest_list( x264_transfer_names );
    OPT( "--aq-mode" )
        suggest_num_range( 0, 3 );
    OPT( "--asm" )
        for( const x264_cpu_name_t *cpu = x264_cpu_names; cpu->flags; cpu++ )
            suggest_lower( cpu->name );
    OPT( "--avcintra-class" )
        suggest_list( x264_avcintra_class_names );
    OPT( "--avcintra-flavor" )
        suggest_list( x264_avcintra_flavor_names );
    OPT( "--b-adapt" )
        suggest_num_range( 0, 2 );
    OPT( "--b-pyramid" )
        suggest_list( x264_b_pyramid_names );
    OPT( "--colormatrix" )
        suggest_list( x264_colmatrix_names );
    OPT( "--colorprim" )
        suggest_list( x264_colorprim_names );
    OPT( "--cqm" )
        suggest_list( x264_cqm_names );
    OPT( "--demuxer" )
        suggest_list( x264_demuxer_names );
    OPT( "--direct" )
        suggest_list( x264_direct_pred_names );
    OPT( "--frame-packing" )
        suggest_num_range( 0, 7 );
    OPT( "--input-csp" )
    {
        /* The CLI's own colorspace names, plus libavutil's pixel format names when available. */
        for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ )
            suggest( x264_cli_csps[i].name );
 #if HAVE_LAVF
        for( const AVPixFmtDescriptor *d = NULL; (d = av_pix_fmt_desc_next( d )); )
            suggest( d->name );
 #endif
    }
    OPT( "--input-fmt" )
    {
        /* Without libavformat nothing is suggested here. */
 #if HAVE_LAVF
        void *i = NULL;
        /* A demuxer name may hold several comma-separated names; suggest each one. */
        for( const AVInputFormat *f; (f = av_demuxer_iterate( &i )); )
            suggest_token( f->name, ',' );
 #endif
    }
    OPT( "--input-range" )
        suggest_list( x264_range_names );
    OPT( "--level" )
        suggest_list( level_names );
    OPT( "--log-level" )
        suggest_list( x264_log_level_names );
    OPT( "--me" )
        suggest_list( x264_motion_est_names );
    OPT( "--muxer" )
        suggest_list( x264_muxer_names );
    OPT( "--nal-hrd" )
        suggest_list( x264_nal_hrd_names );
    OPT( "--output-csp" )
        suggest_list( x264_output_csp_names );
    OPT( "--output-depth" )
    {
        /* Only the bit depths enabled in this build are suggested. */
 #if HAVE_BITDEPTH8
        suggest( "8" );
 #endif
 #if HAVE_BITDEPTH10
        suggest( "10" );
 #endif
    }
    OPT( "--overscan" )
        suggest_list( x264_overscan_names );
    OPT2( "--partitions", "-A" )
        suggest_list( x264_partition_names );
    OPT2( "--pass", "-p" )
        suggest_num_range( 1, 3 );
    OPT( "--preset" )
        suggest_list( x264_preset_names );
    OPT( "--profile" )
        suggest_list( x264_valid_profile_names );
    OPT( "--pulldown" )
        suggest_list( x264_pulldown_names );
    OPT( "--range" )
        suggest_list( x264_range_names );
    OPT2( "--subme", "-m" )
        suggest_num_range( 0, 11 );
    OPT( "--transfer" )
        suggest_list( x264_transfer_names );
    OPT2( "--trellis", "-t" )
        suggest_num_range( 0, 2 );
    OPT( "--tune" )
        suggest_list( x264_tune_names );
    OPT( "--videoformat" )
        suggest_list( x264_vidformat_names );
    OPT( "--weightp" )
        suggest_num_range( 0, 2 );
    /* prev is not an option handled above. Nothing is suggested after options
     * listed in opts_nosuggest or opts_special. */
    else if( !OPT_TYPE( nosuggest ) && !OPT_TYPE( special ) )
    {
        /* A filename is expected after a filename option, and also whenever
         * cur does not start with "--". */
        if( OPT_TYPE( filename ) || strncmp( cur, "--", 2 ) )
            return 1; /* Fall back to default shell filename autocomplete. */
        /* Suggest options. */
        suggest_list( opts_suggest );
        suggest_list( opts_nosuggest );
        suggest_list( opts_filename );
        suggest_list( opts_standalone );
        /* Only suggest special options if no other options have been specified. */
        if( !*prev )
            suggest_list( opts_special );
    }
    putchar( '\n' );
    return 0;
 }
--- a/common/aarch64/asm-offsets.c
+++ b/common/aarch64/asm-offsets.c
@@ -0,0 +1,56 @@
 /*****************************************************************************
 * asm-offsets.c: check asm offsets for aarch64
 *****************************************************************************
 * Copyright (C) 2014-2025 x264 project
 *
 * Authors: Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "common/common.h"
 #include "asm-offsets.h"
 /* Compile-time assertion: declares an int array named assert_<name> whose
  * size is 1 when x is true and -1 (a compile error) when x is false. */
 #define STATIC_ASSERT(name, x) int assert_##name[2 * !!(x) - 1]
 /* Check that member m of struct s is located at byte offset o. The assertion
  * is wrapped in a struct declaration named check_<s>_<m>. */
 #define X264_CHECK_OFFSET(s, m, o) struct check_##s##_##m \
 { \
    STATIC_ASSERT(offset_##m, offsetof(s, m) == o); \
 }
 /* Check that member b of struct s starts exactly sizeof(type) bytes after
  * member a, i.e. that b directly follows an a of the given type. */
 #define X264_CHECK_REL_OFFSET(s, a, type, b) struct check_##s##_##a##_##b \
 { \
    STATIC_ASSERT(rel_offset_##a##_##b, offsetof(s, a) + sizeof(type) == offsetof(s, b)); \
 }
 /* Verify that the CABAC_* constants from asm-offsets.h match the actual
  * layout of x264_cabac_t. */
 X264_CHECK_OFFSET(x264_cabac_t, i_low,               CABAC_I_LOW);
 X264_CHECK_OFFSET(x264_cabac_t, i_range,             CABAC_I_RANGE);
 X264_CHECK_OFFSET(x264_cabac_t, i_queue,             CABAC_I_QUEUE);
 X264_CHECK_OFFSET(x264_cabac_t, i_bytes_outstanding, CABAC_I_BYTES_OUTSTANDING);
 X264_CHECK_OFFSET(x264_cabac_t, p_start,             CABAC_P_START);
 X264_CHECK_OFFSET(x264_cabac_t, p,                   CABAC_P);
 X264_CHECK_OFFSET(x264_cabac_t, p_end,               CABAC_P_END);
 X264_CHECK_OFFSET(x264_cabac_t, f8_bits_encoded,     CABAC_F8_BITS_ENCODED);
 X264_CHECK_OFFSET(x264_cabac_t, state,               CABAC_STATE);
 // The aarch64 asm makes the following additional assumptions about the
 // x264_cabac_t memory layout: i_range directly follows i_low, and
 // i_bytes_outstanding directly follows i_queue (each one int apart).
 X264_CHECK_REL_OFFSET(x264_cabac_t, i_low,    int, i_range);
 X264_CHECK_REL_OFFSET(x264_cabac_t, i_queue,  int, i_bytes_outstanding);
--- a/common/aarch64/asm-offsets.h
+++ b/common/aarch64/asm-offsets.h
@@ -0,0 +1,39 @@
 /*****************************************************************************
 * asm-offsets.h: asm offsets for aarch64
 *****************************************************************************
 * Copyright (C) 2014-2025 x264 project
 *
 * Authors: Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_AARCH64_ASM_OFFSETS_H
 #define X264_AARCH64_ASM_OFFSETS_H
 /* Byte offsets of x264_cabac_t members. asm-offsets.c checks each value
  * against offsetof() at compile time, so these must be updated together with
  * the struct. */
 #define CABAC_I_LOW                 0x00
 #define CABAC_I_RANGE               0x04
 #define CABAC_I_QUEUE               0x08
 #define CABAC_I_BYTES_OUTSTANDING   0x0c
 #define CABAC_P_START               0x10
 #define CABAC_P                     0x18
 #define CABAC_P_END                 0x20
 /* NOTE(review): 0x28..0x2f is not covered by any constant here; presumably
  * padding or a member the asm does not access - confirm against x264_cabac_t. */
 #define CABAC_F8_BITS_ENCODED       0x30
 #define CABAC_STATE                 0x34
 #endif
--- a/common/aarch64/asm.S
+++ b/common/aarch64/asm.S
@@ -0,0 +1,291 @@
 /*****************************************************************************
 * asm.S: AArch64 utility macros
 *****************************************************************************
 * Copyright (C) 2008-2025 x264 project
 *
 * Authors: Mans Rullgard <mans@mansr.com>
 *          David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "config.h"
 /* Two-level token pasting: JOIN macro-expands both arguments first, then
 * GLUE concatenates the results into a single token. */
 #define GLUE(lhs, rhs) lhs ## rhs
 #define JOIN(lhs, rhs) GLUE(lhs, rhs)
 /* BASE is the stem of every x264 symbol and SYM_PREFIX the bare symbol
 * prefix; both gain a leading underscore when PREFIX is defined. */
 #if !defined(PREFIX)
 #   define BASE x264_
 #   define SYM_PREFIX
 #else
 #   define BASE _x264_
 #   define SYM_PREFIX _
 #endif
 /* EXTERN_ASM is the stem used for exported assembly functions: BASE alone,
 * or BASE followed by the value of BIT_DEPTH and an underscore. */
 #if !defined(BIT_DEPTH)
 #   define EXTERN_ASM BASE
 #else
 #   define EXTERN_ASM JOIN(JOIN(BASE, BIT_DEPTH), _)
 #endif
 /* Name builders: X() -> EXTERN_ASM stem, X264() -> BASE stem,
 * EXT() -> SYM_PREFIX only. */
 #define X(sym) JOIN(EXTERN_ASM, sym)
 #define X264(sym) JOIN(BASE, sym)
 #define EXT(sym) JOIN(SYM_PREFIX, sym)
 /* Line-prefix switches. Each one expands to nothing when the feature is
 * available and to a lone '#' otherwise; NOTE(review): the '#' presumably
 * makes the assembler treat the rest of that line as a comment - confirm
 * for every supported assembler. */
 #if defined(__ELF__)
 #   define ELF
 #else
 #   define ELF  #
 #endif
 #if defined(__MACH__)
 #   define MACH
 #else
 #   define MACH #
 #endif
 #if !HAVE_AS_FUNC
 #   define FUNC #
 #else
 #   define FUNC
 #endif
 /* Baseline architecture level handed to the assembler. AS_ARCH_LEVEL is not
 * defined in this file; NOTE(review): presumably it comes from config.h -
 * confirm. */
        .arch AS_ARCH_LEVEL
 /* For each optional extension below, ENABLE_x and DISABLE_x expand to the
 * matching .arch_extension directive when the corresponding
 * HAVE_AS_ARCHEXT_..._DIRECTIVE macro is non-zero, and to nothing otherwise,
 * so they can be used unconditionally in the individual .S files. */
 #if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE
 #define ENABLE_DOTPROD  .arch_extension dotprod
 #define DISABLE_DOTPROD .arch_extension nodotprod
 #else
 #define ENABLE_DOTPROD
 #define DISABLE_DOTPROD
 #endif
 #if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE
 #define ENABLE_I8MM  .arch_extension i8mm
 #define DISABLE_I8MM .arch_extension noi8mm
 #else
 #define ENABLE_I8MM
 #define DISABLE_I8MM
 #endif
 #if HAVE_AS_ARCHEXT_SVE_DIRECTIVE
 #define ENABLE_SVE  .arch_extension sve
 #define DISABLE_SVE .arch_extension nosve
 #else
 #define ENABLE_SVE
 #define DISABLE_SVE
 #endif
 #if HAVE_AS_ARCHEXT_SVE2_DIRECTIVE
 #define ENABLE_SVE2  .arch_extension sve2
 #define DISABLE_SVE2 .arch_extension nosve2
 #else
 #define ENABLE_SVE2
 #define DISABLE_SVE2
 #endif
 /* If we do support the .arch_extension directives, disable support for all
 * the extensions that we may use, in case they were implicitly enabled by
 * the .arch level. This makes it clear if we try to assemble an instruction
 * from an unintended extension set; we only allow assembling such
 * instructions within regions where we explicitly enable those extensions. */
 DISABLE_DOTPROD
 DISABLE_I8MM
 DISABLE_SVE
 DISABLE_SVE2
 /* function name, export=0, align=2
 *
 * Opens an assembly function: switches to .text, applies .align with the
 * given align argument and emits the entry label.
 *   export=0  the label is \name and stays file-local.
 *   export=1  the label is EXTERN_ASM\name and is declared .global.
 * The ELF-prefixed lines add the symbol type and size, the FUNC-prefixed
 * lines add .func / .endfunc; both prefixes are defined earlier in this file.
 *
 * As a side effect the macro defines a one-shot endfunc macro that closes the
 * function (symbol size, .endfunc) and then purges itself, so that every use
 * of function can define a fresh endfunc. Each function must therefore be
 * terminated with endfunc before the next one is opened. */
 .macro  function name, export=0, align=2
    .macro endfunc
 .if \export
 ELF     .size   EXTERN_ASM\name, . - EXTERN_ASM\name
 .else
 ELF     .size   \name, . - \name
 .endif
 FUNC    .endfunc
        .purgem endfunc
    .endm
        .text
        .align          \align
    .if \export
        .global EXTERN_ASM\name
 ELF     .type   EXTERN_ASM\name, %function
 FUNC    .func   EXTERN_ASM\name
 EXTERN_ASM\name:
    .else
 ELF     .type   \name, %function
 FUNC    .func   \name
 \name:
    .endif
 .endm
 /* const name, align=2
 *
 * Opens a read-only data object: selects .rodata on ELF targets or
 * .const_data on Mach-O targets, applies .align with the given align
 * argument and emits the label \name. The data directives follow the macro
 * invocation and are closed with endconst.
 *
 * endconst is defined here as a one-shot macro: it emits the ELF symbol size
 * and then purges itself so that the next const can define it again. */
 .macro  const   name, align=2
    .macro endconst
 ELF     .size   \name, . - \name
        .purgem endconst
    .endm
 ELF     .section        .rodata
 MACH    .const_data
        .align          \align
 \name:
 .endm
 /* movrel rd, val, offset=0
 *
 * Loads the address of symbol \val plus the constant \offset into register
 * \rd. The instruction sequence depends on the target:
 *   __APPLE__        adrp + add using the @PAGE / @PAGEOFF operators
 *   PIC and _WIN32   adrp + add using :lo12:
 *   PIC (other)      adrp + add using :lo12:, offset folded into the symbol
 *   otherwise        a single ldr rd, =val+offset pseudo-instruction
 * On the Apple and Windows paths a negative offset is not folded into the
 * relocation; the plain symbol address is formed first and the magnitude of
 * the offset is subtracted afterwards. NOTE(review): presumably because
 * negative addends are not accepted by those relocation forms - confirm. */
 .macro  movrel rd, val, offset=0
 #if defined(__APPLE__)
  .if \offset < 0
        adrp            \rd, \val@PAGE
        add             \rd, \rd, \val@PAGEOFF
        sub             \rd, \rd, -(\offset)
  .else
        adrp            \rd, \val+(\offset)@PAGE
        add             \rd, \rd, \val+(\offset)@PAGEOFF
  .endif
 #elif defined(PIC) && defined(_WIN32)
  .if \offset < 0
        adrp            \rd, \val
        add             \rd, \rd, :lo12:\val
        sub             \rd, \rd, -(\offset)
  .else
        adrp            \rd, \val+(\offset)
        add             \rd, \rd, :lo12:\val+(\offset)
  .endif
 #elif defined(PIC)
        adrp            \rd, \val+(\offset)
        add             \rd, \rd, :lo12:\val+(\offset)
 #else
        ldr             \rd, =\val+\offset
 #endif
 .endm
 /* Row strides used to step pointers from one row of a block to the next, for
 * example in dct-a-sve.S and dct-a-sve2.S.
 * NOTE(review): presumably the strides of the decoded (FDEC) and source
 * (FENC) macroblock caches, and they must match the C side - confirm. */
 #define FDEC_STRIDE 32
 #define FENC_STRIDE 16
 // sum = a + b   sub = a - b
 // sum is written first, so it must not be the same register as a or b;
 // sub may alias either input.
 .macro SUMSUB_AB   sum, sub, a, b
    add         \sum,  \a,  \b
    sub         \sub,  \a,  \b
 .endm
 // De-interleave the pair s1, s2: t1 receives the even-indexed elements (uzp1)
 // and t2 the odd-indexed elements (uzp2).
 // t1 is written first, so it must not be the same register as s1 or s2.
 .macro unzip t1, t2, s1, s2
    uzp1        \t1,  \s1,  \s2
    uzp2        \t2,  \s1,  \s2
 .endm
 // Transpose 2x2 element blocks of s1 and s2: t1 interleaves their
 // even-indexed elements (trn1), t2 their odd-indexed elements (trn2).
 // The element size comes from the arrangement suffix passed by the caller.
 // t1 is written first, so it must not be the same register as s1 or s2.
 .macro transpose t1, t2, s1, s2
    trn1        \t1,  \s1,  \s2
    trn2        \t2,  \s1,  \s2
 .endm
 // Transpose a 4x4 matrix of 16-bit elements in place.
 //   v0-v3  rows on entry (low 64 bits of each register), columns on exit
 //   t0-t3  scratch registers, clobbered
 // Arguments are bare register names without an arrangement suffix.
 // First pass swaps 32-bit pairs between rows 0/2 and 1/3, second pass swaps
 // 16-bit elements between the intermediate rows.
 .macro transpose4x4.h v0, v1, v2, v3, t0, t1, t2, t3
    transpose   \t0\().2s,  \t2\().2s,  \v0\().2s,  \v2\().2s
    transpose   \t1\().2s,  \t3\().2s,  \v1\().2s,  \v3\().2s
    transpose   \v0\().4h,  \v1\().4h,  \t0\().4h,  \t1\().4h
    transpose   \v2\().4h,  \v3\().4h,  \t2\().4h,  \t3\().4h
 .endm
 // Same shuffle as transpose4x4.h, applied to full 128-bit registers: the low
 // and the high 64-bit halves of v0-v3 are each transposed as an independent
 // 4x4 block of 16-bit elements.
 //   v0-v3  inputs and results
 //   t0-t3  scratch registers, clobbered
 .macro transpose4x8.h v0, v1, v2, v3, t0, t1, t2, t3
    transpose   \t0\().4s,  \t2\().4s,  \v0\().4s,  \v2\().4s
    transpose   \t1\().4s,  \t3\().4s,  \v1\().4s,  \v3\().4s
    transpose   \v0\().8h,  \v1\().8h,  \t0\().8h,  \t1\().8h
    transpose   \v2\().8h,  \v3\().8h,  \t2\().8h,  \t3\().8h
 .endm
 // Transpose an 8x8 matrix of 16-bit elements in place.
 //   r0-r7  rows on entry; on exit register r<i> holds column i
 //   r8,r9  scratch registers, clobbered
 // Three trn passes with growing element size (16, 32, then 64 bits). The
 // intermediate values rotate through the registers, so the instruction order
 // must not be changed.
 .macro transpose8x8.h r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
    // pass 1: exchange 16-bit elements between neighbouring rows
    trn1        \r8\().8h,  \r0\().8h,  \r1\().8h
    trn2        \r9\().8h,  \r0\().8h,  \r1\().8h
    trn1        \r1\().8h,  \r2\().8h,  \r3\().8h
    trn2        \r3\().8h,  \r2\().8h,  \r3\().8h
    trn1        \r0\().8h,  \r4\().8h,  \r5\().8h
    trn2        \r5\().8h,  \r4\().8h,  \r5\().8h
    trn1        \r2\().8h,  \r6\().8h,  \r7\().8h
    trn2        \r7\().8h,  \r6\().8h,  \r7\().8h
    // pass 2: exchange 32-bit pairs
    trn1        \r4\().4s,  \r0\().4s,  \r2\().4s
    trn2        \r2\().4s,  \r0\().4s,  \r2\().4s
    trn1        \r6\().4s,  \r5\().4s,  \r7\().4s
    trn2        \r7\().4s,  \r5\().4s,  \r7\().4s
    trn1        \r5\().4s,  \r9\().4s,  \r3\().4s
    trn2        \r9\().4s,  \r9\().4s,  \r3\().4s
    trn1        \r3\().4s,  \r8\().4s,  \r1\().4s
    trn2        \r8\().4s,  \r8\().4s,  \r1\().4s
    // pass 3: exchange 64-bit halves, producing the final columns
    trn1        \r0\().2d,  \r3\().2d,  \r4\().2d
    trn2        \r4\().2d,  \r3\().2d,  \r4\().2d
    trn1        \r1\().2d,  \r5\().2d,  \r6\().2d
    trn2        \r5\().2d,  \r5\().2d,  \r6\().2d
    trn2        \r6\().2d,  \r8\().2d,  \r2\().2d
    trn1        \r2\().2d,  \r8\().2d,  \r2\().2d
    trn1        \r3\().2d,  \r9\().2d,  \r7\().2d
    trn2        \r7\().2d,  \r9\().2d,  \r7\().2d
 .endm
 // Transpose two side-by-side 8x8 byte blocks held in eight 16-byte rows.
 //   r0-r7  rows on entry; on exit register r<i> holds byte i of every input
 //          row in its low 8 bytes and byte i+8 of every row in its high 8
 //          bytes
 //   t0,t1  scratch registers, clobbered
 // Same register rotation as transpose8x8.h with element sizes 8, 16 and 32
 // bits; the instruction order must not be changed.
 .macro  transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
    // pass 1: exchange bytes between neighbouring rows
    trn1        \t0\().16b, \r0\().16b, \r1\().16b
    trn2        \t1\().16b, \r0\().16b, \r1\().16b
    trn1        \r1\().16b, \r2\().16b, \r3\().16b
    trn2        \r3\().16b, \r2\().16b, \r3\().16b
    trn1        \r0\().16b, \r4\().16b, \r5\().16b
    trn2        \r5\().16b, \r4\().16b, \r5\().16b
    trn1        \r2\().16b, \r6\().16b, \r7\().16b
    trn2        \r7\().16b, \r6\().16b, \r7\().16b
    // pass 2: exchange 16-bit pairs
    trn1        \r4\().8h,  \r0\().8h,  \r2\().8h
    trn2        \r2\().8h,  \r0\().8h,  \r2\().8h
    trn1        \r6\().8h,  \r5\().8h,  \r7\().8h
    trn2        \r7\().8h,  \r5\().8h,  \r7\().8h
    trn1        \r5\().8h,  \t1\().8h,  \r3\().8h
    trn2        \t1\().8h,  \t1\().8h,  \r3\().8h
    trn1        \r3\().8h,  \t0\().8h,  \r1\().8h
    trn2        \t0\().8h,  \t0\().8h,  \r1\().8h
    // pass 3: exchange 32-bit groups, producing the final layout
    trn1        \r0\().4s,  \r3\().4s,  \r4\().4s
    trn2        \r4\().4s,  \r3\().4s,  \r4\().4s
    trn1        \r1\().4s,  \r5\().4s,  \r6\().4s
    trn2        \r5\().4s,  \r5\().4s,  \r6\().4s
    trn2        \r6\().4s,  \t0\().4s,  \r2\().4s
    trn1        \r2\().4s,  \t0\().4s,  \r2\().4s
    trn1        \r3\().4s,  \t1\().4s,  \r7\().4s
    trn2        \r7\().4s,  \t1\().4s,  \r7\().4s
 .endm
 // Transpose four side-by-side 4x4 byte blocks held in four 16-byte rows.
 //   r0-r3  rows on entry; on exit register r<i> holds, in consecutive 4-byte
 //          groups, bytes i, i+4, i+8 and i+12 of all four input rows
 //   t4-t7  scratch registers, clobbered
 .macro  transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7
    trn1        \t4\().16b, \r0\().16b,  \r1\().16b
    trn2        \t5\().16b, \r0\().16b,  \r1\().16b
    trn1        \t6\().16b, \r2\().16b,  \r3\().16b
    trn2        \t7\().16b, \r2\().16b,  \r3\().16b
    trn1        \r0\().8h,  \t4\().8h,  \t6\().8h
    trn2        \r2\().8h,  \t4\().8h,  \t6\().8h
    trn1        \r1\().8h,  \t5\().8h,  \t7\().8h
    trn2        \r3\().8h,  \t5\().8h,  \t7\().8h
 .endm
 // 64-bit variant of transpose_4x16.b: transposes two side-by-side 4x4 byte
 // blocks held in four 8-byte rows.
 //   r0-r3  rows on entry; on exit register r<i> holds bytes i and i+4 of all
 //          four input rows, as two consecutive 4-byte groups
 //   t4-t7  scratch registers, clobbered
 .macro  transpose_4x8.b  r0, r1, r2, r3, t4, t5, t6, t7
    trn1        \t4\().8b,  \r0\().8b,  \r1\().8b
    trn2        \t5\().8b,  \r0\().8b,  \r1\().8b
    trn1        \t6\().8b,  \r2\().8b,  \r3\().8b
    trn2        \t7\().8b,  \r2\().8b,  \r3\().8b
    trn1        \r0\().4h,  \t4\().4h,  \t6\().4h
    trn2        \r2\().4h,  \t4\().4h,  \t6\().4h
    trn1        \r1\().4h,  \t5\().4h,  \t7\().4h
    trn2        \r3\().4h,  \t5\().4h,  \t7\().4h
 .endm
--- a/common/aarch64/bitstream-a.S
+++ b/common/aarch64/bitstream-a.S
@@ -0,0 +1,82 @@
 /*****************************************************************************
 * bitstream-a.S: aarch64 bitstream functions
 *****************************************************************************
 * Copyright (C) 2014-2025 x264 project
 *
 * Authors: Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "asm.S"
 // nal_escape_neon( dst, src, end ), C prototype in bitstream.h.
 // Copies the bytes in [src, end) to dst and inserts an extra byte 3 in front
 // of every byte <= 3 that directly follows two zero bytes.
 //   x0 = dst, post-incremented by every store; its final value (one past the
 //        last byte written) is what the caller receives
 //   x1 = src, x2 = end
 //        (argument registers assumed from the standard AArch64 calling
 //        convention)
 //   x6 = negative number of bytes still to process
 //   v0 = the previous 16 bytes; lanes 14 and 15 carry the two-byte history
 //   w5 = rolling history of the most recent output bytes (scalar loop only)
 // Blocks of 16 bytes that need no escape are copied with one vector store;
 // blocks that need one, and the final tail of fewer than 16 bytes, go through
 // the byte-wise loop at 101.
 // Note that the local label 16 is defined twice: each 16f reference binds to
 // the next definition after it.
 function nal_escape_neon, export=1
    movi        v0.16b,  #0xff                  // history = 0xff, so no zero run is seen before the first byte
    movi        v4.16b,  #4                     // threshold for the unsigned byte < 4 test
    mov         w3,  #3                         // the escape byte
    subs        x6,  x1,  x2                    // x6 = src - end
    cbz         x6,  99f                        // empty input
 0:
    cmn         x6,  #15                        // flags of x6 + 15
    b.lt        16f                             // at least 16 bytes left: vector path
    mov         x1,  x2                         // tail: address the rest as [end + x6]
    b           100f
 16:
    ld1         {v1.16b}, [x1], #16             // next 16 input bytes
    ext         v2.16b, v0.16b, v1.16b, #14     // per lane: the byte two positions earlier
    ext         v3.16b, v0.16b, v1.16b, #15     // per lane: the byte one position earlier
    cmhi        v7.16b, v4.16b, v1.16b          // current byte < 4
    cmeq        v5.16b, v2.16b, #0              // byte two back == 0
    cmeq        v6.16b, v3.16b, #0              // byte one back == 0
    and         v5.16b, v5.16b, v7.16b
    and         v5.16b, v5.16b, v6.16b          // lane set where an escape byte is required
    shrn        v7.8b,  v5.8h,  #4              // squeeze the 128-bit mask into 64 bits
    mov         x7,  v7.d[0]
    cbz         x7,  16f                        // nothing to escape: bulk store at the second label 16
    mov         x6,  #-16                       // redo this block byte by byte (x1 is already past it)
 100:
    umov        w5,  v0.b[14]                   // seed the scalar history from the last two bytes of v0
    umov        w4,  v0.b[15]
    orr         w5,  w4,  w5, lsl #8
 101:
    ldrb        w4,  [x1, x6]                   // next input byte
    orr         w9,  w4,  w5, lsl #16           // two previous bytes in bits 16-31, current byte in bits 0-7
    cmp         w9,  #3
    b.hi        102f                            // not (two zero bytes followed by a byte <= 3)
    strb        w3,  [x0], #1                   // emit the escape byte
    orr         w5,  w3,  w5, lsl #8            // and record it in the history
 102:
    adds        x6,  x6,  #1                    // sets the flags for the b.lt below
    strb        w4,  [x0], #1                   // emit the input byte
    orr         w5,  w4,  w5, lsl #8
    b.lt        101b                            // loop until x6 reaches 0
    subs        x6,  x1,  x2                    // bytes left after this block (negative), flags for b.lt
    lsr         w9,  w5,  #8
    mov         v0.b[14],  w9                   // hand the two most recent output bytes back
    mov         v0.b[15],  w5                   //   to the vector history
    b.lt        0b
    ret
 16:
    subs        x6,  x1,  x2                    // bytes left (negative), flags for b.lt
    st1         {v1.16b}, [x0], #16             // copy the block unchanged
    mov         v0.16b, v1.16b                  // it becomes the history of the next block
    b.lt        0b
 99:
    ret
 endfunc
--- a/common/aarch64/bitstream.h
+++ b/common/aarch64/bitstream.h
@@ -0,0 +1,32 @@
 /*****************************************************************************
 * bitstream.h: aarch64 bitstream functions
 *****************************************************************************
 * Copyright (C) 2017-2025 x264 project
 *
 * Authors: Anton Mitrofanov <BugMaster@narod.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_AARCH64_BITSTREAM_H
 #define X264_AARCH64_BITSTREAM_H
 /* Route the generic name through x264_template(), which is defined outside
 * this header. NOTE(review): presumably it adds the bit-depth specific symbol
 * prefix used by the exported assembly function - confirm. */
 #define x264_nal_escape_neon x264_template(nal_escape_neon)
 /* NEON implementation, see bitstream-a.S.
 * Copies the bytes in [src, end) to dst, inserting an extra byte 0x03 in
 * front of every byte <= 3 that directly follows two zero bytes.
 * Returns a pointer one past the last byte written to dst; dst must be large
 * enough to hold the input plus the inserted bytes. */
 uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end );
 #endif
--- a/common/aarch64/cabac-a.S
+++ b/common/aarch64/cabac-a.S
@@ -0,0 +1,131 @@
 /*****************************************************************************
 * cabac-a.S: aarch64 cabac
 *****************************************************************************
 * Copyright (C) 2014-2025 x264 project
 *
 * Authors: Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "asm.S"
 #include "asm-offsets.h"
 // w11 holds x264_cabac_t.i_low
 // w12 holds x264_cabac_t.i_range
 //
 // cabac_encode_decision_asm: encode one bin using an adaptive context.
 //   x0 = pointer to the x264_cabac_t, base of every CABAC_ offset used below
 //   w1 = context index, selects the state byte at CABAC_STATE + w1
 //   w2 = bin value b
 //   (argument registers assumed from the standard AArch64 calling convention)
 // The labels cabac_encode_renorm and cabac_putbyte are part of this function
 // body. cabac_putbyte is also a branch target of the bypass and terminal
 // functions further down; it expects x0 = context, w2 = updated i_queue,
 // w11 = i_low and w12 = i_range, writes all four scalar fields back and
 // returns to the original caller.
 function cabac_encode_decision_asm, export=1
    add         w10, w1,  #CABAC_STATE          // offset of the state byte for this context
    ldrb        w3,  [x0,  w10, uxtw]           // i_state
    ldr         w12, [x0,  #CABAC_I_RANGE]
    // table base biased by -4: together with the row and the w5 index below
    // this addresses column (i_range >> 6) - 4 of a 4-byte row
    movrel      x8,  X264(cabac_range_lps), -4
    movrel      x9,  X264(cabac_transition)
    ubfx        x4,  x3,  #1,  #7               // i_state >> 1 (7 bits)
    asr         w5,  w12, #6                    // i_range >> 6
    add         x8,  x8,  x4, lsl #2            // row = i_state >> 1, 4 bytes per row
    orr         w14, w2,  w3, lsl #1            // transition index = (i_state << 1) | b
    ldrb        w4,  [x8,  w5,  uxtw]           // i_range_lps
    ldr         w11, [x0,  #CABAC_I_LOW]
    eor         w6,  w2,  w3                    // b ^ i_state
    ldrb        w9,  [x9,  w14, uxtw]           // next state
    sub         w12, w12, w4                    // i_range -= i_range_lps
    add         w7,  w11, w12                   // i_low + reduced i_range
    tst         w6,  #1                         // (b ^ i_state) & 1
    csel        w12, w4, w12, ne                // if set: i_range = i_range_lps
    csel        w11, w7, w11, ne                //   and i_low += reduced i_range
    strb        w9,  [x0,  w10, uxtw]           // i_state
 cabac_encode_renorm:
    ldr         w2,  [x0, #CABAC_I_QUEUE]
    clz         w5,  w12
    sub         w5,  w5,  #23                   // shift that moves the top set bit of i_range to bit 8
    lsl         w11, w11, w5
    lsl         w12, w12, w5
    adds        w2,  w2,  w5                    // i_queue += shift
    b.ge        cabac_putbyte                   // i_queue >= 0: continue in cabac_putbyte
    stp         w11, w12, [x0, #CABAC_I_LOW]    // store i_low, i_range
    str         w2,  [x0, #CABAC_I_QUEUE]
    ret
 .align 5
 cabac_putbyte:
    ldr         w6,  [x0, #CABAC_I_BYTES_OUTSTANDING]
    add         w14, w2,  #10                   // bit position of the value to extract
    mov         w13, #-1
    sub         w2,  w2,  #8                    // i_queue -= 8
    asr         w4,  w11, w14           // out
    lsl         w13, w13, w14                   // mask of all bits from position w14 upwards
    subs        w5,  w4,  #0xff                 // flags: out == 0xff ?
    bic         w11, w11, w13                   // remove the extracted bits from i_low
    cinc        w6,  w6,  eq                    // out == 0xff: count it as outstanding, write nothing
    b.eq        0f
 1:
    ldr         x7,  [x0, #CABAC_P]             // current output pointer
    asr         w5,  w4,  #8            // carry
    ldurb       w8,  [x7, #-1]
    add         w8,  w8,  w5
    sub         w5,  w5,  #1                    // carry - 1: the value written for each outstanding byte
    sturb       w8,  [x7, #-1]                  // add the carry to the previously written byte
    cbz         w6,  3f
 2:
    subs        w6,  w6,  #1
    strb        w5,  [x7],  #1
    b.gt        2b                              // write all outstanding bytes, w6 ends at 0
 3:
    strb        w4,  [x7],  #1                  // low 8 bits of out
    str         x7,  [x0, #CABAC_P]
 0:
    stp         w11, w12, [x0, #CABAC_I_LOW]    // store i_low, i_range
    stp         w2,  w6,  [x0, #CABAC_I_QUEUE]  // store i_queue, i_bytes_outstanding
    ret
 endfunc
 // cabac_encode_bypass_asm: encode one bin without a context.
 //   x0 = pointer to the x264_cabac_t
 //   w1 = bin value, used as a bit mask on i_range
 //        NOTE(review): presumably callers pass 0 or -1 - confirm
 //   (argument registers assumed from the standard AArch64 calling convention)
 // Computes i_low = (i_low << 1) + (w1 & i_range) and i_queue += 1, then
 // either stores both fields or continues in cabac_putbyte, which is defined
 // inside cabac_encode_decision_asm.
 function cabac_encode_bypass_asm, export=1, align=5
    ldr         w12, [x0, #CABAC_I_RANGE]
    ldr         w11, [x0, #CABAC_I_LOW]
    ldr         w2,  [x0, #CABAC_I_QUEUE]
    and         w1,  w1,  w12                   // w1 & i_range
    add         w11, w1,  w11, lsl #1           // i_low = (i_low << 1) + (w1 & i_range)
    adds        w2,  w2,  #1                    // i_queue += 1
    b.ge        cabac_putbyte                   // i_queue >= 0: continue in cabac_putbyte
    str         w11, [x0, #CABAC_I_LOW]
    str         w2,  [x0, #CABAC_I_QUEUE]
    ret
 endfunc
 // cabac_encode_terminal_asm: subtract 2 from i_range and renormalise by one
 // bit if that cleared bit 8 of i_range.
 //   x0 = pointer to the x264_cabac_t
 //   (argument register assumed from the standard AArch64 calling convention)
 // The renormalising path shifts i_low and i_range left by one, increments
 // i_queue and then either stores the fields or continues in cabac_putbyte,
 // which is defined inside cabac_encode_decision_asm.
 function cabac_encode_terminal_asm, export=1, align=5
    ldr         w12, [x0, #CABAC_I_RANGE]
    sub         w12, w12, #2                    // i_range -= 2
    tbz         w12, #8, 1f                     // bit 8 clear: renormalise
    str         w12, [x0, #CABAC_I_RANGE]       // bit 8 still set: only i_range changes
    ret
 1:
    ldr         w2,  [x0, #CABAC_I_QUEUE]
    ldr         w11, [x0, #CABAC_I_LOW]
    lsl         w12, w12, #1                    // i_range <<= 1
    adds        w2,  w2,  #1                    // i_queue += 1, sets the flags for b.ge
    lsl         w11, w11, #1                    // i_low <<= 1
    b.ge        cabac_putbyte                   // i_queue >= 0: continue in cabac_putbyte
    stp         w11, w12, [x0, #CABAC_I_LOW]    // store i_low, i_range
    str         w2,  [x0, #CABAC_I_QUEUE]
    ret
 endfunc
--- a/common/aarch64/dct-a-common.S
+++ b/common/aarch64/dct-a-common.S
@@ -0,0 +1,40 @@
 /****************************************************************************
 * dct-a-common.S: aarch64 transform and zigzag
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *          David Chen   <david.chen@myais.com.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 // This file contains the NEON macros that are intended to be used by
 // the SVE/SVE2 functions as well
 // One-dimensional 4-point transform stage.
 // Inputs a, b, c, d are passed as v4, v5, v6, v7; results go to v0-v3:
 //   v0 = (a + d) + (b + c)
 //   v1 = 2*(a - d) + (b - c)
 //   v2 = (a + d) - (b + c)
 //   v3 = (a - d) - 2*(b - c)
 // All arguments are full operands including the arrangement suffix.
 // v4-v7 are overwritten with intermediate values, and v0-v3 must be distinct
 // from v4-v7 because outputs are written before all inputs are consumed.
 // Requires SUMSUB_AB from asm.S.
 .macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7
    SUMSUB_AB   \v1, \v6, \v5, \v6
    SUMSUB_AB   \v3, \v7, \v4, \v7
    add         \v0, \v3, \v1
    add         \v4, \v7, \v7
    add         \v5, \v6, \v6
    sub         \v2, \v3, \v1
    add         \v1, \v4, \v6
    sub         \v3, \v7, \v5
 .endm
--- a/common/aarch64/dct-a-sve.S
+++ b/common/aarch64/dct-a-sve.S
@@ -0,0 +1,88 @@
 /****************************************************************************
 * dct-a-sve.S: aarch64 transform and zigzag
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Chen <david.chen@myais.com.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "asm.S"
 #include "dct-a-common.S"
 ENABLE_SVE
 // SVE instructions may be assembled from this point on in this file.
 //
 // sub4x4_dct_sve: subtract two 4x4 byte blocks and apply DCT_1D to the
 // difference in both directions.
 //   x0 = output, receives 16 16-bit values (32 bytes)
 //   x1 = first block, rows FENC_STRIDE bytes apart
 //   x2 = second block, subtracted from the first, rows FDEC_STRIDE bytes apart
 //   (argument registers assumed from the standard AArch64 calling convention)
 // Each SVE ld1b reads 4 bytes and zero-extends them into 16-bit lanes, which
 // replaces a separate load plus widening step; the arithmetic afterwards
 // works on the low 64 bits of the same registers through their NEON names.
 function sub4x4_dct_sve, export=1
    mov         x3, #FENC_STRIDE
    mov         x4, #FDEC_STRIDE
    ptrue       p0.h, vl4                       // first four 16-bit lanes active
    ld1b        {z0.h}, p0/z, [x1]              // row 0 of the first block
    add         x1, x1, x3
    ld1b        {z1.h}, p0/z, [x2]              // row 0 of the second block
    add         x2, x2, x4
    ld1b        {z2.h}, p0/z, [x1]              // row 1
    add         x1, x1, x3
    sub         v16.4h, v0.4h, v1.4h            // difference row 0
    ld1b        {z3.h}, p0/z, [x2]
    add         x2, x2, x4
    ld1b        {z4.h}, p0/z, [x1]              // row 2
    add         x1, x1, x3
    sub         v17.4h, v2.4h, v3.4h            // difference row 1
    ld1b        {z5.h}, p0/z, [x2]
    add         x2, x2, x4
    ld1b        {z6.h}, p0/z, [x1]              // row 3
    sub         v18.4h, v4.4h, v5.4h            // difference row 2
    ld1b        {z7.h}, p0/z, [x2]
    sub         v19.4h, v6.4h, v7.4h            // difference row 3
    // first pass over v16-v19, transpose, second pass into v4-v7
    DCT_1D      v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
    transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
    DCT_1D      v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
    st1         {v4.4h,v5.4h,v6.4h,v7.4h}, [x0] // 32 bytes of output
    ret
 endfunc
 // zigzag_interleave_8x8_cavlc_sve: split 64 16-bit coefficients into four
 // groups of 16 (coefficient i goes to group i mod 4) and write one flag byte
 // per group telling whether the group contains a non-zero value.
 //   x0 = destination, receives the four groups back to back (128 bytes)
 //   x1 = source, 64 16-bit coefficients
 //   x2 = flag output; bytes 0, 1, 8 and 9 are written for groups 0-3
 //   (argument registers assumed from the standard AArch64 calling convention)
 // The coefficient stores are interleaved with the flag computation.
 function zigzag_interleave_8x8_cavlc_sve, export=1
    mov         z31.s, #1                       // constant 1 in every 32-bit lane
    ptrue       p2.s, vl2                       // first two 32-bit lanes active
    ld4         {v0.8h,v1.8h,v2.8h,v3.8h}, [x1],  #64   // de-interleave the first 32 coefficients
    ld4         {v4.8h,v5.8h,v6.8h,v7.8h}, [x1],  #64   // and the second 32
    umax        v16.8h, v0.8h,  v4.8h           // unsigned max is non-zero iff any input is non-zero
    umax        v17.8h, v1.8h,  v5.8h
    umax        v18.8h, v2.8h,  v6.8h
    umax        v19.8h, v3.8h,  v7.8h
    st1         {v0.8h}, [x0],  #16             // group 0
    st1         {v4.8h}, [x0],  #16
    umaxp       v16.8h, v16.8h, v17.8h          // pairwise reduce groups 0 and 1
    umaxp       v18.8h, v18.8h, v19.8h          // pairwise reduce groups 2 and 3
    st1         {v1.8h}, [x0],  #16             // group 1
    st1         {v5.8h}, [x0],  #16
    umaxp       v16.8h, v16.8h, v18.8h          // one 32-bit lane per group
    st1         {v2.8h}, [x0],  #16             // group 2
    st1         {v6.8h}, [x0],  #16
    cmhs        v16.4s, v16.4s, v31.4s          // lane >= 1, i.e. the group has a non-zero value
    st1         {v3.8h}, [x0],  #16             // group 3
    and         v16.16b, v16.16b, v31.16b       // turn the all-ones mask into the value 1
    st1         {v7.8h}, [x0],  #16
    st1b        {z16.s}, p2, [x2]               // flags of groups 0 and 1 as two bytes
    add         x2, x2, #8
    mov         v16.d[0], v16.d[1]              // bring groups 2 and 3 into the active lanes
    st1b        {z16.s}, p2, [x2]               // flags of groups 2 and 3, 8 bytes further on
    ret
 endfunc
--- a/common/aarch64/dct-a-sve2.S
+++ b/common/aarch64/dct-a-sve2.S
@@ -0,0 +1,90 @@
 /****************************************************************************
 * dct-a-sve2.S: aarch64 transform and zigzag
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Chen <david.chen@myais.com.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "asm.S"
 #include "dct-a-common.S"
 ENABLE_SVE
 ENABLE_SVE2
 // SVE and SVE2 instructions may be assembled from this point on in this file.
 //
 // add4x4_idct_sve2: inverse-transform a 4x4 block of 16-bit coefficients,
 // add the result to a 4x4 block of bytes and write it back with unsigned
 // saturation.
 //   x0 = byte block, read and written, rows FDEC_STRIDE bytes apart
 //   x1 = 16 16-bit coefficients (32 bytes)
 //   (argument registers assumed from the standard AArch64 calling convention)
 //   x11 = read pointer into the byte block, x0 is kept for the stores
 // The row loads use predicate p0 (8 lanes), so 8 bytes are read per row, but
 // only the low four lanes reach the stores, which use p1 (4 lanes).
 // Rows 2 and 3 travel in swapped registers: row 2 is loaded into z31, added
 // to v3 and stored from z3, row 3 is loaded into z30, added to v2 and stored
 // from z2.
 function add4x4_idct_sve2, export=1
    mov         x2, #FDEC_STRIDE
    mov         x11, x0
    ptrue       p0.h, vl8                       // eight 16-bit lanes, used by the row loads
    ptrue       p1.h, vl4                       // four 16-bit lanes, used by the shifts and stores
    ld1         {v0.8h, v1.8h}, [x1]            // all 16 coefficients
    // first 1-D pass, two coefficient rows per register
    SUMSUB_AB   v4.8h, v5.8h, v0.8h, v1.8h
    sshr        v7.8h, v0.8h, #1
    sshr        v6.8h, v1.8h, #1
    sub         v7.8h, v7.8h, v1.8h
    add         v6.8h, v6.8h, v0.8h
    mov         v7.d[0], v7.d[1]                // bring the high-half results down to the low half
    mov         v6.d[0], v6.d[1]
    ld1b        {z28.h}, p0/z, [x11]            // byte row 0, zero-extended to 16 bits
    add         x11, x11, x2
    SUMSUB_AB   v0.8h, v2.8h, v4.8h, v6.8h
    SUMSUB_AB   v1.8h, v3.8h, v5.8h, v7.8h
    transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19
    // second 1-D pass on the transposed data
    SUMSUB_AB   v4.4h, v5.4h, v0.4h, v3.4h
    sshr        v7.4h, v1.4h, #1
    sshr        v6.4h, v2.4h, #1
    sub         v7.4h, v7.4h, v2.4h
    add         v6.4h, v6.4h, v1.4h
    ld1b        {z29.h}, p0/z, [x11]            // byte row 1
    add         x11, x11, x2
    SUMSUB_AB   v0.4h, v2.4h, v4.4h, v6.4h
    SUMSUB_AB   v1.4h, v3.4h, v5.4h, v7.4h
    srshr       z0.h, p1/m, z0.h, #6            // rounding shift right by 6 on the four active lanes
    srshr       z1.h, p1/m, z1.h, #6
    ld1b        {z31.h}, p0/z, [x11]            // byte row 2
    add         x11, x11, x2
    srshr       z2.h, p1/m, z2.h, #6
    srshr       z3.h, p1/m, z3.h, #6
    ld1b        {z30.h}, p0/z, [x11]            // byte row 3
    add         v0.8h, v0.8h, v28.8h            // add the residual to the existing bytes
    add         v1.8h, v1.8h, v29.8h
    add         v2.8h, v2.8h, v30.8h
    add         v3.8h, v3.8h, v31.8h
    sqxtunb     z0.b, z0.h                      // saturate to unsigned 8 bit, result in the low byte of each lane
    sqxtunb     z1.b, z1.h
    sqxtunb     z2.b, z2.h
    sqxtunb     z3.b, z3.h
    st1b        {z0.h}, p1, [x0]                // row 0: four bytes
    add         x0, x0, x2
    st1b        {z1.h}, p1, [x0]                // row 1
    add         x0, x0, x2
    st1b        {z3.h}, p1, [x0]                // row 2
    add         x0, x0, x2
    st1b        {z2.h}, p1, [x0]                // row 3
    ret
 endfunc
--- a/common/aarch64/dct-a.S
+++ b/common/aarch64/dct-a.S
@@ -0,0 +1,998 @@
 /****************************************************************************
 * dct-a.S: aarch64 transform and zigzag
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "asm.S"
 #include "dct-a-common.S"
 // tbl byte indices used by zigzag_scan_4x4_frame_neon.  Each pair (2k, 2k+1)
 // selects 16-bit coefficient k of the 32-byte input, giving the coefficient
 // order 0 4 1 2 5 8 12 9 6 3 7 10 13 14 11 15.
 const scan4x4_frame, align=4
 .byte    0,1,   8,9,   2,3,   4,5
 .byte   10,11, 16,17, 24,25, 18,19
 .byte   12,13,  6,7,  14,15, 20,21
 .byte   26,27, 28,29, 22,23, 30,31
 endconst
 // tbl byte indices used by zigzag_scan_4x4_field_neon.  Only the first eight
 // coefficients are permuted (order 0 1 4 2 3 5 6 7); that routine stores the
 // second eight unchanged.
 const scan4x4_field, align=4
 .byte    0,1,   2,3,   8,9,   4,5
 .byte    6,7,  10,11, 12,13, 14,15
 endconst
 // Byte permutation applied by the zigzag_sub_4x4 frame variants to a 4x4
 // pixel block held as 16 bytes (byte index = 4*row + column).
 const sub4x4_frame, align=4
 .byte    0,  1,  4,  8
 .byte    5,  2,  3,  6
 .byte    9, 12, 13, 10
 .byte    7, 11, 14, 15
 endconst
 // Byte permutation applied by the zigzag_sub_4x4 field variants to a 4x4
 // pixel block held as 16 bytes (byte index = 4*row + column).
 const sub4x4_field, align=4
 .byte    0,  4,  1,  8
 .byte   12,  5,  9, 13
 .byte    2,  6, 10, 14
 .byte    3,  7, 11, 15
 endconst
 // sum = a + (b>>shift)   sub = (a>>shift) - b
 // The shift is arithmetic (sshr).  t0 and t1 are scratch and are clobbered.
 // Both shifts are issued before either result is written, so sub may alias
 // a (IDCT8_1D relies on this); sum must not alias b, which is read again by
 // the final sub.
 .macro SUMSUB_SHR shift sum sub a b t0 t1
    sshr        \t0,  \b, #\shift
    sshr        \t1,  \a, #\shift
    add         \sum, \a, \t0
    sub         \sub, \t1, \b
 .endm
 // sum = (a>>shift) + b   sub = a - (b>>shift)
 // The shift is arithmetic (sshr).  t0 and t1 are scratch and are clobbered.
 // sum is written before a is read for the last time, so sum must not
 // alias a.
 .macro SUMSUB_SHR2 shift sum sub a b t0 t1
    sshr        \t0,  \a, #\shift
    sshr        \t1,  \b, #\shift
    add         \sum, \t0, \b
    sub         \sub, \a, \t1
 .endm
 // a += 1.5*ma   b -= 1.5*mb
 // 1.5*m is formed as m + (m>>1) with an arithmetic shift.  a and b are
 // updated in place; t0 and t1 are scratch and are clobbered.
 .macro SUMSUB_15 a b ma mb t0 t1
    sshr        \t0, \ma, #1
    sshr        \t1, \mb, #1
    add         \t0, \t0, \ma
    add         \t1, \t1, \mb
    add         \a,  \a,  \t0
    sub         \b,  \b,  \t1
 .endm
 // dct4x4dc_neon: in-place butterfly transform of the 16 int16 values at [x0]
 // (four rows of four), with a rounded halving in the last stage.
 //   x0 = int16 d[16], read and overwritten.
 // Clobbers v0-v7, v16, v17, v31.
 // SUMSUB_AB / transpose come from the included headers (presumably a
 // sum/difference butterfly and a trn1/trn2 pair - confirm there).
 function dct4x4dc_neon, export=1
    ld1         {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
    // Constant 1, used below to turn shsub into a rounding halving subtract.
    movi        v31.4h, #1
    // First pass: two butterfly levels across the four row registers.
    SUMSUB_AB   v4.4h,  v5.4h,  v0.4h,  v1.4h
    SUMSUB_AB   v6.4h,  v7.4h,  v2.4h,  v3.4h
    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
    SUMSUB_AB   v3.4h,  v1.4h,  v5.4h,  v7.4h
    // Second pass, interleaved with the halfword and word transposes.
    transpose   v4.4h,  v6.4h,  v0.4h,  v2.4h
    transpose   v5.4h,  v7.4h,  v1.4h,  v3.4h
    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
    SUMSUB_AB   v1.4h,  v3.4h,  v5.4h,  v7.4h
    transpose   v4.2s,  v5.2s,  v0.2s,  v1.2s
    transpose   v6.2s,  v7.2s,  v2.2s,  v3.2s
    // Final level: srhadd gives (a+b+1)>>1; adding 1 to the minuend before
    // shsub gives (a-b+1)>>1, so sums and differences round the same way.
    add         v16.4h, v4.4h,  v31.4h
    add         v17.4h, v6.4h,  v31.4h
    srhadd      v0.4h,  v4.4h,  v5.4h
    shsub       v1.4h,  v16.4h, v5.4h
    shsub       v2.4h,  v17.4h, v7.4h
    srhadd      v3.4h,  v6.4h,  v7.4h
    st1         {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
    ret
 endfunc
 // idct4x4dc_neon: in-place butterfly transform of the 16 int16 values at
 // [x0].  Same structure as dct4x4dc_neon but the last stage is a plain
 // sum/difference with no halving.
 //   x0 = int16 d[16], read and overwritten.
 // Clobbers v0-v7.
 function idct4x4dc_neon, export=1
    ld1         {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
    // First pass: two butterfly levels across the four row registers.
    SUMSUB_AB   v4.4h,  v5.4h,  v0.4h,  v1.4h
    SUMSUB_AB   v6.4h,  v7.4h,  v2.4h,  v3.4h
    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
    SUMSUB_AB   v3.4h,  v1.4h,  v5.4h,  v7.4h
    // Second pass, interleaved with the halfword and word transposes.
    transpose   v4.4h,  v6.4h,  v0.4h,  v2.4h
    transpose   v5.4h,  v7.4h,  v1.4h,  v3.4h
    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
    SUMSUB_AB   v1.4h,  v3.4h,  v5.4h,  v7.4h
    transpose   v4.2s,  v5.2s,  v0.2s,  v1.2s
    transpose   v6.2s,  v7.2s,  v2.2s,  v3.2s
    // Final level written straight into the output registers.
    SUMSUB_AB   v0.4h,  v1.4h,  v4.4h,  v5.4h
    SUMSUB_AB   v3.4h,  v2.4h,  v6.4h,  v7.4h
    st1         {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
    ret
 endfunc
 // sub4x4_dct_neon: forward 4x4 transform of the byte difference between two
 // 4x4 pixel blocks.
 //   x0 = output, 16 int16 coefficients
 //   x1 = first pixel block, row pitch FENC_STRIDE
 //   x2 = second pixel block, row pitch FDEC_STRIDE (subtracted from x1)
 // x1 and x2 are advanced by four rows.  Clobbers x3, x4, v0-v7, v16-v19.
 // DCT_1D and transpose4x4.h are defined in the included headers.
 function sub4x4_dct_neon, export=1
    mov         x3, #FENC_STRIDE
    mov         x4, #FDEC_STRIDE
    // Load four bytes per row from each block; the widening subtracts are
    // interleaved with the loads.
    ld1         {v0.s}[0], [x1], x3
    ld1         {v1.s}[0], [x2], x4
    ld1         {v2.s}[0], [x1], x3
    usubl       v16.8h, v0.8b,  v1.8b
    ld1         {v3.s}[0], [x2], x4
    ld1         {v4.s}[0], [x1], x3
    usubl       v17.8h, v2.8b,  v3.8b
    ld1         {v5.s}[0], [x2], x4
    ld1         {v6.s}[0], [x1], x3
    usubl       v18.8h, v4.8b,  v5.8b
    ld1         {v7.s}[0], [x2], x4
    usubl       v19.8h, v6.8b,  v7.8b
    // Two 1-D passes with a transpose in between; only the low four lanes
    // of each difference register are used.
    DCT_1D      v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
    transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
    DCT_1D      v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
    st1         {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
    ret
 endfunc
 // sub8x4_dct_neon: file-local helper (not exported) that transforms the
 // difference of two 8x4 pixel areas as two side-by-side 4x4 blocks.
 // Register contract (set up by the callers in this file):
 //   x0 = output; 64 bytes are written and x0 is advanced past them
 //   x1 = first pixel area,  x3 = its row pitch
 //   x2 = second pixel area, x4 = its row pitch
 // x1 and x2 are advanced by four rows.  Only x0-x4 and vector registers
 // (v0-v7, v16-v19, v21, v22) are touched, so callers may keep their return
 // address in x5.
 function sub8x4_dct_neon
    // Four rows of eight bytes from each area, widened to 16-bit differences.
    ld1         {v0.8b}, [x1], x3
    ld1         {v1.8b}, [x2], x4
    usubl       v16.8h, v0.8b,  v1.8b
    ld1         {v2.8b}, [x1], x3
    ld1         {v3.8b}, [x2], x4
    usubl       v17.8h, v2.8b,  v3.8b
    ld1         {v4.8b}, [x1], x3
    ld1         {v5.8b}, [x2], x4
    usubl       v18.8h, v4.8b,  v5.8b
    ld1         {v6.8b}, [x1], x3
    ld1         {v7.8b}, [x2], x4
    usubl       v19.8h, v6.8b,  v7.8b
    // First 1-D pass on all eight columns, then transpose.
    DCT_1D      v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
    transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7
    // Second 1-D pass written out inline:
    //   v16 = v0+v3   v19 = v0-v3   v17 = v1+v2   v18 = v1-v2
    SUMSUB_AB   v16.8h, v19.8h, v0.8h,  v3.8h
    SUMSUB_AB   v17.8h, v18.8h, v1.8h,  v2.8h
    add         v22.8h, v19.8h, v19.8h
    add         v21.8h, v18.8h, v18.8h
    add         v0.8h,  v16.8h, v17.8h
    sub         v1.8h,  v16.8h, v17.8h
    // v2 = 2*v19 + v18     v3 = v19 - 2*v18
    add         v2.8h,  v22.8h, v18.8h
    sub         v3.8h,  v19.8h, v21.8h
    // Regroup 64-bit halves: v4,v5 collect the low halves and v6,v7 the high
    // halves, so the 32 bytes of each 4x4 block are stored contiguously.
    zip1        v4.2d,  v0.2d,  v2.2d
    zip2        v6.2d,  v0.2d,  v2.2d
    zip1        v5.2d,  v1.2d,  v3.2d
    zip2        v7.2d,  v1.2d,  v3.2d
    st1         {v4.8h}, [x0], #16
    st1         {v5.8h}, [x0], #16
    st1         {v6.8h}, [x0], #16
    st1         {v7.8h}, [x0], #16
    ret
 endfunc
 // sub8x8_dct_neon: difference + 4x4 transforms for an 8x8 area, done as two
 // stacked 8x4 strips by sub8x4_dct_neon.
 //   x0 = output coefficients, x1 = first pixel area (FENC_STRIDE pitch),
 //   x2 = second pixel area (FDEC_STRIDE pitch).
 // The helper advances x0, x1 and x2 itself, so no pointer fix-up is needed
 // between the two strips.
 function sub8x8_dct_neon, export=1
    // Row pitches expected by the helper in x3 / x4.
    mov         x4, #FDEC_STRIDE
    mov         x3, #FENC_STRIDE
    // bl overwrites x30; x5 is left alone by the helper, so park it there.
    mov         x5,  x30
    bl          sub8x4_dct_neon
    // Lower strip is a tail call: its ret returns directly to our caller.
    mov         x30, x5
    b           sub8x4_dct_neon
 endfunc
 // sub16x16_dct_neon: difference + 4x4 transforms for a 16x16 area, done as
 // eight 8x4 strips.  The 8x8 quadrants are visited in the order top-left,
 // top-right, bottom-left, bottom-right; x0 just keeps advancing.
 //   x0 = output coefficients, x1 = first pixel area (FENC_STRIDE pitch),
 //   x2 = second pixel area (FDEC_STRIDE pitch).
 function sub16x16_dct_neon, export=1
    // Row pitches expected by the helper in x3 / x4.
    mov         x4, #FDEC_STRIDE
    mov         x3, #FENC_STRIDE
    // bl overwrites x30; x5 is left alone by the helper, so park it there.
    mov         x5,  x30
    // Top-left 8x8.
    bl          sub8x4_dct_neon
    bl          sub8x4_dct_neon
    // Back up eight rows and step eight columns right.
    sub         x2, x2, #8*FDEC_STRIDE-8
    sub         x1, x1, #8*FENC_STRIDE-8
    // Top-right 8x8.
    bl          sub8x4_dct_neon
    bl          sub8x4_dct_neon
    // Already eight rows down; step eight columns left.
    sub         x2, x2, #8
    sub         x1, x1, #8
    // Bottom-left 8x8.
    bl          sub8x4_dct_neon
    bl          sub8x4_dct_neon
    // Back up eight rows and step eight columns right.
    sub         x2, x2, #8*FDEC_STRIDE-8
    sub         x1, x1, #8*FENC_STRIDE-8
    // Bottom-right 8x8; the final strip is a tail call.
    bl          sub8x4_dct_neon
    mov         x30, x5
    b           sub8x4_dct_neon
 endfunc
 // DCT8_1D: one 1-D pass of the 8-point forward transform on fixed registers.
 //   in : v0-v7 (eight 16-bit lanes each)
 //   out: v0-v7 (overwritten)
 //   scratch: v16-v31
 // The type argument (row / col at the call sites) is not referenced in the
 // body; both passes run the same code.
 .macro DCT8_1D type
    SUMSUB_AB   v18.8h, v17.8h, v3.8h,  v4.8h   // s34/d34
    SUMSUB_AB   v19.8h, v16.8h, v2.8h,  v5.8h   // s25/d25
    SUMSUB_AB   v22.8h, v21.8h, v1.8h,  v6.8h   // s16/d16
    SUMSUB_AB   v23.8h, v20.8h, v0.8h,  v7.8h   // s07/d07
    SUMSUB_AB   v24.8h, v26.8h,  v23.8h, v18.8h  // a0/a2
    SUMSUB_AB   v25.8h, v27.8h,  v22.8h, v19.8h  // a1/a3
    SUMSUB_AB   v30.8h, v29.8h,  v20.8h, v17.8h  // a6/a5
    // v23 = 1.5*d16, v18 = 1.5*d25, subtracted from a6 / a5.
    sshr        v23.8h, v21.8h, #1
    sshr        v18.8h, v16.8h, #1
    add         v23.8h, v23.8h, v21.8h
    add         v18.8h, v18.8h, v16.8h
    sub         v30.8h, v30.8h, v23.8h
    sub         v29.8h, v29.8h, v18.8h
    SUMSUB_AB   v28.8h, v31.8h,  v21.8h, v16.8h   // a4/a7
    // v22 = 1.5*d07, v19 = 1.5*d34, added to a4 (result in v22) and a7.
    sshr        v22.8h, v20.8h, #1
    sshr        v19.8h, v17.8h, #1
    add         v22.8h, v22.8h, v20.8h
    add         v19.8h, v19.8h, v17.8h
    add         v22.8h, v28.8h, v22.8h
    add         v31.8h, v31.8h, v19.8h
    // Final combination into the output registers v0-v7.
    SUMSUB_AB   v0.8h,  v4.8h,  v24.8h, v25.8h
    SUMSUB_SHR  2, v1.8h,  v7.8h,  v22.8h, v31.8h, v16.8h, v17.8h
    SUMSUB_SHR  1, v2.8h,  v6.8h,  v26.8h, v27.8h, v18.8h, v19.8h
    SUMSUB_SHR2 2, v3.8h,  v5.8h,  v30.8h, v29.8h, v20.8h, v21.8h
 .endm
 // sub8x8_dct8_neon: 8x8 forward transform of the byte difference between two
 // 8x8 pixel blocks.
 //   x0 = output; 64 int16 are written and x0 is advanced by 128 bytes
 //   x1 = first pixel block, row pitch FENC_STRIDE
 //   x2 = second pixel block, row pitch FDEC_STRIDE (subtracted from x1)
 // x1 and x2 are advanced by eight rows.  Clobbers x3, x4, v0-v7, v16-v31;
 // no other general-purpose register is touched.
 function sub8x8_dct8_neon, export=1
    mov         x3, #FENC_STRIDE
    mov         x4, #FDEC_STRIDE
    // Load eight rows from each block; each widening subtract into v0-v7 is
    // issued one load pair behind its operands.
    ld1         {v16.8b}, [x1], x3
    ld1         {v17.8b}, [x2], x4
    ld1         {v18.8b}, [x1], x3
    ld1         {v19.8b}, [x2], x4
    usubl       v0.8h,  v16.8b, v17.8b
    ld1         {v20.8b}, [x1], x3
    ld1         {v21.8b}, [x2], x4
    usubl       v1.8h,  v18.8b, v19.8b
    ld1         {v22.8b}, [x1], x3
    ld1         {v23.8b}, [x2], x4
    usubl       v2.8h,  v20.8b, v21.8b
    ld1         {v24.8b}, [x1], x3
    ld1         {v25.8b}, [x2], x4
    usubl       v3.8h,  v22.8b, v23.8b
    ld1         {v26.8b}, [x1], x3
    ld1         {v27.8b}, [x2], x4
    usubl       v4.8h,  v24.8b, v25.8b
    ld1         {v28.8b}, [x1], x3
    ld1         {v29.8b}, [x2], x4
    usubl       v5.8h,  v26.8b, v27.8b
    ld1         {v30.8b}, [x1], x3
    ld1         {v31.8b}, [x2], x4
    usubl       v6.8h,  v28.8b, v29.8b
    usubl       v7.8h,  v30.8b, v31.8b
    // Two 1-D passes with an 8x8 transpose in between (v30/v31 as scratch).
    DCT8_1D     row
    transpose8x8.h v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
    DCT8_1D     col
    st1         {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], #64
    st1         {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], #64
    ret
 endfunc
 // sub16x16_dct8_neon: difference + 8x8 transform for each quadrant of a
 // 16x16 area, in the order top-left, top-right, bottom-left, bottom-right.
 //   x0 = output coefficients (advanced by each call),
 //   x1 = first pixel area (FENC_STRIDE pitch),
 //   x2 = second pixel area (FDEC_STRIDE pitch).
 // sub8x8_dct8_neon loads the pitches itself and leaves x7 untouched, so the
 // return address is parked there across the calls.
 function sub16x16_dct8_neon, export=1
    mov         x7,  x30
    // Top-left.
    bl          X(sub8x8_dct8_neon)
    // Back up eight rows and step eight columns right.
    sub         x2,  x2,  #FDEC_STRIDE*8 - 8
    sub         x1,  x1,  #FENC_STRIDE*8 - 8
    // Top-right.
    bl          X(sub8x8_dct8_neon)
    // Already eight rows down; step eight columns left.
    sub         x2,  x2,  #8
    sub         x1,  x1,  #8
    // Bottom-left.
    bl          X(sub8x8_dct8_neon)
    // Back up eight rows and step eight columns right.
    sub         x2,  x2,  #FDEC_STRIDE*8 - 8
    sub         x1,  x1,  #FENC_STRIDE*8 - 8
    // Bottom-right as a tail call with the original return address.
    mov         x30, x7
    b           X(sub8x8_dct8_neon)
 endfunc
 // First part of a 4-point IDCT pass; callers finish it with two SUMSUB_AB.
 //   d4 = d0 + d2          d5 = d0 - d2   (via SUMSUB_AB)
 //   d6 = d1 + (d3>>1)     d7 = (d1>>1) - d3
 // d4-d7 are outputs, d0-d3 inputs; the shifts are arithmetic.
 .macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
    SUMSUB_AB   \d4, \d5, \d0, \d2
    sshr        \d7, \d1, #1
    sshr        \d6, \d3, #1
    sub         \d7, \d7, \d3
    add         \d6, \d6, \d1
 .endm
 // add4x4_idct_neon: inverse 4x4 transform of the 16 int16 coefficients at
 // [x1], added to a 4x4 block of bytes at [x0] (row pitch FDEC_STRIDE) and
 // written back with unsigned saturation.
 //   x0 = pixel block (read and written), x1 = coefficients.
 // x0 ends up four rows past its entry value.  Clobbers x2, v0-v7, v16-v19,
 // v28-v31.
 function add4x4_idct_neon, export=1
    mov         x2, #FDEC_STRIDE
    ld1         {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
    // First 1-D pass; the pixel loads are interleaved with the arithmetic.
    IDCT_1D     v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
    ld1         {v28.s}[0], [x0], x2
    SUMSUB_AB   v0.4h, v2.4h, v4.4h, v6.4h
    SUMSUB_AB   v1.4h, v3.4h, v5.4h, v7.4h
    // Note the v3/v2 order: from here on v3 is the third row and v2 the
    // fourth, which the loads, adds and stores below follow.
    transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19
    // Second 1-D pass.
    IDCT_1D     v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v3.4h, v2.4h
    ld1         {v29.s}[0], [x0], x2
    SUMSUB_AB   v0.4h, v2.4h, v4.4h, v6.4h
    SUMSUB_AB   v1.4h, v3.4h, v5.4h, v7.4h
    // Rounding shift right by 6.
    srshr       v0.4h,  v0.4h,  #6
    srshr       v1.4h,  v1.4h,  #6
    // Pixel row 2 goes to v31 (paired with v3), row 3 to v30 (paired with v2).
    ld1         {v31.s}[0], [x0], x2
    srshr       v2.4h,  v2.4h,  #6
    srshr       v3.4h,  v3.4h,  #6
    ld1         {v30.s}[0], [x0], x2
    // Rewind x0 to the first row for the stores.
    sub         x0,  x0,  x2,  lsl #2
    // residual + pixels (bytes widened), then saturate to unsigned bytes.
    uaddw       v0.8h,  v0.8h,  v28.8b
    uaddw       v1.8h,  v1.8h,  v29.8b
    uaddw       v2.8h,  v2.8h,  v30.8b
    uaddw       v3.8h,  v3.8h,  v31.8b
    sqxtun      v0.8b,  v0.8h
    sqxtun      v1.8b,  v1.8h
    sqxtun      v2.8b,  v2.8h
    sqxtun      v3.8b,  v3.8h
    // Four bytes per row; register order v0, v1, v3, v2 = rows 0..3.
    st1         {v0.s}[0], [x0], x2
    st1         {v1.s}[0], [x0], x2
    st1         {v3.s}[0], [x0], x2
    st1         {v2.s}[0], [x0], x2
    ret
 endfunc
 // add8x4_idct_neon: inverse 4x4 transform of 32 int16 coefficients at [x1]
 // (64 bytes, presumably two consecutive 4x4 blocks - confirm against the
 // callers), added to an 8x4 area of bytes at [x0] with unsigned saturation.
 // Register contract:
 //   x0 = pixel area (read and written); ends four rows further down
 //   x1 = coefficients; advanced by 64 bytes
 //   x2 = pixel row pitch.  It is NOT set here; the callers in this file
 //        load FDEC_STRIDE into it first.
 // Only x0-x2 and vector registers (v0-v7, v16-v23, v28-v31) are touched, so
 // callers may keep their return address in x5.
 function add8x4_idct_neon, export=1
    ld1         {v0.8h,v1.8h}, [x1], #32
    ld1         {v2.8h,v3.8h}, [x1], #32
    // Regroup 64-bit halves so that each register holds one row across the
    // whole eight-lane width.
    transpose   v20.2d, v21.2d, v0.2d, v2.2d
    transpose   v22.2d, v23.2d, v1.2d, v3.2d
    // First 1-D pass.
    IDCT_1D     v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
    SUMSUB_AB   v0.8h,  v3.8h,  v16.8h, v18.8h
    SUMSUB_AB   v1.8h,  v2.8h,  v17.8h, v19.8h
    transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7
    // Second 1-D pass.
    IDCT_1D     v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h
    SUMSUB_AB   v0.8h,  v3.8h,  v16.8h, v18.8h
    SUMSUB_AB   v1.8h,  v2.8h,  v17.8h, v19.8h
    // Rounding shift right by 6, interleaved with the pixel loads.
    srshr       v0.8h,  v0.8h,  #6
    ld1         {v28.8b}, [x0], x2
    srshr       v1.8h,  v1.8h,  #6
    ld1         {v29.8b}, [x0], x2
    srshr       v2.8h,  v2.8h,  #6
    ld1         {v30.8b}, [x0], x2
    srshr       v3.8h,  v3.8h,  #6
    ld1         {v31.8b}, [x0], x2
    // Rewind x0 to the first row for the stores.
    sub         x0,  x0,  x2,  lsl #2
    // residual + pixels (bytes widened), then saturate to unsigned bytes.
    uaddw       v0.8h,  v0.8h,  v28.8b
    uaddw       v1.8h,  v1.8h,  v29.8b
    uaddw       v2.8h,  v2.8h,  v30.8b
    uaddw       v3.8h,  v3.8h,  v31.8b
    sqxtun      v0.8b,  v0.8h
    sqxtun      v1.8b,  v1.8h
    st1         {v0.8b}, [x0], x2
    sqxtun      v2.8b,  v2.8h
    st1         {v1.8b}, [x0], x2
    sqxtun      v3.8b,  v3.8h
    st1         {v2.8b}, [x0], x2
    st1         {v3.8b}, [x0], x2
    ret
 endfunc
 // add8x8_idct_neon: inverse 4x4 transforms added to an 8x8 pixel area, done
 // as two stacked 8x4 strips by add8x4_idct_neon.
 //   x0 = pixel area (FDEC_STRIDE pitch), x1 = coefficients.
 // The helper advances x0 and x1 itself, so no fix-up is needed in between.
 function add8x8_idct_neon, export=1
    // bl overwrites x30; x5 is left alone by the helper, so park it there.
    mov             x5,  x30
    // Row pitch expected by the helper in x2.
    mov             x2, #FDEC_STRIDE
    bl              X(add8x4_idct_neon)
    // Lower strip is a tail call: its ret returns directly to our caller.
    mov             x30, x5
    b               X(add8x4_idct_neon)
 endfunc
 // add16x16_idct_neon: inverse 4x4 transforms added to a 16x16 pixel area,
 // done as eight 8x4 strips.  The 8x8 quadrants are visited in the order
 // top-left, top-right, bottom-left, bottom-right; x1 just keeps advancing.
 //   x0 = pixel area (FDEC_STRIDE pitch), x1 = coefficients.
 function add16x16_idct_neon, export=1
    // bl overwrites x30; x5 is left alone by the helper, so park it there.
    mov             x5,  x30
    // Row pitch expected by the helper in x2.
    mov             x2, #FDEC_STRIDE
    // Top-left 8x8.
    bl              X(add8x4_idct_neon)
    bl              X(add8x4_idct_neon)
    // Back up eight rows and step eight columns right.
    sub             x0, x0, #8*FDEC_STRIDE-8
    // Top-right 8x8.
    bl              X(add8x4_idct_neon)
    bl              X(add8x4_idct_neon)
    // Already eight rows down; step eight columns left.
    sub             x0, x0, #8
    // Bottom-left 8x8.
    bl              X(add8x4_idct_neon)
    bl              X(add8x4_idct_neon)
    // Back up eight rows and step eight columns right.
    sub             x0, x0, #8*FDEC_STRIDE-8
    // Bottom-right 8x8; the final strip is a tail call.
    bl              X(add8x4_idct_neon)
    mov             x30, x5
    b               X(add8x4_idct_neon)
 endfunc
 // IDCT8_1D: one 1-D pass of the 8-point inverse transform on fixed registers.
 //   in : v16-v23 (eight 16-bit lanes each)
 //   out: v16-v23 (overwritten)
 //   scratch: v0-v3
 // With type == row the last two input rows (v22, v23) are not expected to be
 // loaded yet: the macro fetches them from [x1] (post-incremented by 32)
 // after the first butterfly.  With any other type all eight inputs must
 // already be in registers.
 // Several macro calls below deliberately reuse an input register as an
 // output or temporary, so the statement order matters.
 .macro IDCT8_1D type
    SUMSUB_AB   v0.8h,  v1.8h,  v16.8h, v20.8h          // a0/a2
 .ifc \type, row
    ld1         {v22.8h,v23.8h}, [x1], #32
 .endif
    SUMSUB_SHR  1, v2.8h,  v3.8h,  v18.8h, v22.8h, v16.8h, v20.8h   // a6/a4
    SUMSUB_AB   v16.8h, v18.8h, v21.8h, v19.8h
    SUMSUB_15   v16.8h, v18.8h, v17.8h, v23.8h, v20.8h, v22.8h      // a7/a1
    SUMSUB_AB   v22.8h, v23.8h, v23.8h, v17.8h
    SUMSUB_15   v23.8h, v22.8h, v21.8h, v19.8h, v20.8h, v17.8h      // a5/a3
    SUMSUB_SHR  2, v21.8h, v22.8h, v22.8h, v23.8h, v19.8h, v17.8h   // b3/b5
    SUMSUB_SHR2 2, v20.8h, v23.8h, v16.8h, v18.8h, v19.8h, v17.8h   // b1/b7
    SUMSUB_AB   v18.8h, v2.8h,  v0.8h,  v2.8h           // b0/b6
    SUMSUB_AB   v19.8h, v3.8h,  v1.8h,  v3.8h           // b2/b4
    // Final butterflies into the output registers v16-v23.
    SUMSUB_AB   v16.8h, v23.8h, v18.8h, v23.8h
    SUMSUB_AB   v17.8h, v22.8h, v19.8h, v22.8h
    SUMSUB_AB   v18.8h, v21.8h, v3.8h,  v21.8h
    SUMSUB_AB   v19.8h, v20.8h, v2.8h,  v20.8h
 .endm
 // add8x8_idct8_neon: 8x8 inverse transform of the 64 int16 coefficients at
 // [x1], added to an 8x8 block of bytes at [x0] (row pitch FDEC_STRIDE) and
 // written back with unsigned saturation.
 //   x0 = pixel block (read and written); ends eight rows further down
 //   x1 = coefficients; advanced by 128 bytes
 // Clobbers x2, v0-v7, v16-v23, v30, v31; no other general-purpose register
 // is touched.
 function add8x8_idct8_neon, export=1
    mov         x2,  #FDEC_STRIDE
    // Rows 0-5 only; the row variant of IDCT8_1D loads rows 6 and 7 itself.
    ld1         {v16.8h,v17.8h}, [x1], #32
    ld1         {v18.8h,v19.8h}, [x1], #32
    ld1         {v20.8h,v21.8h}, [x1], #32
    IDCT8_1D    row
    transpose8x8.h v16, v17, v18, v19, v20, v21, v22, v23, v30, v31
    IDCT8_1D    col
    // Rounding shift right by 6, interleaved with the eight pixel row loads.
    ld1         {v0.8b}, [x0], x2
    srshr       v16.8h, v16.8h, #6
    ld1         {v1.8b}, [x0], x2
    srshr       v17.8h, v17.8h, #6
    ld1         {v2.8b}, [x0], x2
    srshr       v18.8h, v18.8h, #6
    ld1         {v3.8b}, [x0], x2
    srshr       v19.8h, v19.8h, #6
    ld1         {v4.8b}, [x0], x2
    srshr       v20.8h, v20.8h, #6
    ld1         {v5.8b}, [x0], x2
    srshr       v21.8h, v21.8h, #6
    ld1         {v6.8b}, [x0], x2
    srshr       v22.8h, v22.8h, #6
    ld1         {v7.8b}, [x0], x2
    srshr       v23.8h, v23.8h, #6
    // Rewind x0 by eight rows for the stores.
    sub         x0,  x0,  x2,  lsl #3
    // residual + pixels (uaddw), saturate to unsigned bytes (sqxtun) and
    // store, with the three steps of different rows interleaved.
    uaddw       v16.8h, v16.8h, v0.8b
    uaddw       v17.8h, v17.8h, v1.8b
    uaddw       v18.8h, v18.8h, v2.8b
    sqxtun      v0.8b,  v16.8h
    sqxtun      v1.8b,  v17.8h
    sqxtun      v2.8b,  v18.8h
    uaddw       v19.8h, v19.8h, v3.8b
    st1         {v0.8b}, [x0], x2
    uaddw       v20.8h, v20.8h, v4.8b
    st1         {v1.8b}, [x0], x2
    uaddw       v21.8h, v21.8h, v5.8b
    st1         {v2.8b}, [x0], x2
    sqxtun      v3.8b,  v19.8h
    sqxtun      v4.8b,  v20.8h
    uaddw       v22.8h, v22.8h, v6.8b
    uaddw       v23.8h, v23.8h, v7.8b
    st1         {v3.8b}, [x0], x2
    sqxtun      v5.8b,  v21.8h
    st1         {v4.8b}, [x0], x2
    sqxtun      v6.8b,  v22.8h
    sqxtun      v7.8b,  v23.8h
    st1         {v5.8b}, [x0], x2
    st1         {v6.8b}, [x0], x2
    st1         {v7.8b}, [x0], x2
    ret
 endfunc
 // add16x16_idct8_neon: 8x8 inverse transform added to each quadrant of a
 // 16x16 pixel area, in the order top-left, top-right, bottom-left,
 // bottom-right.
 //   x0 = pixel area (FDEC_STRIDE pitch), x1 = coefficients (advanced by each
 //   call).
 // add8x8_idct8_neon leaves x7 untouched, so the return address is parked
 // there across the calls.
 function add16x16_idct8_neon, export=1
    mov             x7,  x30
    // Top-left.
    bl              X(add8x8_idct8_neon)
    // Back up eight rows and step eight columns right.
    sub             x0,  x0,  #8*FDEC_STRIDE-8
    // Top-right.
    bl              X(add8x8_idct8_neon)
    // Already eight rows down; step eight columns left.
    sub             x0,  x0,  #8
    // Bottom-left.
    bl              X(add8x8_idct8_neon)
    // Bottom-right as a tail call with the original return address.
    mov             x30, x7
    sub             x0,  x0,  #8*FDEC_STRIDE-8
    b               X(add8x8_idct8_neon)
 endfunc
 // add8x8_idct_dc_neon: DC-only reconstruction of an 8x8 pixel block.  Four
 // int16 DC values are read from [x1], each is rounded-shifted right by 6 and
 // added with saturation to one 4x4 quadrant of the block at [x0].
 //   x0 = pixel block (row pitch FDEC_STRIDE), read and written
 //   x1 = four DC values: [0],[1] for the left/right quadrants of rows 0-3,
 //        [2],[3] for rows 4-7
 // x0 ends up eight rows further down.  Clobbers x2, v0-v7, v16, v20-v23.
 function add8x8_idct_dc_neon, export=1
    mov         x2,  #FDEC_STRIDE
    ld1         {v16.4h}, [x1]
    ld1         {v0.8b}, [x0], x2
    srshr       v16.4h, v16.4h, #6
    ld1         {v1.8b}, [x0], x2
    // Broadcast each DC, then pair them up: v20 = {dc0 x4, dc1 x4} and
    // v21 = {dc2 x4, dc3 x4}.
    dup         v20.8h, v16.h[0]
    dup         v21.8h, v16.h[1]
    ld1         {v2.8b}, [x0], x2
    dup         v22.8h, v16.h[2]
    dup         v23.8h, v16.h[3]
    ld1         {v3.8b}, [x0], x2
    trn1        v20.2d, v20.2d,  v21.2d
    ld1         {v4.8b}, [x0], x2
    trn1        v21.2d, v22.2d,  v23.2d
    ld1         {v5.8b}, [x0], x2
    // Negated copies, used to handle negative DC values.
    neg         v22.8h, v20.8h
    ld1         {v6.8b}, [x0], x2
    neg         v23.8h, v21.8h
    ld1         {v7.8b}, [x0], x2
    // Rewind x0 for the stores.
    sub         x0,  x0,  #8*FDEC_STRIDE
    // Saturating narrow to unsigned bytes: v20/v21 keep max(dc, 0) and
    // v22/v23 keep max(-dc, 0), so per lane at most one of each pair is
    // non-zero.
    sqxtun      v20.8b,  v20.8h
    sqxtun      v21.8b,  v21.8h
    sqxtun      v22.8b,  v22.8h
    sqxtun      v23.8b,  v23.8h
    // pixel = sat(pixel + positive part) ...
    uqadd       v0.8b,  v0.8b,  v20.8b
    uqadd       v1.8b,  v1.8b,  v20.8b
    uqadd       v2.8b,  v2.8b,  v20.8b
    uqadd       v3.8b,  v3.8b,  v20.8b
    uqadd       v4.8b,  v4.8b,  v21.8b
    uqadd       v5.8b,  v5.8b,  v21.8b
    uqadd       v6.8b,  v6.8b,  v21.8b
    uqadd       v7.8b,  v7.8b,  v21.8b
    // ... then pixel = sat(pixel - negative part).
    uqsub       v0.8b,  v0.8b,  v22.8b
    uqsub       v1.8b,  v1.8b,  v22.8b
    uqsub       v2.8b,  v2.8b,  v22.8b
    uqsub       v3.8b,  v3.8b,  v22.8b
    uqsub       v4.8b,  v4.8b,  v23.8b
    uqsub       v5.8b,  v5.8b,  v23.8b
    uqsub       v6.8b,  v6.8b,  v23.8b
    uqsub       v7.8b,  v7.8b,  v23.8b
    st1         {v0.8b}, [x0], x2
    st1         {v1.8b}, [x0], x2
    st1         {v2.8b}, [x0], x2
    st1         {v3.8b}, [x0], x2
    st1         {v4.8b}, [x0], x2
    st1         {v5.8b}, [x0], x2
    st1         {v6.8b}, [x0], x2
    st1         {v7.8b}, [x0], x2
    ret
 endfunc
 // ADD16x4_IDCT_DC: add four DC values to a 16x4 strip of pixels with
 // saturation, one DC per 4-pixel-wide column group.
 //   dc = halfword view of a register holding the four (already shifted) DC
 //        values, e.g. v0.h; lanes [0]..[3] are used left to right
 //   x0 = load pointer, x2 = store pointer, x3 = row pitch; both pointers
 //        are advanced by four rows
 // Clobbers v4-v7, v20, v21, v24-v27.
 .macro ADD16x4_IDCT_DC dc
    ld1         {v4.16b}, [x0], x3
    // Broadcast each DC, then pair them: v24 = {dc0 x4, dc1 x4} and
    // v25 = {dc2 x4, dc3 x4}.
    dup         v24.8h,  \dc[0]
    dup         v25.8h,  \dc[1]
    ld1         {v5.16b}, [x0], x3
    dup         v26.8h,  \dc[2]
    dup         v27.8h,  \dc[3]
    ld1         {v6.16b}, [x0], x3
    trn1        v24.2d,  v24.2d,  v25.2d
    ld1         {v7.16b}, [x0], x3
    trn1        v25.2d,  v26.2d,  v27.2d
    // Negated copies, used to handle negative DC values.
    neg         v26.8h,  v24.8h
    neg         v27.8h,  v25.8h
    // v20 = 16 bytes of max(dc, 0), v21 = 16 bytes of max(-dc, 0).
    sqxtun      v20.8b,  v24.8h
    sqxtun      v21.8b,  v26.8h
    sqxtun2     v20.16b, v25.8h
    sqxtun2     v21.16b, v27.8h
    // pixel = sat(sat(pixel + positive part) - negative part)
    uqadd       v4.16b, v4.16b, v20.16b
    uqadd       v5.16b, v5.16b, v20.16b
    uqadd       v6.16b, v6.16b, v20.16b
    uqadd       v7.16b, v7.16b, v20.16b
    uqsub       v4.16b, v4.16b, v21.16b
    uqsub       v5.16b, v5.16b, v21.16b
    uqsub       v6.16b, v6.16b, v21.16b
    st1         {v4.16b}, [x2], x3
    uqsub       v7.16b, v7.16b, v21.16b
    st1         {v5.16b}, [x2], x3
    st1         {v6.16b}, [x2], x3
    st1         {v7.16b}, [x2], x3
 .endm
 // add16x16_idct_dc_neon: DC-only reconstruction of a 16x16 pixel block.
 // Sixteen int16 DC values are read from [x1], each is rounded-shifted right
 // by 6 and added with saturation to one 4x4 sub-block, four strips of 16x4
 // at a time.
 //   x0 = pixel block (row pitch FDEC_STRIDE), read and written
 //   x1 = 16 DC values, four per 16x4 strip
 // Clobbers x2, x3, v0-v7, v20, v21, v24-v27.
 function add16x16_idct_dc_neon, export=1
    // x2 is the store pointer; it trails the load pointer x0 by one strip.
    mov         x2,  x0
    mov         x3,  #FDEC_STRIDE
    ld1         {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
    // The shifts for later strips are interleaved with the earlier strips.
    srshr       v0.4h,  v0.4h,  #6
    srshr       v1.4h,  v1.4h,  #6
    ADD16x4_IDCT_DC v0.h
    srshr       v2.4h,  v2.4h,  #6
    ADD16x4_IDCT_DC v1.h
    srshr       v3.4h,  v3.4h,  #6
    ADD16x4_IDCT_DC v2.h
    ADD16x4_IDCT_DC v3.h
    ret
 endfunc
 // sub4x4x2_dct_dc: per-column sums of the pixel differences over an 8x4
 // area (two 4x4 blocks side by side).
 //   dst    = result register name (no size suffix); receives eight 16-bit
 //            column sums of (x1 pixels - x2 pixels) over four rows
 //   t0..t7 = scratch register names (no size suffix), clobbered
 //   x1, x3 = first pixel area and its row pitch
 //   x2, x4 = second pixel area and its row pitch
 // x1 and x2 are advanced by four rows.
 .macro sub4x4x2_dct_dc, dst, t0, t1, t2, t3, t4, t5, t6, t7
    ld1         {\t0\().8b}, [x1], x3
    ld1         {\t1\().8b}, [x2], x4
    ld1         {\t2\().8b}, [x1], x3
    ld1         {\t3\().8b}, [x2], x4
    // Row 0 difference (reuses t0 as a 16-bit result).
    usubl       \t0\().8h,  \t0\().8b,  \t1\().8b
    ld1         {\t4\().8b}, [x1], x3
    ld1         {\t5\().8b}, [x2], x4
    // Row 1 difference.
    usubl       \t1\().8h,  \t2\().8b,  \t3\().8b
    ld1         {\t6\().8b}, [x1], x3
    ld1         {\t7\().8b}, [x2], x4
    add         \dst\().8h, \t0\().8h,  \t1\().8h
    // Rows 2 and 3, accumulated into dst.
    usubl       \t2\().8h,  \t4\().8b,  \t5\().8b
    usubl       \t3\().8h,  \t6\().8b,  \t7\().8b
    add         \dst\().8h, \dst\().8h, \t2\().8h
    add         \dst\().8h, \dst\().8h, \t3\().8h
 .endm
 // sub8x8_dct_dc_neon: DC-only transform of the difference between two 8x8
 // pixel blocks.  Column sums of the upper and lower 8x4 halves are combined
 // through two butterfly stages and reduced to four int16 values.
 //   x0 = output, four int16
 //   x1 = first pixel block, row pitch FENC_STRIDE
 //   x2 = second pixel block, row pitch FDEC_STRIDE (subtracted from x1)
 // x1 and x2 are advanced by eight rows.  Clobbers x3, x4, v0-v3, v16-v31.
 function sub8x8_dct_dc_neon, export=1
    mov         x3,  #FENC_STRIDE
    mov         x4,  #FDEC_STRIDE
    // v0 = column sums of rows 0-3, v1 = column sums of rows 4-7.
    sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23
    sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31
    // Two butterfly stages, regrouping 64-bit halves between them.
    transpose   v2.2d,  v3.2d,  v0.2d,  v1.2d
    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h
    transpose   v2.2d,  v3.2d,  v0.2d,  v1.2d
    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h
    transpose   v2.2d,  v3.2d,  v0.2d,  v1.2d
    // Two pairwise-add rounds sum each group of four lanes into one value.
    addp        v0.8h,  v2.8h,  v3.8h
    addp        v0.8h,  v0.8h,  v0.8h
    st1         {v0.4h}, [x0]
    ret
 endfunc
 // sub8x16_dct_dc_neon: DC-only transform of the difference between two
 // 8-wide, 16-high pixel blocks.  Column sums of the four 8x4 strips are
 // combined through three butterfly stages and reduced to eight int16 values.
 //   x0 = output, eight int16
 //   x1 = first pixel block, row pitch FENC_STRIDE
 //   x2 = second pixel block, row pitch FDEC_STRIDE (subtracted from x1)
 // x1 and x2 are advanced by sixteen rows.  Clobbers x3, x4, v0-v5, v16-v31.
 function sub8x16_dct_dc_neon, export=1
    mov         x3,  #FENC_STRIDE
    mov         x4,  #FDEC_STRIDE
    // v0..v3 = column sums of rows 0-3, 4-7, 8-11 and 12-15.
    sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23
    sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31
    sub4x4x2_dct_dc v2, v16, v17, v18, v19, v20, v21, v22, v23
    sub4x4x2_dct_dc v3, v24, v25, v26, v27, v28, v29, v30, v31
    // First pairwise reduction: adjacent columns are summed.
    addp        v4.8h,  v0.8h,  v2.8h
    addp        v5.8h,  v1.8h,  v3.8h
    // Three butterfly stages with regrouping in between.
    transpose   v2.4s,  v3.4s,  v4.4s,  v5.4s
    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h
    transpose   v2.4s,  v3.4s,  v0.4s,  v1.4s
    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h
    transpose   v2.2d,  v3.2d,  v0.2d,  v1.2d
    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h
    // Note the operand order of trn2 (v1 before v0); it fixes the order in
    // which the results are written out.
    trn1        v2.2d,  v0.2d,  v1.2d
    trn2        v3.2d,  v1.2d,  v0.2d
    // Final pairwise reduction down to eight values.
    addp        v0.8h,  v2.8h,  v3.8h
    st1         {v0.8h}, [x0]
    ret
 endfunc
 // zigzag_interleave_8x8_cavlc_neon: split 64 int16 coefficients into four
 // groups of 16 by taking every fourth coefficient, and record for each group
 // whether it contains any non-zero value.
 //   x0 = output, 64 int16 (group g = input coefficients g, g+4, g+8, ...)
 //   x1 = input, 64 int16
 //   x2 = byte flags, written at offsets 0, 1, 8 and 9 (1 = group has a
 //        non-zero coefficient, 0 = all zero)
 // x0 and x1 are advanced by 128 bytes.  Clobbers x2, x3, v0-v7, v16-v19, v31.
 function zigzag_interleave_8x8_cavlc_neon, export=1
    // Post-index step that takes x2 from offset 1 to offset 8.
    mov         x3,  #7
    movi        v31.4s, #1
    // ld4 de-interleaves: v0/v4 get coefficients 0,4,8,..., v1/v5 get
    // 1,5,9,..., and so on.
    ld4         {v0.8h,v1.8h,v2.8h,v3.8h}, [x1],  #64
    ld4         {v4.8h,v5.8h,v6.8h,v7.8h}, [x1],  #64
    // Unsigned max is used as an OR-like reduction: the result is non-zero
    // exactly when some input lane is non-zero.
    umax        v16.8h, v0.8h,  v4.8h
    umax        v17.8h, v1.8h,  v5.8h
    umax        v18.8h, v2.8h,  v6.8h
    umax        v19.8h, v3.8h,  v7.8h
    st1         {v0.8h}, [x0],  #16
    st1         {v4.8h}, [x0],  #16
    umaxp       v16.8h, v16.8h, v17.8h
    umaxp       v18.8h, v18.8h, v19.8h
    st1         {v1.8h}, [x0],  #16
    st1         {v5.8h}, [x0],  #16
    // After this, 32-bit lane g of v16 covers group g.
    umaxp       v16.8h, v16.8h, v18.8h
    st1         {v2.8h}, [x0],  #16
    st1         {v6.8h}, [x0],  #16
    // lane >= 1 gives all-ones; the and with 1 reduces that to 0 or 1.
    cmhs        v16.4s, v16.4s, v31.4s
    st1         {v3.8h}, [x0],  #16
    and         v16.16b, v16.16b, v31.16b
    st1         {v7.8h}, [x0],  #16
    // Low byte of each 32-bit lane goes to x2[0], x2[1], x2[8], x2[9].
    st1         {v16.b}[0],    [x2],  #1
    st1         {v16.b}[4],    [x2],  x3
    st1         {v16.b}[8],    [x2],  #1
    st1         {v16.b}[12],   [x2]
    ret
 endfunc
 // zigzag_scan_4x4_frame_neon: reorder 16 int16 coefficients according to
 // the scan4x4_frame table.
 //   x0 = output, 16 int16     x1 = input, 16 int16
 // Clobbers x2, v0-v3, v16, v17.
 function zigzag_scan_4x4_frame_neon, export=1
    movrel      x2, scan4x4_frame
    ld1         {v0.16b,v1.16b}, [x1]
    // v16/v17 = byte indices for the low/high half of the output.
    ld1         {v16.16b,v17.16b}, [x2]
    tbl         v2.16b, {v0.16b,v1.16b}, v16.16b
    tbl         v3.16b, {v0.16b,v1.16b}, v17.16b
    st1         {v2.16b,v3.16b},   [x0]
    ret
 endfunc
 // zigzag_sub_4x4: generates zigzag_sub_4x4[ac]_<f>_neon.
 //   f  = field or frame; selects the sub4x4_<f> permutation table
 //   ac = empty, or "ac" to emit the variant that splits off the first
 //        coefficient
 // Generated function:
 //   x0 = output, 16 int16: permuted byte differences (x1 - x2 pixels)
 //   x1 = first 4x4 pixel block, row pitch FENC_STRIDE
 //   x2 = second 4x4 pixel block, row pitch FDEC_STRIDE; after being read it
 //        is overwritten with the x1 pixels
 //   x3 = (ac variant only) receives the first coefficient as an int16, and
 //        that coefficient is stored as zero in the output
 //   returns w0 = 1 if any stored coefficient is non-zero, else 0
 // Clobbers x1, x2, x4-x7, x9, v0-v7, v16.
 .macro zigzag_sub_4x4 f ac
 function zigzag_sub_4x4\ac\()_\f\()_neon, export=1
    mov         x9,  #FENC_STRIDE
    mov         x4,  #FDEC_STRIDE
    movrel      x5,  sub4x4_\f
    // Keep the start of the second block for the copy-back stores.
    mov         x6,  x2
    // Gather each 4x4 block into one 16-byte register, one row per lane.
    ld1         {v0.s}[0], [x1], x9
    ld1         {v0.s}[1], [x1], x9
    ld1         {v0.s}[2], [x1], x9
    ld1         {v0.s}[3], [x1], x9
    ld1         {v16.16b}, [x5]
    ld1         {v1.s}[0], [x2], x4
    ld1         {v1.s}[1], [x2], x4
    ld1         {v1.s}[2], [x2], x4
    ld1         {v1.s}[3], [x2], x4
    // Apply the scan permutation to both blocks before subtracting.
    tbl         v2.16b, {v0.16b}, v16.16b
    tbl         v3.16b, {v1.16b}, v16.16b
    // The unpermuted first block is copied over the second block, row by
    // row, interleaved with the arithmetic below.
    st1         {v0.s}[0], [x6], x4
    usubl       v4.8h,  v2.8b,  v3.8b
 .ifc \ac, ac
    // Extract the first coefficient, clear it in the vector, store it.
    dup         h7, v4.h[0]
    ins         v4.h[0], wzr
    fmov        w5,  s7
    strh        w5,  [x3]
 .endif
    usubl2      v5.8h,  v2.16b, v3.16b
    st1         {v0.s}[1], [x6], x4
    // Unsigned max across all 16 coefficients is non-zero exactly when some
    // coefficient is non-zero.
    umax        v6.8h,  v4.8h,  v5.8h
    umaxv       h6,  v6.8h
    st1         {v0.s}[2], [x6], x4
    fmov        w7,  s6
    st1         {v0.s}[3], [x6], x4
    cmp         w7, #0
    st1         {v4.8h,v5.8h},   [x0]
    cset        w0, ne
    ret
 endfunc
 .endm
 // Instantiate the four variants.
 zigzag_sub_4x4 field
 zigzag_sub_4x4 field, ac
 zigzag_sub_4x4 frame
 zigzag_sub_4x4 frame, ac
 // zigzag_scan_4x4_field_neon: reorder 16 int16 coefficients according to
 // the scan4x4_field table.  Only the first eight coefficients move; the
 // second eight are stored as loaded.
 //   x0 = output, 16 int16     x1 = input, 16 int16
 // Clobbers x2, v0, v1, v16.
 function zigzag_scan_4x4_field_neon, export=1
    movrel      x2, scan4x4_field
    ld1         {v0.8h,v1.8h},   [x1]
    ld1         {v16.16b},       [x2]
    tbl         v0.16b, {v0.16b}, v16.16b
    st1         {v0.8h,v1.8h},   [x0]
    ret
 endfunc
 // zigzag_scan_8x8_frame_neon: reorder 64 int16 coefficients according to
 // the scan8x8_frame table.
 //   x0 = output, 64 int16 (advanced by 96 bytes)
 //   x1 = input, 64 int16 (advanced by 96 bytes)
 // Clobbers x2, v0-v7, v16-v31.
 // A single tbl can only look at four source registers (64 bytes), so each
 // output register uses the four-register window that covers most of its
 // coefficients; the few lanes whose source lies outside that window come
 // out of tbl as zero and are patched afterwards with lane moves.
 function zigzag_scan_8x8_frame_neon, export=1
    movrel      x2,  scan8x8_frame
    // v0-v7 = the eight input rows.
    ld1         {v0.8h,v1.8h},   [x1], #32
    ld1         {v2.8h,v3.8h},   [x1], #32
    ld1         {v4.8h,v5.8h},   [x1], #32
    ld1         {v6.8h,v7.8h},   [x1]
    // v16-v23 = one index vector per output register.
    ld1         {v16.16b,v17.16b}, [x2], #32
    ld1         {v18.16b,v19.16b}, [x2], #32
    ld1         {v20.16b,v21.16b}, [x2], #32
    ld1         {v22.16b,v23.16b}, [x2], #32
    tbl         v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
    tbl         v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
    tbl         v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
    tbl         v27.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v19.16b
    tbl         v28.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v20.16b
    tbl         v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v21.16b
    tbl         v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v22.16b
    tbl         v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v23.16b
    // Patch the lanes whose source row was outside the tbl window.
    mov         v25.h[6], v4.h[0]
    mov         v25.h[7], v5.h[0]
    mov         v26.h[0], v4.h[1]
    mov         v27.h[4], v7.h[0]
    mov         v28.h[7], v4.h[4]
    mov         v29.h[7], v3.h[6]
    mov         v30.h[0], v2.h[7]
    mov         v30.h[1], v3.h[7]
    st1         {v24.8h,v25.8h}, [x0], #32
    st1         {v26.8h,v27.8h}, [x0], #32
    st1         {v28.8h,v29.8h}, [x0], #32
    st1         {v30.8h,v31.8h}, [x0]
    ret
 endfunc
 // Helpers for the 8x8 scan tables.
 //   Z(z)   expands to the two byte indices of 16-bit element z.
 //   T(x,y) expands to Z() of element x*8+y; x selects the source register
 //          (one register per group of eight coefficients) and y the lane.
 // T is redefined below so that x is counted from the first register of the
 // four-register tbl window used for that output register in
 // zigzag_scan_8x8_frame_neon (v0, v3, v0, v4).  Entries that fall outside
 // the 64-byte window make tbl return zero and are patched by the mov
 // instructions in that function.
 #define Z(z)   2*(z), 2*(z)+1
 #define T(x,y) Z(x*8+y)
 const scan8x8_frame, align=5
    // Index vectors for v24-v26 (window v0-v3).
    .byte T(0,0), T(1,0), T(0,1), T(0,2)
    .byte T(1,1), T(2,0), T(3,0), T(2,1)
    .byte T(1,2), T(0,3), T(0,4), T(1,3)
    .byte T(2,2), T(3,1), T(4,0), T(5,0)
    .byte T(4,1), T(3,2), T(2,3), T(1,4)
    .byte T(0,5), T(0,6), T(1,5), T(2,4)
 #undef T
 #define T(x,y) Z((x-3)*8+y)
    // Index vector for v27 (window v3-v6).
    .byte T(3,3), T(4,2), T(5,1), T(6,0)
    .byte T(7,0), T(6,1), T(5,2), T(4,3)
 #undef T
 #define T(x,y) Z((x-0)*8+y)
    // Index vector for v28 (window v0-v3).
    .byte T(3,4), T(2,5), T(1,6), T(0,7)
    .byte T(1,7), T(2,6), T(3,5), T(4,4)
 #undef T
 #define T(x,y) Z((x-4)*8+y)
    // Index vectors for v29-v31 (window v4-v7).
    .byte T(5,3), T(6,2), T(7,1), T(7,2)
    .byte T(6,3), T(5,4), T(4,5), T(3,6)
    .byte T(2,7), T(3,7), T(4,6), T(5,5)
    .byte T(6,4), T(7,3), T(7,4), T(6,5)
    .byte T(5,6), T(4,7), T(5,7), T(6,6)
    .byte T(7,5), T(7,6), T(6,7), T(7,7)
 endconst
 // zigzag_scan_8x8_field_neon: reorder 64 int16 coefficients according to
 // the scan8x8_field table.
 //   x0 = output, 64 int16 (advanced by 96 bytes)
 //   x1 = input, 64 int16 (advanced by 96 bytes)
 // Clobbers x2, v0-v7, v16-v22, v24-v31.
 // Each output register is produced by one tbl over a sliding window of
 // source registers; the last output register needs no table and is built
 // with two ext instructions instead.
 function zigzag_scan_8x8_field_neon, export=1
    movrel      x2,  scan8x8_field
    // v0-v7 = the eight input rows.
    ld1         {v0.8h,v1.8h},   [x1], #32
    ld1         {v2.8h,v3.8h},   [x1], #32
    ld1         {v4.8h,v5.8h},   [x1], #32
    ld1         {v6.8h,v7.8h},   [x1]
    // v16-v22 = seven index vectors (112 bytes of table).
    ld1         {v16.16b,v17.16b}, [x2], #32
    ld1         {v18.16b,v19.16b}, [x2], #32
    ld1         {v20.16b,v21.16b}, [x2], #32
    ld1         {v22.16b}, [x2]
    // Rotate v7 by two halfwords: v31 = v7.h[2..7], v7.h[0..1].
    ext         v31.16b, v7.16b, v7.16b, #4
    tbl         v24.16b, {v0.16b,v1.16b},               v16.16b
    tbl         v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
    tbl         v26.16b, {v1.16b,v2.16b,v3.16b,v4.16b}, v18.16b
    tbl         v27.16b, {v2.16b,v3.16b,v4.16b,v5.16b}, v19.16b
    tbl         v28.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v20.16b
    tbl         v29.16b, {v4.16b,v5.16b,v6.16b},        v21.16b
    tbl         v30.16b, {v5.16b,v6.16b,v7.16b},        v22.16b
    // v31 = v6.h[6], v6.h[7], v7.h[2..7]: the last eight output values.
    ext         v31.16b, v6.16b, v31.16b, #12
    st1         {v24.8h,v25.8h}, [x0], #32
    st1         {v26.8h,v27.8h}, [x0], #32
    st1         {v28.8h,v29.8h}, [x0], #32
    st1         {v30.8h,v31.8h}, [x0]
    ret
 endfunc
 // zigzag_sub8x8: generates zigzag_sub_8x8_<f>_neon.
 //   f = field or frame; selects the sub8x8_<f> permutation table
 // Generated function:
 //   x0 = output, 64 int16: permuted byte differences (x1 - x2 pixels)
 //   x1 = first 8x8 pixel block, row pitch FENC_STRIDE
 //   x2 = second 8x8 pixel block, row pitch FDEC_STRIDE; after being read it
 //        is overwritten with the x1 pixels
 //   returns w0 = 1 if any output coefficient is non-zero, else 0
 // Clobbers x0-x2, x4-x7, x9, v0-v7, v16-v31.
 .macro zigzag_sub8x8 f
 function zigzag_sub_8x8_\f\()_neon, export=1
    movrel      x4,  sub8x8_\f
    mov         x5,  #FENC_STRIDE
    mov         x6,  #FDEC_STRIDE
    // Keep the start of the second block for the copy-back stores.
    mov         x7,  x2
    // v0-v3 = first block, two 8-byte rows per register.
    ld1         {v0.d}[0], [x1], x5
    ld1         {v0.d}[1], [x1], x5
    ld1         {v1.d}[0], [x1], x5
    ld1         {v1.d}[1], [x1], x5
    ld1         {v2.d}[0], [x1], x5
    ld1         {v2.d}[1], [x1], x5
    ld1         {v3.d}[0], [x1], x5
    ld1         {v3.d}[1], [x1]
    // v4-v7 = second block, same layout.
    ld1         {v4.d}[0], [x2], x6
    ld1         {v4.d}[1], [x2], x6
    ld1         {v5.d}[0], [x2], x6
    ld1         {v5.d}[1], [x2], x6
    ld1         {v6.d}[0], [x2], x6
    ld1         {v6.d}[1], [x2], x6
    ld1         {v7.d}[0], [x2], x6
    ld1         {v7.d}[1], [x2]
    // v16-v19 = 64 byte indices (index = 8*row + column).
    ld1         {v16.16b,v17.16b}, [x4], #32
    ld1         {v18.16b,v19.16b}, [x4], #32
    // Apply the scan permutation to both blocks before subtracting.
    tbl         v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
    tbl         v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
    tbl         v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
    tbl         v27.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v19.16b
    tbl         v28.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v16.16b
    tbl         v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v17.16b
    tbl         v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v18.16b
    tbl         v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v19.16b
    // Widening differences; these overwrite v4-v7 and v16-v19.
    usubl       v4.8h,  v24.8b,  v28.8b
    usubl2      v5.8h,  v24.16b, v28.16b
    usubl       v6.8h,  v25.8b,  v29.8b
    usubl2      v7.8h,  v25.16b, v29.16b
    usubl       v16.8h, v26.8b,  v30.8b
    usubl2      v17.8h, v26.16b, v30.16b
    usubl       v18.8h, v27.8b,  v31.8b
    usubl2      v19.8h, v27.16b, v31.16b
    // Unsigned max across all 64 coefficients is non-zero exactly when some
    // coefficient is non-zero.
    umax        v20.8h, v4.8h,   v5.8h
    umax        v21.8h, v6.8h,   v7.8h
    umax        v22.8h, v16.8h,  v17.8h
    umax        v23.8h, v18.8h,  v19.8h
    umax        v20.8h, v20.8h,  v21.8h
    umax        v21.8h, v22.8h,  v23.8h
    umax        v20.8h, v20.8h,  v21.8h
    umaxv       h22,    v20.8h
    // Copy the unpermuted first block over the second block.
    st1         {v0.d}[0], [x7], x6
    st1         {v0.d}[1], [x7], x6
    st1         {v1.d}[0], [x7], x6
    st1         {v1.d}[1], [x7], x6
    st1         {v2.d}[0], [x7], x6
    st1         {v2.d}[1], [x7], x6
    st1         {v3.d}[0], [x7], x6
    st1         {v3.d}[1], [x7]
    st1         {v4.8h,v5.8h},   [x0], #32
    st1         {v6.8h,v7.8h},   [x0], #32
    st1         {v16.8h,v17.8h}, [x0], #32
    st1         {v18.8h,v19.8h}, [x0]
    fmov        w9,  s22
    cmp         w9, #0
    cset        w0, ne
    ret
 endfunc
 .endm
 // Instantiate both variants.
 zigzag_sub8x8 field
 zigzag_sub8x8 frame
 // tbl index vectors for zigzag_scan_8x8_field_neon, one 16-byte vector per
 // output register v24-v30 (the eighth output register is built with ext and
 // has no table entry).  T is redefined before each vector so that x is
 // counted from the first register of that tbl's source window
 // (v0, v0, v1, v2, v3, v4, v5).
 #undef T
 #define T(x,y) Z(x*8+y)
 const scan8x8_field, align=5
    // Vectors for v24 and v25 (windows start at v0).
    .byte T(0,0), T(0,1), T(0,2), T(1,0)
    .byte T(1,1), T(0,3), T(0,4), T(1,2)
    .byte T(2,0), T(1,3), T(0,5), T(0,6)
    .byte T(0,7), T(1,4), T(2,1), T(3,0)
 #undef T
 #define T(x,y) Z((x-1)*8+y)
    // Vector for v26 (window starts at v1).
    .byte T(2,2), T(1,5), T(1,6), T(1,7)
    .byte T(2,3), T(3,1), T(4,0), T(3,2)
 #undef T
 #define T(x,y) Z((x-2)*8+y)
    // Vector for v27 (window starts at v2).
    .byte T(2,4), T(2,5), T(2,6), T(2,7)
    .byte T(3,3), T(4,1), T(5,0), T(4,2)
 #undef T
 #define T(x,y) Z((x-3)*8+y)
    // Vector for v28 (window starts at v3).
    .byte T(3,4), T(3,5), T(3,6), T(3,7)
    .byte T(4,3), T(5,1), T(6,0), T(5,2)
 #undef T
 #define T(x,y) Z((x-4)*8+y)
    // Vector for v29 (window starts at v4).
    .byte T(4,4), T(4,5), T(4,6), T(4,7)
    .byte T(5,3), T(6,1), T(6,2), T(5,4)
 #undef T
 #define T(x,y) Z((x-5)*8+y)
    // Vector for v30 (window starts at v5).
    .byte T(5,5), T(5,6), T(5,7), T(6,3)
    .byte T(7,0), T(7,1), T(6,4), T(6,5)
 endconst
 // From here on T(y,x) expands to the single byte index x*8+y, i.e. a
 // position in an 8x8 pixel block stored as 64 consecutive bytes.  The
 // definition is shared by sub8x8_frame and sub8x8_field below.
 #undef T
 #define T(y,x) x*8+y
 // Byte permutation used by zigzag_sub_8x8_frame_neon: 64 indices, consumed
 // as four 16-byte tbl index vectors.
 const sub8x8_frame, align=5
    .byte T(0,0), T(1,0), T(0,1), T(0,2)
    .byte T(1,1), T(2,0), T(3,0), T(2,1)
    .byte T(1,2), T(0,3), T(0,4), T(1,3)
    .byte T(2,2), T(3,1), T(4,0), T(5,0)
    .byte T(4,1), T(3,2), T(2,3), T(1,4)
    .byte T(0,5), T(0,6), T(1,5), T(2,4)
    .byte T(3,3), T(4,2), T(5,1), T(6,0)
    .byte T(7,0), T(6,1), T(5,2), T(4,3)
    .byte T(3,4), T(2,5), T(1,6), T(0,7)
    .byte T(1,7), T(2,6), T(3,5), T(4,4)
    .byte T(5,3), T(6,2), T(7,1), T(7,2)
    .byte T(6,3), T(5,4), T(4,5), T(3,6)
    .byte T(2,7), T(3,7), T(4,6), T(5,5)
    .byte T(6,4), T(7,3), T(7,4), T(6,5)
    .byte T(5,6), T(4,7), T(5,7), T(6,6)
    .byte T(7,5), T(7,6), T(6,7), T(7,7)
 endconst
 // Byte permutation used by zigzag_sub_8x8_field_neon: 64 indices, consumed
 // as four 16-byte tbl index vectors.  Relies on the single-byte T(y,x)
 // definition in effect at this point of the file.
 const sub8x8_field, align=5
    .byte T(0,0), T(0,1), T(0,2), T(1,0)
    .byte T(1,1), T(0,3), T(0,4), T(1,2)
    .byte T(2,0), T(1,3), T(0,5), T(0,6)
    .byte T(0,7), T(1,4), T(2,1), T(3,0)
    .byte T(2,2), T(1,5), T(1,6), T(1,7)
    .byte T(2,3), T(3,1), T(4,0), T(3,2)
    .byte T(2,4), T(2,5), T(2,6), T(2,7)
    .byte T(3,3), T(4,1), T(5,0), T(4,2)
    .byte T(3,4), T(3,5), T(3,6), T(3,7)
    .byte T(4,3), T(5,1), T(6,0), T(5,2)
    .byte T(4,4), T(4,5), T(4,6), T(4,7)
    .byte T(5,3), T(6,1), T(6,2), T(5,4)
    .byte T(5,5), T(5,6), T(5,7), T(6,3)
    .byte T(7,0), T(7,1), T(6,4), T(6,5)
    .byte T(6,6), T(6,7), T(7,2), T(7,3)
    .byte T(7,4), T(7,5), T(7,6), T(7,7)
 endconst
--- a/common/aarch64/dct.h
+++ b/common/aarch64/dct.h
@@ -0,0 +1,103 @@
 /*****************************************************************************
 * dct.h: aarch64 transform and zigzag
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_AARCH64_DCT_H
 #define X264_AARCH64_DCT_H
 /* Prototypes for the aarch64 transform routines.  Every function name is
  * first passed through x264_template(), a macro defined outside this header. */
 /* 4x4 DC transforms: a single 16-coefficient array argument. */
 #define x264_dct4x4dc_neon x264_template(dct4x4dc_neon)
 void x264_dct4x4dc_neon( int16_t d[16] );
 #define x264_idct4x4dc_neon x264_template(idct4x4dc_neon)
 void x264_idct4x4dc_neon( int16_t d[16] );
 /* sub*_dct: coefficient array plus two pixel pointers, for one, four and
  * sixteen blocks of 16 coefficients. */
 #define x264_sub4x4_dct_neon x264_template(sub4x4_dct_neon)
 void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
 #define x264_sub8x8_dct_neon x264_template(sub8x8_dct_neon)
 void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
 #define x264_sub16x16_dct_neon x264_template(sub16x16_dct_neon)
 void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
 /* add*_idct: destination pixel pointer plus coefficient array, same three
  * block counts as the sub*_dct group. */
 #define x264_add4x4_idct_neon x264_template(add4x4_idct_neon)
 void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
 #define x264_add8x8_idct_neon x264_template(add8x8_idct_neon)
 void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
 #define x264_add16x16_idct_neon x264_template(add16x16_idct_neon)
 void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
 /* *_dc variants: the coefficient array holds 4, 8 or 16 values in total. */
 #define x264_add8x8_idct_dc_neon x264_template(add8x8_idct_dc_neon)
 void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
 #define x264_add16x16_idct_dc_neon x264_template(add16x16_idct_dc_neon)
 void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
 #define x264_sub8x8_dct_dc_neon x264_template(sub8x8_dct_dc_neon)
 void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
 #define x264_sub8x16_dct_dc_neon x264_template(sub8x16_dct_dc_neon)
 void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 );
 /* *_dct8 / *_idct8 variants: 64 coefficients per block, one or four blocks. */
 #define x264_sub8x8_dct8_neon x264_template(sub8x8_dct8_neon)
 void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
 #define x264_sub16x16_dct8_neon x264_template(sub16x16_dct8_neon)
 void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
 #define x264_add8x8_idct8_neon x264_template(add8x8_idct8_neon)
 void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
 #define x264_add16x16_idct8_neon x264_template(add16x16_idct8_neon)
 void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );
 /* Zigzag scans: level and dct each hold 16 (4x4) or 64 (8x8) coefficients;
  * the frame and field orderings are separate entry points. */
 #define x264_zigzag_scan_4x4_frame_neon x264_template(zigzag_scan_4x4_frame_neon)
 void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
 #define x264_zigzag_scan_4x4_field_neon x264_template(zigzag_scan_4x4_field_neon)
 void x264_zigzag_scan_4x4_field_neon( int16_t level[16], int16_t dct[16] );
 #define x264_zigzag_scan_8x8_frame_neon x264_template(zigzag_scan_8x8_frame_neon)
 void x264_zigzag_scan_8x8_frame_neon( int16_t level[64], int16_t dct[64] );
 #define x264_zigzag_scan_8x8_field_neon x264_template(zigzag_scan_8x8_field_neon)
 void x264_zigzag_scan_8x8_field_neon( int16_t level[64], int16_t dct[64] );
 /* 4x4 zigzag_sub: a 16-entry level array, a const source pixel pointer and a
  * writable destination pixel pointer; the "ac" forms take an extra
  * dctcoef *dc argument.
  * NOTE(review): the meaning of the int return value is not visible in this
  * header - confirm against the implementation. */
 #define x264_zigzag_sub_4x4_field_neon x264_template(zigzag_sub_4x4_field_neon)
 int x264_zigzag_sub_4x4_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
 #define x264_zigzag_sub_4x4ac_field_neon x264_template(zigzag_sub_4x4ac_field_neon)
 int x264_zigzag_sub_4x4ac_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
 #define x264_zigzag_sub_4x4_frame_neon x264_template(zigzag_sub_4x4_frame_neon)
 int x264_zigzag_sub_4x4_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
 #define x264_zigzag_sub_4x4ac_frame_neon x264_template(zigzag_sub_4x4ac_frame_neon)
 int x264_zigzag_sub_4x4ac_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
 /* 8x8 zigzag_sub, field and frame orderings.  An 8x8 block has 64
  * coefficients, so level is declared with 64 entries, in line with the
  * level[64] of the 8x8 zigzag_scan prototypes in this header.  The bound is
  * informational only: an array parameter is adjusted to a pointer, so the
  * function type and ABI are the same as with any other bound.
  * NOTE(review): the meaning of the int return value is not visible in this
  * header - confirm against the implementation. */
 #define x264_zigzag_sub_8x8_field_neon x264_template(zigzag_sub_8x8_field_neon)
 int x264_zigzag_sub_8x8_field_neon( dctcoef level[64], const pixel *p_src, pixel *p_dst );
 #define x264_zigzag_sub_8x8_frame_neon x264_template(zigzag_sub_8x8_frame_neon)
 int x264_zigzag_sub_8x8_frame_neon( dctcoef level[64], const pixel *p_src, pixel *p_dst );
 /* zigzag_interleave_8x8_cavlc: destination and source coefficient pointers
  * plus a uint8_t nnz pointer (array sizes are not stated in the prototype). */
 #define x264_zigzag_interleave_8x8_cavlc_neon x264_template(zigzag_interleave_8x8_cavlc_neon)
 void x264_zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz );
 /* SVE / SVE2 entry points.  Each has the same parameter list as the NEON
  * prototype with the same base name declared earlier in this header. */
 #define x264_sub4x4_dct_sve x264_template(sub4x4_dct_sve)
 void x264_sub4x4_dct_sve( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
 #define x264_add4x4_idct_sve2 x264_template(add4x4_idct_sve2)
 void x264_add4x4_idct_sve2( uint8_t *p_dst, int16_t dct[16] );
 #define x264_zigzag_interleave_8x8_cavlc_sve x264_template(zigzag_interleave_8x8_cavlc_sve)
 void x264_zigzag_interleave_8x8_cavlc_sve( dctcoef *dst, dctcoef *src, uint8_t *nnz );
 #endif /* X264_AARCH64_DCT_H */
--- a/common/aarch64/deblock-a-common.S
+++ b/common/aarch64/deblock-a-common.S
@@ -0,0 +1,43 @@
 /*****************************************************************************
 * deblock-a-common.S: aarch64 deblocking
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: Mans Rullgard <mans@mansr.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *          David Chen <david.chen@myais.com.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 // This file contains the NEON macros that are intended to be used by
 // the SVE/SVE2 functions as well
 // h264_loop_filter_start: shared prologue with an early-out.
 // Reads:  w2 and w3 (both compared against zero) and the 32-bit word at [x4].
 // Writes: w6 (the loaded word), v24.s[0] (copy of w6), w8 (scratch), flags.
 // Executes ret, leaving the enclosing function, when either
 //   - w2 is non-zero and w3 is zero, or
 //   - the top bit of all four bytes of w6 is set;
 // otherwise execution continues at local label 2 (end of the macro).
 // NOTE(review): when w2 is zero the ccmp writes NZCV = 0, so Z is clear and
 // the b.eq early-out is not taken - confirm that this is intended.
 .macro h264_loop_filter_start
    cmp             w2,  #0
    ldr             w6,  [x4]
    // if w2 != 0: flags = compare(w3, 0); else NZCV = 0
    ccmp            w3,  #0, #0, ne
    mov             v24.s[0], w6
    // plain and does not touch the flags, so b.eq still sees the ccmp result
    and             w8,  w6,  w6,  lsl #16
    b.eq            1f
    // bit 31 of the result is the AND of bits 7, 15, 23 and 31 of w6
    ands            w8,  w8,  w8,  lsl #8
    // taken when that bit (N flag) is clear
    b.ge            2f
 1:
    ret
 2:
 .endm
--- a/common/aarch64/deblock-a-sve.S
+++ b/common/aarch64/deblock-a-sve.S
@@ -0,0 +1,98 @@
 /*****************************************************************************
 * deblock-a-sve.S: aarch64 deblocking
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Chen <david.chen@myais.com.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "asm.S"
 #include "deblock-a-common.S"
 ENABLE_SVE
 // h264_loop_filter_chroma_sve: chroma filter core that keeps the lane mask in
 // SVE predicate registers instead of a NEON vector.
 // In:   v18 = p1, v16 = p0, v0 = q0, v2 = q1 (16 bytes each),
 //       w2 = alpha, w3 = beta, v24.s[0] = four clip bytes.
 // Out:  v16 = p0 + delta, v0 = q0 - delta (computed for every lane),
 //       p1  = lanes where all three threshold tests passed.
 // The delta itself is not masked in this macro; deblock_v_chroma_sve applies
 // p1 when it stores the results.
 // Clobbers v4, v5, v22-v26, v28-v30 and p0-p3.
 .macro h264_loop_filter_chroma_sve
    // p0: the first 16 byte lanes are active
    ptrue           p0.b, vl16
    dup             v22.16b, w2              // alpha
    // uxtl/sli/uxtl/sli on v24: each of the four clip bytes ends up replicated
    // four times across the 16 byte lanes
    uxtl            v24.8h,  v24.8b
    uabd            v26.16b, v16.16b, v0.16b   // abs(p0 - q0)
    uxtl            v4.8h,   v0.8b
    uxtl2           v5.8h,   v0.16b
    uabd            v28.16b, v18.16b, v16.16b  // abs(p1 - p0)
    // v4/v5 accumulate (q0 - p0) * 4 + p1 - q1 in 16-bit lanes
    usubw           v4.8h,   v4.8h,   v16.8b
    usubw2          v5.8h,   v5.8h,   v16.16b
    sli             v24.8h,  v24.8h,  #8
    shl             v4.8h,   v4.8h,   #2
    shl             v5.8h,   v5.8h,   #2
    uabd            v30.16b, v2.16b,  v0.16b   // abs(q1 - q0)
    uxtl            v24.4s,  v24.4h
    uaddw           v4.8h,   v4.8h,   v18.8b
    uaddw2          v5.8h,   v5.8h,   v18.16b
    // p1 = alpha > abs(p0 - q0)
    cmphi           p1.b, p0/z, z22.b, z26.b
    usubw           v4.8h,   v4.8h,   v2.8b
    usubw2          v5.8h,   v5.8h,   v2.16b
    sli             v24.4s,  v24.4s,  #16
    dup             v22.16b, w3              // beta
    // rounding shift right by 3, narrowed back to bytes: the raw delta
    rshrn           v4.8b,   v4.8h,   #3
    rshrn2          v4.16b,  v5.8h,   #3
    // p2 = beta > abs(p1 - p0), p3 = beta > abs(q1 - q0)
    cmphi           p2.b, p0/z, z22.b, z28.b
    cmphi           p3.b, p0/z, z22.b, z30.b
    // smin / smax against v24 and its negation clamp the delta
    smin            v4.16b,  v4.16b,  v24.16b
    neg             v25.16b, v24.16b
    and             p1.b, p0/z, p1.b, p2.b
    smax            v4.16b,  v4.16b,  v25.16b
    and             p1.b, p0/z, p1.b, p3.b
    uxtl            v22.8h,  v0.8b
    uxtl2           v23.8h,  v0.16b
    uxtl            v28.8h,  v16.8b
    uxtl2           v29.8h,  v16.16b
    // p0 + delta and q0 - delta in 16-bit lanes
    saddw           v28.8h,  v28.8h,  v4.8b
    saddw2          v29.8h,  v29.8h,  v4.16b
    ssubw           v22.8h,  v22.8h,  v4.8b
    ssubw2          v23.8h,  v23.8h,  v4.16b
    // narrow with unsigned saturation back to bytes
    sqxtun          v16.8b,  v28.8h
    sqxtun          v0.8b,   v22.8h
    sqxtun2         v16.16b, v29.8h
    sqxtun2         v0.16b,  v23.8h
 .endm
 // deblock_v_chroma_sve
 // x0 = pixel pointer, x1 = row stride in bytes, w2 = alpha, w3 = beta,
 // x4 = address of the 32-bit clip word read by h264_loop_filter_start.
 // Reads 16 bytes from each of the rows x0 - 2*x1, x0 - x1, x0 and x0 + x1 and
 // writes filtered bytes back to rows x0 - x1 and x0 only, and only in the
 // lanes selected by predicate p1.
 function deblock_v_chroma_sve, export=1
    // may return early; see the macro
    h264_loop_filter_start
    sub             x0,  x0,  x1, lsl #1
    // No performance improvement if sve load is used. So, continue using
    // NEON load here
    ld1             {v18.16b}, [x0], x1
    ld1             {v16.16b}, [x0], x1
    ld1             {v0.16b},  [x0], x1
    ld1             {v2.16b},  [x0]
    h264_loop_filter_chroma_sve
    // x0 points at the last row loaded; step back two rows to the v16 row
    sub             x0,  x0,  x1, lsl #1
    st1b            {z16.b}, p1, [x0]
    add             x0, x0, x1
    st1b            {z0.b}, p1, [x0]
    ret
 endfunc
--- a/common/aarch64/deblock-a.S
+++ b/common/aarch64/deblock-a.S
@@ -0,0 +1,800 @@
 /*****************************************************************************
 * deblock-a.S: aarch64 deblocking
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: Mans Rullgard <mans@mansr.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "asm.S"
 #include "deblock-a-common.S"
 // h264_loop_filter_luma: luma filter core for 16 byte lanes.
 // In:   v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2,
 //       w2 = alpha, w3 = beta, v24.s[0] = four clip bytes.
 // Out:  v17 = new p1, v16 = new p0, v0 = new q0, v19 = new q1.
 // Clobbers v4, v20-v24, v28 and v30.
 .macro h264_loop_filter_luma
    dup             v22.16b, w2                     // alpha
    // the uxtl/uxtl/sli/sli sequence on v24 replicates each of the four clip
    // bytes four times across the 16 byte lanes
    uxtl            v24.8h,  v24.8b
    uabd            v21.16b, v16.16b, v0.16b        // abs(p0 - q0)
    uxtl            v24.4s,  v24.4h
    uabd            v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
    sli             v24.8h,  v24.8h,  #8
    uabd            v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
    sli             v24.4s,  v24.4s,  #16
    cmhi            v21.16b, v22.16b, v21.16b       // < alpha
    dup             v22.16b, w3                     // beta
    // v23 = lanes whose clip byte is negative
    cmlt            v23.16b, v24.16b, #0
    cmhi            v28.16b, v22.16b, v28.16b       // < beta
    cmhi            v30.16b, v22.16b, v30.16b       // < beta
    // drop the negative-clip lanes from the enable mask v21
    bic             v21.16b, v21.16b, v23.16b
    uabd            v17.16b, v20.16b, v16.16b       // abs(p2 - p0)
    and             v21.16b, v21.16b, v28.16b
    uabd            v19.16b,  v4.16b,  v0.16b       // abs(q2 - q0)
    cmhi            v17.16b, v22.16b, v17.16b       // < beta
    and             v21.16b, v21.16b, v30.16b
    cmhi            v19.16b, v22.16b, v19.16b       // < beta
    // v17 / v19: per-lane masks for updating p1 / q1
    and             v17.16b, v17.16b, v21.16b
    and             v19.16b, v19.16b, v21.16b
    // clip bytes are zeroed in disabled lanes
    and             v24.16b, v24.16b, v21.16b
    urhadd          v28.16b, v16.16b,  v0.16b
    // subtracting an all-ones mask adds 1: v21 = clip + 1 per set lane of v17, v19
    sub             v21.16b, v24.16b, v17.16b
    uqadd           v23.16b, v18.16b, v24.16b
    uhadd           v20.16b, v20.16b, v28.16b
    sub             v21.16b, v21.16b, v19.16b
    uhadd           v28.16b,  v4.16b, v28.16b
    // v23 = candidate p1, kept within p1 +/- clip (saturating)
    umin            v23.16b, v23.16b, v20.16b
    uqsub           v22.16b, v18.16b, v24.16b
    uqadd           v4.16b,   v2.16b, v24.16b
    umax            v23.16b, v23.16b, v22.16b
    uqsub           v22.16b,  v2.16b, v24.16b
    // v28 = candidate q1, kept within q1 +/- clip (saturating)
    umin            v28.16b,  v4.16b, v28.16b
    uxtl            v4.8h,    v0.8b
    umax            v28.16b, v28.16b, v22.16b
    uxtl2           v20.8h,   v0.16b
    // v4/v20 accumulate (q0 - p0) * 4 + p1 - q1 in 16-bit lanes
    usubw           v4.8h,    v4.8h,  v16.8b
    usubw2          v20.8h,  v20.8h,  v16.16b
    shl             v4.8h,    v4.8h,  #2
    shl             v20.8h,  v20.8h,  #2
    uaddw           v4.8h,    v4.8h,  v18.8b
    uaddw2          v20.8h,  v20.8h,  v18.16b
    usubw           v4.8h,    v4.8h,   v2.8b
    usubw2          v20.8h,  v20.8h,   v2.16b
    // rounding shift right by 3, narrowed back to bytes: the raw delta
    rshrn           v4.8b,    v4.8h,  #3
    rshrn2          v4.16b,  v20.8h,  #3
    // select the candidate where the mask is set, else the unmodified p1 / q1
    bsl             v17.16b, v23.16b, v18.16b
    bsl             v19.16b, v28.16b,  v2.16b
    neg             v23.16b, v21.16b
    uxtl            v28.8h,  v16.8b
    // smin / smax clamp the delta to [-v21, v21]
    smin            v4.16b,   v4.16b, v21.16b
    uxtl2           v21.8h,  v16.16b
    smax            v4.16b,   v4.16b, v23.16b
    uxtl            v22.8h,   v0.8b
    uxtl2           v24.8h,   v0.16b
    // p0 + delta and q0 - delta in 16-bit lanes
    saddw           v28.8h,  v28.8h,  v4.8b
    saddw2          v21.8h,  v21.8h,  v4.16b
    ssubw           v22.8h,  v22.8h,  v4.8b
    ssubw2          v24.8h,  v24.8h,  v4.16b
    // narrow with unsigned saturation back to bytes
    sqxtun          v16.8b,  v28.8h
    sqxtun2         v16.16b, v21.8h
    sqxtun          v0.8b,   v22.8h
    sqxtun2         v0.16b,  v24.8h
 .endm
 // deblock_v_luma_neon
 // x0 = pixel pointer, x1 = row stride in bytes, w2 = alpha, w3 = beta,
 // x4 = address of the 32-bit clip word read by h264_loop_filter_start.
 // Reads 16 bytes from each of the six rows x0 - 3*x1 .. x0 + 2*x1 and writes
 // the four rows x0 - 2*x1 .. x0 + x1.
 function deblock_v_luma_neon, export=1
    // may return early; see the macro
    h264_loop_filter_start
    // rows x0, x0 + x1, x0 + 2*x1 -> q0, q1, q2
    ld1             {v0.16b},  [x0], x1
    ld1             {v2.16b},  [x0], x1
    ld1             {v4.16b},  [x0], x1
    // x0 is now three rows past the start; step back six rows in total
    sub             x0,  x0,  x1, lsl #2
    sub             x0,  x0,  x1, lsl #1
    // rows x0 - 3*x1, x0 - 2*x1, x0 - x1 (relative to the entry x0) -> p2, p1, p0
    ld1             {v20.16b},  [x0], x1
    ld1             {v18.16b},  [x0], x1
    ld1             {v16.16b},  [x0], x1
    h264_loop_filter_luma
    // x0 is back at its entry value; results start two rows above it
    sub             x0,  x0,  x1, lsl #1
    st1             {v17.16b}, [x0], x1
    st1             {v16.16b}, [x0], x1
    st1             {v0.16b},  [x0], x1
    st1             {v19.16b}, [x0]
    ret
 endfunc
 // deblock_h_luma_neon
 // x0 = pixel pointer, x1 = row stride in bytes, w2 = alpha, w3 = beta,
 // x4 = address of the 32-bit clip word read by h264_loop_filter_start.
 // Reads 8 bytes starting at x0 - 4 from each of 16 consecutive rows and writes
 // 4 bytes starting at x0 - 2 back to each of those rows.
 // transpose_8x16.b and transpose_4x16.b are macros defined outside this file
 // section; presumably they turn rows into per-tap vectors and back.
 function deblock_h_luma_neon, export=1
    // may return early; see the macro
    h264_loop_filter_start
    sub             x0,  x0,  #4
    // rows 0-7 go to the low 64 bits of each register
    ld1             {v6.8b},  [x0], x1
    ld1             {v20.8b}, [x0], x1
    ld1             {v18.8b}, [x0], x1
    ld1             {v16.8b}, [x0], x1
    ld1             {v0.8b},  [x0], x1
    ld1             {v2.8b},  [x0], x1
    ld1             {v4.8b},  [x0], x1
    ld1             {v26.8b}, [x0], x1
    // rows 8-15 go to the high 64 bits of the same registers
    ld1             {v6.d}[1],  [x0], x1
    ld1             {v20.d}[1], [x0], x1
    ld1             {v18.d}[1], [x0], x1
    ld1             {v16.d}[1], [x0], x1
    ld1             {v0.d}[1],  [x0], x1
    ld1             {v2.d}[1],  [x0], x1
    ld1             {v4.d}[1],  [x0], x1
    ld1             {v26.d}[1], [x0], x1
    transpose_8x16.b v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
    h264_loop_filter_luma
    transpose_4x16.b v17, v16, v0, v19, v21, v23, v25, v27
    // rewind the 16 rows consumed by the loads, then move from column -4 to -2
    sub             x0,  x0,  x1, lsl #4
    add             x0,  x0,  #2
    // one 32-bit lane per row: lane n of v17, v16, v0, v19 covers rows 4n .. 4n+3
    st1             {v17.s}[0],  [x0], x1
    st1             {v16.s}[0], [x0], x1
    st1             {v0.s}[0],  [x0], x1
    st1             {v19.s}[0], [x0], x1
    st1             {v17.s}[1],  [x0], x1
    st1             {v16.s}[1], [x0], x1
    st1             {v0.s}[1],  [x0], x1
    st1             {v19.s}[1], [x0], x1
    st1             {v17.s}[2],  [x0], x1
    st1             {v16.s}[2], [x0], x1
    st1             {v0.s}[2],  [x0], x1
    st1             {v19.s}[2], [x0], x1
    st1             {v17.s}[3],  [x0], x1
    st1             {v16.s}[3], [x0], x1
    st1             {v0.s}[3],  [x0], x1
    st1             {v19.s}[3], [x0], x1
    ret
 endfunc
 // h264_loop_filter_start_intra: prologue for the intra filters.
 // Executes ret, leaving the enclosing function, only when w2 and w3 are both
 // zero; otherwise broadcasts w2 into every byte of v30 and w3 into every byte
 // of v31.  Clobbers w4 and the flags.
 .macro h264_loop_filter_start_intra
    // w4 is zero only if both inputs are zero
    orr             w4,  w2,  w3
    cmp             w4,  #0
    b.ne            1f
    ret
 1:
    dup             v30.16b, w2                // alpha
    dup             v31.16b, w3                // beta
 .endm
 // h264_loop_filter_luma_intra: intra luma filter core for 16 byte lanes.
 // In:     v4 = p3, v5 = p2, v6 = p1, v7 = p0, v0 = q0, v1 = q1, v2 = q2,
 //         v3 = q3, v30 = alpha in every byte, v31 = beta in every byte.
 // Out:    v5, v6, v7 (p2, p1, p0) and v0, v1, v2 (q0, q1, q2) updated in
 //         place in the lanes selected by the masks computed below.
 // Branch: jumps to local label 9 when no lane passes the alpha/beta tests;
 //         the enclosing function has to provide that label.
 // Clobbers x4, v16-v31.
 .macro h264_loop_filter_luma_intra
    uabd            v16.16b, v7.16b,  v0.16b        // abs(p0 - q0)
    uabd            v17.16b, v6.16b,  v7.16b        // abs(p1 - p0)
    uabd            v18.16b, v1.16b,  v0.16b        // abs(q1 - q0)
    cmhi            v19.16b, v30.16b, v16.16b       // < alpha
    cmhi            v17.16b, v31.16b, v17.16b       // < beta
    cmhi            v18.16b, v31.16b, v18.16b       // < beta
    movi            v29.16b, #2
    ushr            v30.16b, v30.16b, #2            // alpha >> 2
    add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
    cmhi            v16.16b, v30.16b, v16.16b       // < (alpha >> 2) + 2
    // v19 = if_1: all three basic threshold tests passed
    and             v19.16b, v19.16b, v17.16b
    and             v19.16b, v19.16b, v18.16b
    // squeeze the 16 mask bytes into 64 bits so one scalar test covers all lanes
    shrn            v20.8b,  v19.8h,  #4
    mov             x4, v20.d[0]
    cbz             x4, 9f
    // v20/v21 = 2*p1 + p0 + q1, v22/v23 = 2*q1 + q0 + p1 (16-bit lanes)
    ushll           v20.8h,  v6.8b,   #1
    ushll           v22.8h,  v1.8b,   #1
    ushll2          v21.8h,  v6.16b,  #1
    ushll2          v23.8h,  v1.16b,  #1
    uaddw           v20.8h,  v20.8h,  v7.8b
    uaddw           v22.8h,  v22.8h,  v0.8b
    uaddw2          v21.8h,  v21.8h,  v7.16b
    uaddw2          v23.8h,  v23.8h,  v0.16b
    uaddw           v20.8h,  v20.8h,  v1.8b
    uaddw           v22.8h,  v22.8h,  v6.8b
    uaddw2          v21.8h,  v21.8h,  v1.16b
    uaddw2          v23.8h,  v23.8h,  v6.16b
    rshrn           v24.8b,  v20.8h,  #2 // p0'_1
    rshrn           v25.8b,  v22.8h,  #2 // q0'_1
    rshrn2          v24.16b, v21.8h,  #2 // p0'_1
    rshrn2          v25.16b, v23.8h,  #2 // q0'_1
    uabd            v17.16b, v5.16b,  v7.16b        // abs(p2 - p0)
    uabd            v18.16b, v2.16b,  v0.16b        // abs(q2 - q0)
    cmhi            v17.16b, v31.16b, v17.16b       // < beta
    cmhi            v18.16b, v31.16b, v18.16b       // < beta
    and             v17.16b, v16.16b, v17.16b  // if_2 && if_3
    and             v18.16b, v16.16b, v18.16b  // if_2 && if_4
    not             v30.16b, v17.16b
    not             v31.16b, v18.16b
    and             v30.16b, v30.16b, v19.16b  // if_1 && !(if_2 && if_3)
    and             v31.16b, v31.16b, v19.16b  // if_1 && !(if_2 && if_4)
    and             v17.16b, v19.16b, v17.16b  // if_1 && if_2 && if_3
    and             v18.16b, v19.16b, v18.16b  // if_1 && if_2 && if_4
    //calc            p, v7, v6, v5, v4, v17, v7, v6, v5, v4
    // v26/v27 = p2 + p0 + q0
    uaddl           v26.8h,  v5.8b,   v7.8b
    uaddl2          v27.8h,  v5.16b,  v7.16b
    uaddw           v26.8h,  v26.8h,  v0.8b
    uaddw2          v27.8h,  v27.8h,  v0.16b
    // v20/v21 = p2 + 2*p1 + 2*p0 + 2*q0 + q1
    add             v20.8h,  v20.8h,  v26.8h
    add             v21.8h,  v21.8h,  v27.8h
    uaddw           v20.8h,  v20.8h,  v0.8b
    uaddw2          v21.8h,  v21.8h,  v0.16b
    rshrn           v20.8b,  v20.8h,  #3 // p0'_2
    rshrn2          v20.16b, v21.8h,  #3 // p0'_2
    // v26/v27 = p2 + p1 + p0 + q0
    uaddw           v26.8h,  v26.8h,  v6.8b
    uaddw2          v27.8h,  v27.8h,  v6.16b
    rshrn           v21.8b,  v26.8h,  #2 // p1'_2
    rshrn2          v21.16b, v27.8h,  #2 // p1'_2
    // v28/v29 = 2*(p3 + p2) + p2 + p1 + p0 + q0
    uaddl           v28.8h,  v4.8b,   v5.8b
    uaddl2          v29.8h,  v4.16b,  v5.16b
    shl             v28.8h,  v28.8h,  #1
    shl             v29.8h,  v29.8h,  #1
    add             v28.8h,  v28.8h,  v26.8h
    add             v29.8h,  v29.8h,  v27.8h
    rshrn           v19.8b,  v28.8h,  #3 // p2'_2
    rshrn2          v19.16b, v29.8h,  #3 // p2'_2
    //calc            q, v0, v1, v2, v3, v18, v0, v1, v2, v3
    // same sums as the p side with the roles of p and q exchanged
    uaddl           v26.8h,  v2.8b,   v0.8b
    uaddl2          v27.8h,  v2.16b,  v0.16b
    uaddw           v26.8h,  v26.8h,  v7.8b
    uaddw2          v27.8h,  v27.8h,  v7.16b
    add             v22.8h,  v22.8h,  v26.8h
    add             v23.8h,  v23.8h,  v27.8h
    uaddw           v22.8h,  v22.8h,  v7.8b
    uaddw2          v23.8h,  v23.8h,  v7.16b
    rshrn           v22.8b,  v22.8h,  #3 // q0'_2
    rshrn2          v22.16b, v23.8h,  #3 // q0'_2
    uaddw           v26.8h,  v26.8h,  v1.8b
    uaddw2          v27.8h,  v27.8h,  v1.16b
    rshrn           v23.8b,  v26.8h,  #2 // q1'_2
    rshrn2          v23.16b, v27.8h,  #2 // q1'_2
    uaddl           v28.8h,  v2.8b,   v3.8b
    uaddl2          v29.8h,  v2.16b,  v3.16b
    shl             v28.8h,  v28.8h,  #1
    shl             v29.8h,  v29.8h,  #1
    add             v28.8h,  v28.8h,  v26.8h
    add             v29.8h,  v29.8h,  v27.8h
    rshrn           v26.8b,  v28.8h,  #3 // q2'_2
    rshrn2          v26.16b, v29.8h,  #3 // q2'_2
    // bit inserts the second operand into the first where the mask is set
    bit             v7.16b,  v24.16b, v30.16b  // p0'_1
    bit             v0.16b,  v25.16b, v31.16b  // q0'_1
    bit             v7.16b, v20.16b,  v17.16b  // p0'_2
    bit             v6.16b, v21.16b,  v17.16b  // p1'_2
    bit             v5.16b, v19.16b,  v17.16b  // p2'_2
    bit             v0.16b, v22.16b,  v18.16b  // q0'_2
    bit             v1.16b, v23.16b,  v18.16b  // q1'_2
    bit             v2.16b, v26.16b,  v18.16b  // q2'_2
 .endm
 // deblock_v_luma_intra_neon
 // x0 = pixel pointer, x1 = row stride in bytes, w2 = alpha, w3 = beta.
 // Reads 16 bytes from each of the eight rows x0 - 4*x1 .. x0 + 3*x1 and
 // writes the six rows x0 - 3*x1 .. x0 + 2*x1.  Nothing is written when the
 // filter macro branches to label 9.
 function deblock_v_luma_intra_neon, export=1
    // returns early when w2 and w3 are both zero
    h264_loop_filter_start_intra
    ld1             {v0.16b},  [x0], x1 // q0
    ld1             {v1.16b},  [x0], x1 // q1
    ld1             {v2.16b},  [x0], x1 // q2
    ld1             {v3.16b},  [x0], x1 // q3
    // x0 is four rows past its entry value; go back eight rows
    sub             x0,  x0,  x1, lsl #3
    ld1             {v4.16b},  [x0], x1 // p3
    ld1             {v5.16b},  [x0], x1 // p2
    ld1             {v6.16b},  [x0], x1 // p1
    ld1             {v7.16b},  [x0]     // p0
    h264_loop_filter_luma_intra
    // x0 still addresses the p0 row; p2 is two rows above it
    sub             x0,  x0,  x1, lsl #1
    st1             {v5.16b}, [x0], x1  // p2
    st1             {v6.16b}, [x0], x1  // p1
    st1             {v7.16b}, [x0], x1  // p0
    st1             {v0.16b}, [x0], x1  // q0
    st1             {v1.16b}, [x0], x1  // q1
    st1             {v2.16b}, [x0]      // q2
 // target of the cbz inside h264_loop_filter_luma_intra
 9:
    ret
 endfunc
 // deblock_h_luma_intra_neon
 // x0 = pixel pointer, x1 = row stride in bytes, w2 = alpha, w3 = beta.
 // Reads 8 bytes starting at x0 - 4 from each of 16 consecutive rows and, unless
 // the filter macro branches to label 9, writes all 8 bytes of every row back.
 // transpose_8x16.b is a macro defined outside this file section; presumably it
 // converts rows to per-tap vectors and, applied again, back to rows.
 function deblock_h_luma_intra_neon, export=1
    // returns early when w2 and w3 are both zero
    h264_loop_filter_start_intra
    sub             x0,  x0,  #4
    // rows 0-7 go to the low 64 bits of each register
    ld1             {v4.8b},  [x0], x1
    ld1             {v5.8b},  [x0], x1
    ld1             {v6.8b},  [x0], x1
    ld1             {v7.8b},  [x0], x1
    ld1             {v0.8b},  [x0], x1
    ld1             {v1.8b},  [x0], x1
    ld1             {v2.8b},  [x0], x1
    ld1             {v3.8b},  [x0], x1
    // rows 8-15 go to the high 64 bits of the same registers
    ld1             {v4.d}[1],  [x0], x1
    ld1             {v5.d}[1],  [x0], x1
    ld1             {v6.d}[1],  [x0], x1
    ld1             {v7.d}[1],  [x0], x1
    ld1             {v0.d}[1],  [x0], x1
    ld1             {v1.d}[1],  [x0], x1
    ld1             {v2.d}[1],  [x0], x1
    ld1             {v3.d}[1],  [x0], x1
    transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
    h264_loop_filter_luma_intra
    transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
    // rewind the 16 rows consumed by the loads
    sub             x0,  x0,  x1, lsl #4
    st1             {v4.8b},  [x0], x1
    st1             {v5.8b},  [x0], x1
    st1             {v6.8b},  [x0], x1
    st1             {v7.8b},  [x0], x1
    st1             {v0.8b},  [x0], x1
    st1             {v1.8b},  [x0], x1
    st1             {v2.8b},  [x0], x1
    st1             {v3.8b},  [x0], x1
    st1             {v4.d}[1],  [x0], x1
    st1             {v5.d}[1],  [x0], x1
    st1             {v6.d}[1],  [x0], x1
    st1             {v7.d}[1],  [x0], x1
    st1             {v0.d}[1],  [x0], x1
    st1             {v1.d}[1],  [x0], x1
    st1             {v2.d}[1],  [x0], x1
    st1             {v3.d}[1],  [x0], x1
 // target of the cbz inside h264_loop_filter_luma_intra
 9:
    ret
 endfunc
 // h264_loop_filter_chroma: chroma filter core for 16 byte lanes.
 // In:   v18 = p1, v16 = p0, v0 = q0, v2 = q1,
 //       w2 = alpha, w3 = beta, v24.s[0] = four clip bytes.
 // Out:  v16 = p0 + delta, v0 = q0 - delta, where delta is forced to zero in
 //       lanes that fail any of the three threshold tests.
 // Clobbers v4, v5, v22-v26, v28-v30.
 .macro h264_loop_filter_chroma
    dup             v22.16b, w2              // alpha
    // uxtl/sli/uxtl/sli on v24: each of the four clip bytes ends up replicated
    // four times across the 16 byte lanes
    uxtl            v24.8h,  v24.8b
    uabd            v26.16b, v16.16b, v0.16b   // abs(p0 - q0)
    uxtl            v4.8h,   v0.8b
    uxtl2           v5.8h,   v0.16b
    uabd            v28.16b, v18.16b, v16.16b  // abs(p1 - p0)
    // v4/v5 accumulate (q0 - p0) * 4 + p1 - q1 in 16-bit lanes
    usubw           v4.8h,   v4.8h,   v16.8b
    usubw2          v5.8h,   v5.8h,   v16.16b
    sli             v24.8h,  v24.8h,  #8
    shl             v4.8h,   v4.8h,   #2
    shl             v5.8h,   v5.8h,   #2
    uabd            v30.16b, v2.16b,  v0.16b   // abs(q1 - q0)
    uxtl            v24.4s,  v24.4h
    uaddw           v4.8h,   v4.8h,   v18.8b
    uaddw2          v5.8h,   v5.8h,   v18.16b
    cmhi            v26.16b, v22.16b, v26.16b  // < alpha
    usubw           v4.8h,   v4.8h,   v2.8b
    usubw2          v5.8h,   v5.8h,   v2.16b
    sli             v24.4s,  v24.4s,  #16
    dup             v22.16b, w3              // beta
    // rounding shift right by 3, narrowed back to bytes: the raw delta
    rshrn           v4.8b,   v4.8h,   #3
    rshrn2          v4.16b,  v5.8h,   #3
    cmhi            v28.16b, v22.16b, v28.16b  // < beta
    cmhi            v30.16b, v22.16b, v30.16b  // < beta
    // smin / smax against v24 and its negation clamp the delta
    smin            v4.16b,  v4.16b,  v24.16b
    neg             v25.16b, v24.16b
    and             v26.16b, v26.16b, v28.16b
    smax            v4.16b,  v4.16b,  v25.16b
    and             v26.16b, v26.16b, v30.16b
    uxtl            v22.8h,  v0.8b
    uxtl2           v23.8h,  v0.16b
    // zero the delta in lanes that failed a threshold test
    and             v4.16b,  v4.16b,  v26.16b
    uxtl            v28.8h,  v16.8b
    uxtl2           v29.8h,  v16.16b
    // p0 + delta and q0 - delta in 16-bit lanes
    saddw           v28.8h,  v28.8h,  v4.8b
    saddw2          v29.8h,  v29.8h,  v4.16b
    ssubw           v22.8h,  v22.8h,  v4.8b
    ssubw2          v23.8h,  v23.8h,  v4.16b
    // narrow with unsigned saturation back to bytes
    sqxtun          v16.8b,  v28.8h
    sqxtun          v0.8b,   v22.8h
    sqxtun2         v16.16b, v29.8h
    sqxtun2         v0.16b,  v23.8h
 .endm
 // deblock_v_chroma_neon
 // x0 = pixel pointer, x1 = row stride in bytes, w2 = alpha, w3 = beta,
 // x4 = address of the 32-bit clip word read by h264_loop_filter_start.
 // Reads 16 bytes from each of the rows x0 - 2*x1 .. x0 + x1 and rewrites the
 // two middle rows, x0 - x1 and x0.
 function deblock_v_chroma_neon, export=1
    // may return early; see the macro
    h264_loop_filter_start
    sub             x0,  x0,  x1, lsl #1
    ld1             {v18.16b}, [x0], x1
    ld1             {v16.16b}, [x0], x1
    ld1             {v0.16b},  [x0], x1
    ld1             {v2.16b},  [x0]
    h264_loop_filter_chroma
    // x0 points at the last row loaded; step back two rows to the v16 row
    sub             x0,  x0,  x1, lsl #1
    st1             {v16.16b}, [x0], x1
    st1             {v0.16b},  [x0], x1
    ret
 endfunc
 // deblock_h_chroma_neon
 // x0 = pixel pointer, x1 = row stride in bytes, w2 = alpha, w3 = beta,
 // x4 = address of the 32-bit clip word read by h264_loop_filter_start.
 // Reads 8 bytes starting at x0 - 4 from each of 8 consecutive rows, filters,
 // and writes all 8 bytes of every row back.
 // The inner label deblock_h_chroma is a second entry point used by
 // deblock_h_chroma_422_neon; it expects x0 to be already moved back by 4 and
 // v24.s[0] to hold the clip bytes.
 // transpose4x8.h is a macro defined outside this file section.
 function deblock_h_chroma_neon, export=1
    // may return early; see the macro
    h264_loop_filter_start
    sub             x0,  x0,  #4
 deblock_h_chroma:
    // rows 0-3 fill the low 64 bits, rows 4-7 the high 64 bits
    ld1             {v18.d}[0], [x0], x1
    ld1             {v16.d}[0], [x0], x1
    ld1             {v0.d}[0],  [x0], x1
    ld1             {v2.d}[0],  [x0], x1
    ld1             {v18.d}[1], [x0], x1
    ld1             {v16.d}[1], [x0], x1
    ld1             {v0.d}[1],  [x0], x1
    ld1             {v2.d}[1],  [x0], x1
    transpose4x8.h  v18, v16, v0, v2, v28, v29, v30, v31
    h264_loop_filter_chroma
    transpose4x8.h  v18, v16, v0, v2, v28, v29, v30, v31
    // rewind the 8 rows consumed by the loads
    sub             x0,  x0,  x1, lsl #3
    st1             {v18.d}[0], [x0], x1
    st1             {v16.d}[0], [x0], x1
    st1             {v0.d}[0],  [x0], x1
    st1             {v2.d}[0],  [x0], x1
    st1             {v18.d}[1], [x0], x1
    st1             {v16.d}[1], [x0], x1
    st1             {v0.d}[1],  [x0], x1
    st1             {v2.d}[1],  [x0], x1
    ret
 endfunc
 // deblock_h_chroma_422_neon
 // x0 = pixel pointer, x1 = row stride in bytes, w2 = alpha, w3 = beta,
 // x4 = address of the 32-bit clip word read by h264_loop_filter_start.
 // Runs the deblock_h_chroma body twice with the stride doubled: first starting
 // at the entry row, then starting one original row further down, so the two
 // passes cover alternating rows.
 function deblock_h_chroma_422_neon, export=1
    // x5 = start of the second pass (entry pointer plus one original stride)
    add             x5,  x0,  x1
    sub             x0,  x0,  #4
    // double the stride
    add             x1,  x1,  x1
    // may return early; see the macro
    h264_loop_filter_start
    // bl overwrites x30, so keep the return address in x7
    mov             x7,  x30
    bl              deblock_h_chroma
    mov             x30, x7
    sub             x0,  x5,  #4
    // the filter macro modified v24, so reload the clip bytes kept in w6
    mov             v24.s[0], w6
    // tail call: the ret in deblock_h_chroma returns to our caller
    b               deblock_h_chroma
 endfunc
 // h264_loop_filter_chroma8: chroma filter core for 8 byte lanes.
 // In:   v18 = p1, v16 = p0, v17 = q0, v19 = q1 (low 8 bytes of each),
 //       w2 = alpha, w3 = beta, v24.s[0] = four clip bytes.
 // Out:  v16 = p0 + delta, v17 = q0 - delta, where delta is forced to zero in
 //       lanes that fail any of the three threshold tests.
 // Clobbers v4, v22, v24-v26, v28, v30.
 .macro h264_loop_filter_chroma8
    dup             v22.8b,  w2                 // alpha
    // uxtl + sli on v24: each of the four clip bytes ends up replicated twice
    // across the low 8 byte lanes
    uxtl            v24.8h,  v24.8b
    uabd            v26.8b,  v16.8b,  v17.8b    // abs(p0 - q0)
    uxtl            v4.8h,   v17.8b
    uabd            v28.8b,  v18.8b,  v16.8b    // abs(p1 - p0)
    // v4 accumulates (q0 - p0) * 4 + p1 - q1 in 16-bit lanes
    usubw           v4.8h,   v4.8h,   v16.8b
    sli             v24.8h,  v24.8h,  #8
    shl             v4.8h,   v4.8h,   #2
    uabd            v30.8b,  v19.8b,  v17.8b    // abs(q1 - q0)
    uaddw           v4.8h,   v4.8h,   v18.8b
    cmhi            v26.8b,  v22.8b,  v26.8b    // < alpha
    usubw           v4.8h,   v4.8h,   v19.8b
    dup             v22.8b,  w3                 // beta
    // rounding shift right by 3, narrowed back to bytes: the raw delta
    rshrn           v4.8b,   v4.8h,   #3
    cmhi            v28.8b,  v22.8b,  v28.8b    // < beta
    cmhi            v30.8b,  v22.8b,  v30.8b    // < beta
    // smin / smax against v24 and its negation clamp the delta
    smin            v4.8b,   v4.8b,   v24.8b
    neg             v25.8b,  v24.8b
    and             v26.8b,  v26.8b,  v28.8b
    smax            v4.8b,   v4.8b,   v25.8b
    and             v26.8b,  v26.8b,  v30.8b
    uxtl            v22.8h,  v17.8b
    // zero the delta in lanes that failed a threshold test
    and             v4.8b,   v4.8b,   v26.8b
    uxtl            v28.8h,  v16.8b
    // p0 + delta and q0 - delta in 16-bit lanes
    saddw           v28.8h,  v28.8h,  v4.8b
    ssubw           v22.8h,  v22.8h,  v4.8b
    // narrow with unsigned saturation back to bytes
    sqxtun          v16.8b,  v28.8h
    sqxtun          v17.8b,  v22.8h
 .endm
 // deblock_h_chroma_mbaff_neon
 // x0 = pixel pointer, x1 = row stride in bytes, w2 = alpha, w3 = beta,
 // x4 = address of the 32-bit clip word read by h264_loop_filter_start.
 // Reads 8 bytes starting at x0 - 4 from each of 4 consecutive rows and writes
 // 4 bytes starting at x0 - 2 back to each of those rows.
 // transpose4x4.h is a macro defined outside this file section.
 function deblock_h_chroma_mbaff_neon, export=1
    // may return early; the macro reads [x4] before x4 is reused below
    h264_loop_filter_start
    // x4 = load cursor, x0 = store cursor
    sub             x4,  x0,  #4
    sub             x0,  x0,  #2
    ld1             {v18.8b}, [x4], x1
    ld1             {v16.8b}, [x4], x1
    ld1             {v17.8b},  [x4], x1
    ld1             {v19.8b},  [x4]
    transpose4x4.h  v18, v16, v17, v19, v28, v29, v30, v31
    h264_loop_filter_chroma8
    // per row: halfword lane n of v16 followed by halfword lane n of v17
    st2             {v16.h,v17.h}[0], [x0], x1
    st2             {v16.h,v17.h}[1], [x0], x1
    st2             {v16.h,v17.h}[2], [x0], x1
    st2             {v16.h,v17.h}[3], [x0]
    ret
 endfunc
 // h264_loop_filter_chroma_intra: intra chroma filter core.
 // Argument: width (default 16).  Any value other than 16 skips the
 //           instructions that compute the upper 8 byte lanes.
 // In:   v18 = p1, v16 = p0, v17 = q0, v19 = q1,
 //       v30 = alpha in every byte, v31 = beta in every byte.
 // Out:  v16 and v17 are replaced by the filtered values in lanes that pass
 //       all three threshold tests and left untouched elsewhere.
 // Clobbers v4-v7 and v20-v28 (v5, v7, v21, v23 only when width is 16).
 .macro h264_loop_filter_chroma_intra width=16
    uabd            v26.16b, v16.16b, v17.16b  // abs(p0 - q0)
    uabd            v27.16b, v18.16b, v16.16b  // abs(p1 - p0)
    uabd            v28.16b, v19.16b, v17.16b  // abs(q1 - q0)
    cmhi            v26.16b, v30.16b, v26.16b  // < alpha
    cmhi            v27.16b, v31.16b, v27.16b  // < beta
    cmhi            v28.16b, v31.16b, v28.16b  // < beta
    // v26 = lanes that passed all three tests
    and             v26.16b, v26.16b, v27.16b
    and             v26.16b, v26.16b, v28.16b
    // v4 = 2*p1, v6 = 2*q1 (16-bit lanes, low half)
    ushll           v4.8h,   v18.8b,  #1
    ushll           v6.8h,   v19.8b,  #1
 .ifc \width, 16
    ushll2          v5.8h,   v18.16b, #1
    ushll2          v7.8h,   v19.16b, #1
    uaddl2          v21.8h,  v16.16b, v19.16b
    uaddl2          v23.8h,  v17.16b, v18.16b
 .endif
    // v20 = p0 + q1 + 2*p1, v22 = q0 + p1 + 2*q1
    uaddl           v20.8h,  v16.8b,  v19.8b
    uaddl           v22.8h,  v17.8b,  v18.8b
    add             v20.8h,  v20.8h,  v4.8h     // mlal?
    add             v22.8h,  v22.8h,  v6.8h
 .ifc \width, 16
    add             v21.8h,  v21.8h,  v5.8h
    add             v23.8h,  v23.8h,  v7.8h
 .endif
    // rounding shift right by 2 with unsigned saturating narrow
    uqrshrn         v24.8b,  v20.8h,  #2
    uqrshrn         v25.8b,  v22.8h,  #2
 .ifc \width, 16
    uqrshrn2        v24.16b, v21.8h,  #2
    uqrshrn2        v25.16b, v23.8h,  #2
 .endif
    // insert the filtered values only where the mask v26 is set
    bit             v16.16b, v24.16b, v26.16b
    bit             v17.16b, v25.16b, v26.16b
 .endm
 // void deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride,
 //                                   int alpha, int beta )
 // Intra chroma deblock across a horizontal edge: reads 16 bytes from each of
 // the rows pix - 2*stride .. pix + stride and rewrites rows pix - stride and
 // pix. AAPCS64: x0 = pix, x1 = stride, w2 = alpha, w3 = beta.
 function deblock_v_chroma_intra_neon, export=1
    // Defined outside this section; presumably provides alpha/beta in v30/v31
    // as h264_loop_filter_chroma_intra expects.
    h264_loop_filter_start_intra
    sub             x0,  x0,  x1, lsl #1       // x0 = pix - 2*stride
    ld1             {v18.16b}, [x0], x1        // p1
    ld1             {v16.16b}, [x0], x1        // p0
    ld1             {v17.16b}, [x0], x1        // q0
    ld1             {v19.16b}, [x0]            // q1, x0 is now pix + stride
    h264_loop_filter_chroma_intra
    sub             x0,  x0,  x1, lsl #1       // rewind to pix - stride
    st1             {v16.16b}, [x0], x1        // filtered p0
    st1             {v17.16b}, [x0], x1        // filtered q0
    ret
 endfunc
 // void deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride,
 //                                         int alpha, int beta )
 // Intra chroma deblock across a vertical edge, 4 rows tall.
 // AAPCS64: x0 = pix, x1 = stride, w2 = alpha, w3 = beta. Each row is read as
 // 8 bytes from pix - 4 and handled as four 16-bit units (presumably
 // interleaved U/V pairs -- confirm with caller).
 function deblock_h_chroma_intra_mbaff_neon, export=1
    // Defined outside this section; presumably provides alpha/beta in v30/v31.
    h264_loop_filter_start_intra
    sub             x4,  x0,  #4               // x4 = load pointer, pix - 4
    sub             x0,  x0,  #2               // x0 = store pointer, pix - 2
    ld1             {v18.8b}, [x4], x1         // row 0
    ld1             {v16.8b}, [x4], x1         // row 1
    ld1             {v17.8b}, [x4], x1         // row 2
    ld1             {v19.8b}, [x4], x1         // row 3
    // Transpose macro defined elsewhere; presumably leaves v18 = p1, v16 = p0,
    // v17 = q0, v19 = q1 with v26-v29 as scratch.
    transpose4x4.h  v18, v16, v17, v19, v26, v27, v28, v29
    // Only the low 8 lanes carry data here, hence width=8.
    h264_loop_filter_chroma_intra width=8
    // One interleaved (p0, q0) 16-bit pair is written back per row.
    st2             {v16.h,v17.h}[0], [x0], x1
    st2             {v16.h,v17.h}[1], [x0], x1
    st2             {v16.h,v17.h}[2], [x0], x1
    st2             {v16.h,v17.h}[3], [x0], x1
    ret
 endfunc
 // void deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride,
 //                                   int alpha, int beta )
 // Intra chroma deblock across a vertical edge, 8 rows tall.
 // AAPCS64: x0 = pix, x1 = stride, w2 = alpha, w3 = beta. Each row is read as
 // 8 bytes from pix - 4 and handled as four 16-bit units (presumably
 // interleaved U/V pairs -- confirm with caller).
 function deblock_h_chroma_intra_neon, export=1
    // Defined outside this section; presumably provides alpha/beta in v30/v31.
    h264_loop_filter_start_intra
    sub             x4,  x0,  #4               // x4 = load pointer, pix - 4
    sub             x0,  x0,  #2               // x0 = store pointer, pix - 2
    // Rows 0-3 fill the low 64 bits, rows 4-7 the high 64 bits.
    ld1             {v18.d}[0], [x4], x1
    ld1             {v16.d}[0], [x4], x1
    ld1             {v17.d}[0], [x4], x1
    ld1             {v19.d}[0], [x4], x1
    ld1             {v18.d}[1], [x4], x1
    ld1             {v16.d}[1], [x4], x1
    ld1             {v17.d}[1], [x4], x1
    ld1             {v19.d}[1], [x4], x1
    // Transpose macro defined elsewhere; presumably leaves v18 = p1, v16 = p0,
    // v17 = q0, v19 = q1 with v26-v29 as scratch.
    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29
    h264_loop_filter_chroma_intra
    // One interleaved (p0, q0) 16-bit pair is written back per row.
    st2             {v16.h,v17.h}[0], [x0], x1
    st2             {v16.h,v17.h}[1], [x0], x1
    st2             {v16.h,v17.h}[2], [x0], x1
    st2             {v16.h,v17.h}[3], [x0], x1
    st2             {v16.h,v17.h}[4], [x0], x1
    st2             {v16.h,v17.h}[5], [x0], x1
    st2             {v16.h,v17.h}[6], [x0], x1
    st2             {v16.h,v17.h}[7], [x0], x1
    ret
 endfunc
 // void deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride,
 //                                       int alpha, int beta )
 // Intra chroma deblock across a vertical edge, 16 rows tall, done as two
 // identical 8-row passes. AAPCS64: x0 = pix, x1 = stride, w2 = alpha,
 // w3 = beta. x4 (loads) and x0 (stores) both advance by one stride per row,
 // so the second pass simply continues where the first one stopped.
 function deblock_h_chroma_422_intra_neon, export=1
    // Defined outside this section; presumably provides alpha/beta in v30/v31.
    h264_loop_filter_start_intra
    sub             x4,  x0,  #4               // x4 = load pointer, pix - 4
    sub             x0,  x0,  #2               // x0 = store pointer, pix - 2
    // ---- rows 0-7 ----
    ld1             {v18.d}[0], [x4], x1
    ld1             {v16.d}[0], [x4], x1
    ld1             {v17.d}[0], [x4], x1
    ld1             {v19.d}[0], [x4], x1
    ld1             {v18.d}[1], [x4], x1
    ld1             {v16.d}[1], [x4], x1
    ld1             {v17.d}[1], [x4], x1
    ld1             {v19.d}[1], [x4], x1
    // Transpose macro defined elsewhere; presumably leaves v18 = p1, v16 = p0,
    // v17 = q0, v19 = q1 with v26-v29 as scratch.
    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29
    h264_loop_filter_chroma_intra
    st2             {v16.h,v17.h}[0], [x0], x1
    st2             {v16.h,v17.h}[1], [x0], x1
    st2             {v16.h,v17.h}[2], [x0], x1
    st2             {v16.h,v17.h}[3], [x0], x1
    st2             {v16.h,v17.h}[4], [x0], x1
    st2             {v16.h,v17.h}[5], [x0], x1
    st2             {v16.h,v17.h}[6], [x0], x1
    st2             {v16.h,v17.h}[7], [x0], x1
    // ---- rows 8-15 ----
    ld1             {v18.d}[0], [x4], x1
    ld1             {v16.d}[0], [x4], x1
    ld1             {v17.d}[0], [x4], x1
    ld1             {v19.d}[0], [x4], x1
    ld1             {v18.d}[1], [x4], x1
    ld1             {v16.d}[1], [x4], x1
    ld1             {v17.d}[1], [x4], x1
    ld1             {v19.d}[1], [x4], x1
    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29
    h264_loop_filter_chroma_intra
    st2             {v16.h,v17.h}[0], [x0], x1
    st2             {v16.h,v17.h}[1], [x0], x1
    st2             {v16.h,v17.h}[2], [x0], x1
    st2             {v16.h,v17.h}[3], [x0], x1
    st2             {v16.h,v17.h}[4], [x0], x1
    st2             {v16.h,v17.h}[5], [x0], x1
    st2             {v16.h,v17.h}[6], [x0], x1
    st2             {v16.h,v17.h}[7], [x0], x1
    ret
 endfunc
 // void deblock_strength( uint8_t nnz[X264_SCAN8_SIZE],
 //                        int8_t ref[2][X264_SCAN8_LUMA_SIZE],
 //                        int16_t mv[2][X264_SCAN8_LUMA_SIZE][2],
 //                        uint8_t bs[2][8][4], int mvy_limit,
 //                        int bframe )
 //
 // Fills two 16-byte strength arrays, bs[0] at bs + 0 and bs[1] at bs + 32.
 // Each output byte is
 //   2 if either of the two compared nnz bytes is non-zero, otherwise
 //   1 if the two refs differ or an mv component difference exceeds its
 //     threshold, otherwise
 //   0.
 // AAPCS64: x0 = nnz, x1 = ref, x2 = mv, x3 = bs, w4 = mvy_limit, w5 = bframe.
 // The ref/mv pass runs once, and a second time when bframe == 1; x1 and x2
 // are left pointing 40 / 160 bytes further by the first pass (presumably the
 // start of list 1 -- confirm against X264_SCAN8_LUMA_SIZE).
 function deblock_strength_neon, export=1
    movi        v4.16b, #0               // accumulator stored to bs[0] (after transpose)
    lsl         w4,  w4,  #8
    add         x3,  x3,  #32            // x3 = &bs[1]
    // w4 = (mvy_limit << 8) - 253: per halfword, low byte = 3 and high byte =
    // mvy_limit - 1 (assuming a small positive mvy_limit). These are
    // subtracted with saturation from the byte-narrowed first/second mv
    // component differences below.
    sub         w4,  w4,  #(1<<8)-3
    movi        v5.16b, #0               // accumulator stored to bs[1]
    dup         v6.8h,  w4
    mov         x6,  #-32                // post-index step from bs[1] back to bs[0]
 bframe:
    // load bytes ref
    add         x2,  x2,  #16            // skip the first 16 bytes of mv
    ld1         {v31.d}[1], [x1], #8
    ld1         {v1.16b}, [x1], #16
    movi        v0.16b,  #0
    ld1         {v2.16b}, [x1], #16
    // v3 / v0 = the loaded rows shifted up by one byte (zero shifted in), so
    // each byte lines up with the byte that precedes it in memory.
    ext         v3.16b,  v0.16b,  v1.16b,  #15
    ext         v0.16b,  v0.16b,  v2.16b,  #15
    // unzip is a macro defined outside this section; presumably a uzp1/uzp2
    // pair that de-interleaves 32-bit words so v22 holds the current 4x4 refs
    // and v20 their one-byte-shifted counterparts.
    unzip       v21.4s,  v22.4s,  v1.4s,   v2.4s
    unzip       v23.4s,  v20.4s,  v3.4s,   v0.4s
    // v21 = v22 delayed by one 32-bit group, with the first group taken from
    // the high word of v31 (the 8 bytes loaded first).
    ext         v21.16b, v31.16b, v22.16b, #12
    eor         v0.16b,  v20.16b, v22.16b       // non-zero where the refs differ
    eor         v1.16b,  v21.16b, v22.16b
    orr         v4.16b,  v4.16b,  v0.16b
    orr         v5.16b,  v5.16b,  v1.16b
    ld1         {v21.8h}, [x2], #16      // mv + 0x10
    ld1         {v19.8h}, [x2], #16      // mv + 0x20
    ld1         {v22.8h}, [x2], #16      // mv + 0x30
    ld1         {v18.8h}, [x2], #16      // mv + 0x40
    ld1         {v23.8h}, [x2], #16      // mv + 0x50
    // Build "previous mv" rows: last 4 bytes of the preceding 16-byte load
    // followed by the first 12 bytes of the current row.
    ext         v19.16b, v19.16b, v22.16b, #12
    ext         v18.16b, v18.16b, v23.16b, #12
    sabd        v0.8h,   v22.8h,  v19.8h
    ld1         {v19.8h}, [x2], #16      // mv + 0x60
    sabd        v1.8h,   v23.8h,  v18.8h
    ld1         {v24.8h}, [x2], #16      // mv + 0x70
    uqxtn       v0.8b,   v0.8h
    ld1         {v18.8h}, [x2], #16      // mv + 0x80
    ld1         {v25.8h}, [x2], #16      // mv + 0x90
    uqxtn2      v0.16b,  v1.8h
    ext         v19.16b, v19.16b, v24.16b, #12
    ext         v18.16b, v18.16b, v25.16b, #12
    sabd        v1.8h,   v24.8h,  v19.8h
    sabd        v2.8h,   v25.8h,  v18.8h
    uqxtn       v1.8b,   v1.8h
    uqxtn2      v1.16b,  v2.8h
    // Saturating subtract of the thresholds leaves a non-zero byte only where
    // a difference exceeds its threshold; the second narrowing folds each
    // two-byte (component 0, component 1) pair into one byte.
    uqsub       v0.16b,  v0.16b,  v6.16b
    uqsub       v1.16b,  v1.16b,  v6.16b
    uqxtn       v0.8b,   v0.8h
    uqxtn2      v0.16b,  v1.8h
    sabd        v1.8h,   v22.8h,  v23.8h
    orr         v4.16b,  v4.16b,  v0.16b
    // Same test between consecutive mv rows, accumulated into v5.
    sabd        v0.8h,   v21.8h,  v22.8h
    sabd        v2.8h,   v23.8h,  v24.8h
    sabd        v3.8h,   v24.8h,  v25.8h
    uqxtn       v0.8b,   v0.8h
    uqxtn2      v0.16b,  v1.8h
    uqxtn       v1.8b,   v2.8h
    uqxtn2      v1.16b,  v3.8h
    uqsub       v0.16b,  v0.16b,  v6.16b
    uqsub       v1.16b,  v1.16b,  v6.16b
    uqxtn       v0.8b,   v0.8h
    uqxtn2      v0.16b,  v1.8h
    subs        w5,  w5,  #1
    orr         v5.16b,  v5.16b,  v0.16b
    b.eq        bframe                   // taken only when w5 was 1 before the subs
    movi        v6.16b, #1               // v6 is reused as the constant 1 from here on
    // load bytes nnz
    ld1         {v31.d}[1], [x0], #8
    ld1         {v1.16b}, [x0], #16
    movi        v0.16b,  #0
    ld1         {v2.16b}, [x0], #16
    // Same neighbour arrangement as for ref above.
    ext         v3.16b,  v0.16b,  v1.16b,  #15
    ext         v0.16b,  v0.16b,  v2.16b,  #15
    unzip       v21.4s,  v22.4s,  v1.4s,   v2.4s
    unzip       v23.4s,  v20.4s,  v3.4s,   v0.4s
    ext         v21.16b, v31.16b, v22.16b, #12
    // movrel is a macro defined outside this section; presumably loads the
    // address of transpose_table into x7.
    movrel      x7,  transpose_table
    ld1         {v7.16b}, [x7]
    orr         v0.16b,  v20.16b, v22.16b       // non-zero if either nnz byte is set
    orr         v1.16b,  v21.16b, v22.16b
    umin        v0.16b,  v0.16b,  v6.16b
    umin        v1.16b,  v1.16b,  v6.16b
    umin        v4.16b,  v4.16b,  v6.16b        // mv ? 1 : 0
    umin        v5.16b,  v5.16b,  v6.16b
    add         v0.16b,  v0.16b,  v0.16b        // nnz ? 2 : 0
    add         v1.16b,  v1.16b,  v1.16b
    umax        v4.16b,  v4.16b,  v0.16b
    umax        v5.16b,  v5.16b,  v1.16b
    tbl         v6.16b, {v4.16b}, v7.16b        // 4x4 byte transpose of v4
    st1         {v5.16b}, [x3], x6       // bs[1]
    st1         {v6.16b}, [x3]           // bs[0]
    ret
 endfunc
 // TBL index vector that transposes a 4x4 byte matrix held in one 16-byte
 // register (output byte 4*r + c takes input byte 4*c + r). Used by
 // deblock_strength_neon to reorder its first result before storing bs[0].
 const transpose_table
    .byte 0, 4,  8, 12
    .byte 1, 5,  9, 13
    .byte 2, 6, 10, 14
    .byte 3, 7, 11, 15
 endconst
--- a/common/aarch64/deblock.h
+++ b/common/aarch64/deblock.h
@@ -0,0 +1,61 @@
 /*****************************************************************************
 * deblock.h: aarch64 deblocking
 *****************************************************************************
 * Copyright (C) 2017-2025 x264 project
 *
 * Authors: Anton Mitrofanov <BugMaster@narod.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_AARCH64_DEBLOCK_H
 #define X264_AARCH64_DEBLOCK_H
 // Prototypes for the AArch64 assembly deblocking routines. Every public name
 // is first mapped through x264_template() (defined elsewhere; presumably it
 // decorates the symbol per build configuration) and then declared.
 //
 // Normal filters take a tc0 pointer in addition to alpha/beta.
 #define x264_deblock_v_luma_neon x264_template(deblock_v_luma_neon)
 void x264_deblock_v_luma_neon  ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 #define x264_deblock_h_luma_neon x264_template(deblock_h_luma_neon)
 void x264_deblock_h_luma_neon  ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 #define x264_deblock_v_chroma_neon x264_template(deblock_v_chroma_neon)
 void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 #define x264_deblock_h_chroma_neon x264_template(deblock_h_chroma_neon)
 void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 // Boundary-strength computation; writes the bs[2][8][4] array.
 #define x264_deblock_strength_neon x264_template(deblock_strength_neon)
 void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                 int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                 int mvy_limit, int bframe );
 #define x264_deblock_h_chroma_422_neon x264_template(deblock_h_chroma_422_neon)
 void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 #define x264_deblock_h_chroma_mbaff_neon x264_template(deblock_h_chroma_mbaff_neon)
 void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 // Intra filters: same leading arguments, no tc0.
 #define x264_deblock_h_chroma_intra_mbaff_neon x264_template(deblock_h_chroma_intra_mbaff_neon)
 void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 #define x264_deblock_h_chroma_intra_neon x264_template(deblock_h_chroma_intra_neon)
 void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 #define x264_deblock_h_chroma_422_intra_neon x264_template(deblock_h_chroma_422_intra_neon)
 void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 #define x264_deblock_v_chroma_intra_neon x264_template(deblock_v_chroma_intra_neon)
 void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 #define x264_deblock_h_luma_intra_neon x264_template(deblock_h_luma_intra_neon)
 void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 #define x264_deblock_v_luma_intra_neon x264_template(deblock_v_luma_intra_neon)
 void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 // SVE variant; same signature as x264_deblock_v_chroma_neon.
 #define x264_deblock_v_chroma_sve x264_template(deblock_v_chroma_sve)
 void x264_deblock_v_chroma_sve( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 #endif
--- a/common/aarch64/mc-a-common.S
+++ b/common/aarch64/mc-a-common.S
@@ -0,0 +1,66 @@
 /****************************************************************************
 * mc-a-common.S: aarch64 motion compensation
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *          Mans Rullgard <mans@mansr.com>
 *          Stefan Groenroos <stefan.gronroos@gmail.com>
 *          David Chen   <david.chen@myais.com.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 // This file contains the NEON macros and functions that are intended to be used by
 // the SVE/SVE2 functions as well
 #if BIT_DEPTH == 8
 // Weight preparation for the weighted pixel_avg paths, one macro per sign
 // case. On entry w6 = weight and w7 = 64 - weight (as computed by the
 // AVGH_SVE dispatcher in mc-a-sve.S). Each macro makes both values
 // non-negative; the matching weight_* macro then adds or subtracts the
 // corresponding product.
 // 0 < weight < 64
 .macro load_weights_add_add
    mov         w6,  w6                 // both weights already positive; nothing to negate
 .endm
 // weight > 64
 .macro load_weights_add_sub
    neg         w7,  w7                 // 64 - weight is negative here: use its magnitude
 .endm
 // weight < 0
 .macro load_weights_sub_add
    neg         w6,  w6                 // weight is negative here: use its magnitude
 .endm
 // Unweighted 4-pixel-wide average of two sources, two rows per iteration:
 // dst = (src1 + src2 + 1) >> 1.
 // Not exported; reached by a branch from the pixel_avg_4xN entry points with
 // x0 = dst, x1 = dst_stride, x2 = src1, x3 = src1_stride, x4 = src2,
 // x5 = src2_stride and w9 = row count (must be even and > 0).
 function pixel_avg_w4_neon
 1:  subs        w9,  w9,  #2            // two rows consumed per pass
    ld1         {v0.s}[0], [x2], x3
    ld1         {v2.s}[0], [x4], x5
    urhadd      v0.8b,  v0.8b,  v2.8b   // rounding halving add
    ld1         {v1.s}[0], [x2], x3
    ld1         {v3.s}[0], [x4], x5
    urhadd      v1.8b,  v1.8b,  v3.8b
    st1         {v0.s}[0], [x0], x1
    st1         {v1.s}[0], [x0], x1
    b.gt        1b                      // flags still come from the subs above
    ret
 endfunc
 #else // BIT_DEPTH == 10
 #endif
--- a/common/aarch64/mc-a-sve.S
+++ b/common/aarch64/mc-a-sve.S
@@ -0,0 +1,108 @@
 /*****************************************************************************
 * mc-a-sve.S: aarch64 motion compensation
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Chen <david.chen@myais.com.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "asm.S"
 #include "mc-a-common.S"
 ENABLE_SVE
 #if BIT_DEPTH == 8
 // void pixel_avg( uint8_t *dst,  intptr_t dst_stride,
 //                 uint8_t *src1, intptr_t src1_stride,
 //                 uint8_t *src2, intptr_t src2_stride, int weight );
 //
 // AVGH_SVE w, h emits the exported entry point pixel_avg_<w>x<h>_sve. It does
 // no pixel work itself: it sets w9 = h and w7 = 64 - weight, then tail-branches
 // to the matching worker:
 //   weight == 32      -> pixel_avg_w<w>_neon (plain rounding average)
 //   weight  > 64      -> pixel_avg_weight_w<w>_add_sub_sve
 //   0 <= weight <= 64 -> pixel_avg_weight_w<w>_add_add_sve
 //   weight  < 0       -> pixel_avg_weight_w<w>_sub_add_sve
 .macro AVGH_SVE w h
 function pixel_avg_\w\()x\h\()_sve, export=1
    mov         w10, #64
    cmp         w6,  #32
    mov         w9, #\h                  // row count for the worker loop
    b.eq        pixel_avg_w\w\()_neon
    subs        w7,  w10,  w6            // w7 = 64 - weight, flags reused below
    b.lt        pixel_avg_weight_w\w\()_add_sub_sve     // weight > 64
    cmp         w6,  #0
    b.ge        pixel_avg_weight_w\w\()_add_add_sve
    b           pixel_avg_weight_w\w\()_sub_add_sve     // weight < 0
 endfunc
 .endm
 // Width-4 entry points for heights 2, 4, 8 and 16.
 AVGH_SVE  4, 2
 AVGH_SVE  4, 4
 AVGH_SVE  4, 8
 AVGH_SVE  4, 16
 // Per-case weighted sum used by AVG_WEIGHT_SVE. v30.8h / v31.8h hold the two
 // weight magnitudes (broadcast from w6 / w7 after load_weights_*); \s1 and
 // \s2 are the widened source rows. The h= parameter is accepted but unused
 // in these bodies.
 // 0 < weight < 64
 .macro weight_add_add_sve dst, s1, s2, h=
    mul         \dst, \s1, v30.8h       // s1*v30
    mla         \dst, \s2, v31.8h       //  + s2*v31
 .endm
 // weight > 64
 .macro weight_add_sub_sve dst, s1, s2, h=
    mul         \dst, \s1, v30.8h       // s1*v30
    mls         \dst, \s2, v31.8h       //  - s2*v31
 .endm
 // weight < 0
 .macro weight_sub_add_sve dst, s1, s2, h=
    mul         \dst, \s2, v31.8h       // s2*v31
    mls         \dst, \s1, v30.8h       //  - s1*v30
 .endm
 // AVG_WEIGHT_SVE ext emits pixel_avg_weight_w4_<ext>_sve, the 4-pixel-wide
 // weighted average worker for one sign case (ext = add_add, add_sub or
 // sub_add). Not exported; entered from the AVGH_SVE dispatchers with
 // x0 = dst, x1 = dst_stride, x2 = src1, x3 = src1_stride, x4 = src2,
 // x5 = src2_stride, w6 = weight, w7 = 64 - weight, w9 = row count (even).
 // Each output pixel is the weighted sum rounded and shifted right by 6,
 // saturated to unsigned 8 bits. Two rows are produced per iteration.
 .macro AVG_WEIGHT_SVE ext
 function pixel_avg_weight_w4_\ext\()_sve
    load_weights_\ext
    // First 8 byte-granule predicate bits set; with .h element loads this
    // should activate 4 elements, i.e. 4 pixels widened to 16 bits -- confirm
    // against the SVE predicate layout rules.
    ptrue       p0.b, vl8
    dup         v30.8h, w6
    dup         v31.8h, w7
 1:  // height loop
    subs        w9,  w9,  #2
    ld1b        {z0.h}, p0/z, [x2]      // src1 row 0, zero-extended bytes
    add         x2, x2, x3
    ld1b        {z1.h}, p0/z, [x4]      // src2 row 0
    add         x4, x4, x5
    weight_\ext\()_sve v4.8h,  v0.8h,  v1.8h
    ld1b        {z2.h}, p0/z, [x2]      // src1 row 1
    add         x2, x2, x3
    ld1b        {z3.h}, p0/z, [x4]      // src2 row 1
    add         x4, x4, x5
    sqrshrun    v0.8b,  v4.8h,  #6      // round, >> 6, saturate to u8
    weight_\ext\()_sve v5.8h,  v2.8h,  v3.8h
    st1         {v0.s}[0], [x0], x1     // 4 bytes per row
    sqrshrun    v1.8b,  v5.8h,  #6
    st1         {v1.s}[0], [x0], x1
    b.gt        1b                      // flags still come from the subs above
    ret
 endfunc
 .endm
 // One worker per weight sign case.
 AVG_WEIGHT_SVE add_add
 AVG_WEIGHT_SVE add_sub
 AVG_WEIGHT_SVE sub_add
 #else // BIT_DEPTH == 10
 #endif
--- a/common/aarch64/mc-a.S
+++ b/common/aarch64/mc-a.S
--- a/common/aarch64/mc-c.c
+++ b/common/aarch64/mc-c.c
@@ -0,0 +1,371 @@
 /*****************************************************************************
 * mc-c.c: aarch64 motion compensation
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "common/common.h"
 #include "mc.h"
 // Prototypes for routines implemented outside this file (presumably in the
 // AArch64 assembly sources). Each name is mapped through x264_template()
 // before being declared.
 // Prefetch helpers.
 #define x264_prefetch_ref_aarch64 x264_template(prefetch_ref_aarch64)
 void x264_prefetch_ref_aarch64( pixel *, intptr_t, int );
 #define x264_prefetch_fenc_420_aarch64 x264_template(prefetch_fenc_420_aarch64)
 void x264_prefetch_fenc_420_aarch64( pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_prefetch_fenc_422_aarch64 x264_template(prefetch_fenc_422_aarch64)
 void x264_prefetch_fenc_422_aarch64( pixel *, intptr_t, pixel *, intptr_t, int );
 // Aligned memory copy / clear.
 #define x264_memcpy_aligned_neon x264_template(memcpy_aligned_neon)
 void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
 #define x264_memzero_aligned_neon x264_template(memzero_aligned_neon)
 void x264_memzero_aligned_neon( void *dst, size_t n );
 // pixel_avg_WxH: ( dst, dst_stride, src1, src1_stride, src2, src2_stride, weight )
 // per the prototype comment in mc-a-sve.S.
 #define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
 void x264_pixel_avg_16x16_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
 void x264_pixel_avg_16x8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
 void x264_pixel_avg_8x16_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
 void x264_pixel_avg_8x8_neon  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
 void x264_pixel_avg_8x4_neon  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
 void x264_pixel_avg_4x16_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
 void x264_pixel_avg_4x8_neon  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
 void x264_pixel_avg_4x4_neon  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
 void x264_pixel_avg_4x2_neon  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
 // SVE variants exist for the width-4 sizes only.
 #define x264_pixel_avg_4x16_sve x264_template(pixel_avg_4x16_sve)
 void x264_pixel_avg_4x16_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_pixel_avg_4x8_sve x264_template(pixel_avg_4x8_sve)
 void x264_pixel_avg_4x8_sve  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_pixel_avg_4x4_sve x264_template(pixel_avg_4x4_sve)
 void x264_pixel_avg_4x4_sve  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_pixel_avg_4x2_sve x264_template(pixel_avg_4x2_sve)
 void x264_pixel_avg_4x2_sve  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
 // pixel_avg2_wN: called below as ( dst, dst_stride, src1, src_stride, src2, height );
 // both sources share a single stride argument.
 #define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
 void x264_pixel_avg2_w4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
 #define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
 void x264_pixel_avg2_w8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
 #define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon)
 void x264_pixel_avg2_w16_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
 #define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
 void x264_pixel_avg2_w20_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
 // Plane copy / (de)interleave cores. The i_* parameters are the strides
 // paired with the pointer that precedes them; w and h are width and height.
 #define x264_plane_copy_core_neon x264_template(plane_copy_core_neon)
 void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
                                 pixel *src, intptr_t i_src, int w, int h );
 #define x264_plane_copy_swap_core_neon x264_template(plane_copy_swap_core_neon)
 void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
                                      pixel *src, intptr_t i_src, int w, int h );
 #define x264_plane_copy_deinterleave_neon x264_template(plane_copy_deinterleave_neon)
 void x264_plane_copy_deinterleave_neon(  pixel *dstu, intptr_t i_dstu,
                                          pixel *dstv, intptr_t i_dstv,
                                          pixel *src,  intptr_t i_src, int w, int h );
 // pw is presumably the source pixel width in bytes (3 or 4) -- confirm
 // against the assembly implementation.
 #define x264_plane_copy_deinterleave_rgb_neon x264_template(plane_copy_deinterleave_rgb_neon)
 void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
                                             pixel *dstb, intptr_t i_dstb,
                                             pixel *dstc, intptr_t i_dstc,
                                             pixel *src,  intptr_t i_src, int pw, int w, int h );
 #define x264_plane_copy_interleave_core_neon x264_template(plane_copy_interleave_core_neon)
 void x264_plane_copy_interleave_core_neon( pixel *dst,  intptr_t i_dst,
                                            pixel *srcu, intptr_t i_srcu,
                                            pixel *srcv, intptr_t i_srcv, int w, int h );
 // Chroma transfer helpers; note that only one side of each takes a stride.
 #define x264_store_interleave_chroma_neon x264_template(store_interleave_chroma_neon)
 void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
 #define x264_load_deinterleave_chroma_fdec_neon x264_template(load_deinterleave_chroma_fdec_neon)
 void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
 #define x264_load_deinterleave_chroma_fenc_neon x264_template(load_deinterleave_chroma_fenc_neon)
 void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
 // Name mappings for the weighted-prediction kernels: four widths
 // (w4/w8/w16/w20) times four variants (general, _nodenom, _offsetadd,
 // _offsetsub). The prototypes themselves come from MC_WEIGHT() below.
 #define x264_mc_weight_w16_neon x264_template(mc_weight_w16_neon)
 #define x264_mc_weight_w16_nodenom_neon x264_template(mc_weight_w16_nodenom_neon)
 #define x264_mc_weight_w16_offsetadd_neon x264_template(mc_weight_w16_offsetadd_neon)
 #define x264_mc_weight_w16_offsetsub_neon x264_template(mc_weight_w16_offsetsub_neon)
 #define x264_mc_weight_w20_neon x264_template(mc_weight_w20_neon)
 #define x264_mc_weight_w20_nodenom_neon x264_template(mc_weight_w20_nodenom_neon)
 #define x264_mc_weight_w20_offsetadd_neon x264_template(mc_weight_w20_offsetadd_neon)
 #define x264_mc_weight_w20_offsetsub_neon x264_template(mc_weight_w20_offsetsub_neon)
 #define x264_mc_weight_w4_neon x264_template(mc_weight_w4_neon)
 #define x264_mc_weight_w4_nodenom_neon x264_template(mc_weight_w4_nodenom_neon)
 #define x264_mc_weight_w4_offsetadd_neon x264_template(mc_weight_w4_offsetadd_neon)
 #define x264_mc_weight_w4_offsetsub_neon x264_template(mc_weight_w4_offsetsub_neon)
 #define x264_mc_weight_w8_neon x264_template(mc_weight_w8_neon)
 #define x264_mc_weight_w8_nodenom_neon x264_template(mc_weight_w8_nodenom_neon)
 #define x264_mc_weight_w8_offsetadd_neon x264_template(mc_weight_w8_offsetadd_neon)
 #define x264_mc_weight_w8_offsetsub_neon x264_template(mc_weight_w8_offsetsub_neon)
 // MC_WEIGHT(func) declares the four width kernels of variant `func` and
 // builds the dispatch table mc<func>_wtab_neon[6]. The tables are indexed
 // with i_width>>2 by mc_luma_neon / get_ref_neon, so the slots are:
 //   [0] -> w4, [1] -> w4, [2] -> w8, [3] -> w16, [4] -> w16, [5] -> w20
 // i.e. slots 0 and 3 reuse the next wider kernel.
 #define MC_WEIGHT(func)\
 void x264_mc_weight_w20##func##_neon( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
 void x264_mc_weight_w16##func##_neon( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
 void x264_mc_weight_w8##func##_neon ( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
 void x264_mc_weight_w4##func##_neon ( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
 \
 static void (* mc##func##_wtab_neon[6])( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ) =\
 {\
    x264_mc_weight_w4##func##_neon,\
    x264_mc_weight_w4##func##_neon,\
    x264_mc_weight_w8##func##_neon,\
    x264_mc_weight_w16##func##_neon,\
    x264_mc_weight_w16##func##_neon,\
    x264_mc_weight_w20##func##_neon,\
 };
 // Instantiates mc_wtab_neon, mc_nodenom_wtab_neon, mc_offsetadd_wtab_neon
 // and mc_offsetsub_wtab_neon (selected in weight_cache_neon).
 MC_WEIGHT()
 MC_WEIGHT(_nodenom)
 MC_WEIGHT(_offsetadd)
 MC_WEIGHT(_offsetsub)
 // mc_copy_wN: called below as ( dst, dst_stride, src, src_stride, height ).
 #define x264_mc_copy_w4_neon x264_template(mc_copy_w4_neon)
 void x264_mc_copy_w4_neon ( pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_mc_copy_w8_neon x264_template(mc_copy_w8_neon)
 void x264_mc_copy_w8_neon ( pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_mc_copy_w16_neon x264_template(mc_copy_w16_neon)
 void x264_mc_copy_w16_neon( pixel *, intptr_t, pixel *, intptr_t, int );
 // Chroma motion compensation.
 #define x264_mc_chroma_neon x264_template(mc_chroma_neon)
 void x264_mc_chroma_neon( pixel *, pixel *, intptr_t, pixel *, intptr_t, int, int, int, int );
 // Integral-image initialisation. Note that integral_init8v takes a single
 // uint16_t pointer whereas integral_init4v takes two.
 #define x264_integral_init4h_neon x264_template(integral_init4h_neon)
 void x264_integral_init4h_neon( uint16_t *, pixel *, intptr_t );
 #define x264_integral_init4v_neon x264_template(integral_init4v_neon)
 void x264_integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
 #define x264_integral_init8h_neon x264_template(integral_init8h_neon)
 void x264_integral_init8h_neon( uint16_t *, pixel *, intptr_t );
 #define x264_integral_init8v_neon x264_template(integral_init8v_neon)
 void x264_integral_init8v_neon( uint16_t *, intptr_t );
 // Low-resolution frame generation.
 #define x264_frame_init_lowres_core_neon x264_template(frame_init_lowres_core_neon)
 void x264_frame_init_lowres_core_neon( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, intptr_t, int, int );
 // Macroblock-tree helpers.
 #define x264_mbtree_propagate_cost_neon x264_template(mbtree_propagate_cost_neon)
 void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
 #define x264_mbtree_fix8_pack_neon x264_template(mbtree_fix8_pack_neon)
 void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
 #define x264_mbtree_fix8_unpack_neon x264_template(mbtree_fix8_unpack_neon)
 void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
 // Two-source averaging kernels, indexed by i_width>>2 in mc_luma_neon and
 // get_ref_neon: [1] = width 4, [2] = 8, [3] = 12 (served by the w16 kernel),
 // [4] = 16, [5] = 20. Slot 0 has no kernel and must never be selected.
 static void (* const pixel_avg_wtab_neon[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, int ) =
 {
    NULL,
    x264_pixel_avg2_w4_neon,
    x264_pixel_avg2_w8_neon,
    x264_pixel_avg2_w16_neon,   // no slower than w12, so no point in a separate function
    x264_pixel_avg2_w16_neon,
    x264_pixel_avg2_w20_neon,
 };
 // Plain copy kernels, indexed by i_width>>2 in mc_luma_neon: only widths
 // 4, 8 and 16 (slots 1, 2, 4) have a kernel. Slots 0 and 3 are NULL, so
 // callers must not reach the copy path with any other width.
 static void (* const mc_copy_wtab_neon[5])( pixel *, intptr_t, pixel *, intptr_t, int ) =
 {
    NULL,
    x264_mc_copy_w4_neon,
    x264_mc_copy_w8_neon,
    NULL,
    x264_mc_copy_w16_neon,
 };
 /* Pick the weighted-prediction kernel table for *w.
  *  - scale == 1<<denom : offset-only kernels; |i_offset| is cached in
  *    cachea[0] and the sign selects the add or sub table.
  *  - denom == 0        : kernels without a denominator shift.
  *  - otherwise         : general kernels.
  * h is unused. */
 static void weight_cache_neon( x264_t *h, x264_weight_t *w )
 {
    if( w->i_scale != 1<<w->i_denom )
    {
        /* A real scale factor is involved. */
        w->weightfn = w->i_denom ? mc_wtab_neon : mc_nodenom_wtab_neon;
        return;
    }
    /* Unit scale: only the offset matters. */
    if( w->i_offset >= 0 )
    {
        w->weightfn = mc_offsetadd_wtab_neon;
        w->cachea[0] = w->i_offset;
    }
    else
    {
        w->weightfn = mc_offsetsub_wtab_neon;
        w->cachea[0] = -w->i_offset;
    }
 }
 /* Luma motion compensation into dst.
  * mvx/mvy carry two fractional bits; src[] holds the four reference planes
  * selected through x264_hpel_ref0/x264_hpel_ref1. Positions that need
  * quarter-pel interpolation average two planes, the others use a single
  * plane. Weighting is applied whenever weight->weightfn is set. */
 static void mc_luma_neon( pixel *dst,    intptr_t i_dst_stride,
                          pixel *src[4], intptr_t i_src_stride,
                          int mvx, int mvy,
                          int i_width, int i_height, const x264_weight_t *weight )
 {
    const int frac_x = mvx&3;
    const int frac_y = mvy&3;
    const int qpel_idx = (frac_y<<2) + frac_x;
    const int wtab_idx = i_width>>2;    /* index into the width dispatch tables */
    const intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( frac_y == 3 )              // explicit if() to force conditional add
        src1 += i_src_stride;
    if( !(qpel_idx & 5) )
    {
        /* no qpel interpolation needed: single source plane */
        if( weight->weightfn )
            weight->weightfn[wtab_idx]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
        else
            mc_copy_wtab_neon[wtab_idx]( dst, i_dst_stride, src1, i_src_stride, i_height );
    }
    else
    {
        /* qpel: average two planes, then weight the result in place */
        pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
        pixel_avg_wtab_neon[wtab_idx]( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
        if( weight->weightfn )
            weight->weightfn[wtab_idx]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
    }
 }
 /* Return a pointer to the motion-compensated luma block.
  * When neither interpolation nor weighting is needed, no pixels are copied:
  * the function returns a pointer into the reference plane and overwrites
  * *i_dst_stride with i_src_stride. Otherwise the block is produced in dst
  * (using the incoming *i_dst_stride) and dst is returned. */
 static pixel *get_ref_neon( pixel *dst,   intptr_t *i_dst_stride,
                               pixel *src[4], intptr_t i_src_stride,
                               int mvx, int mvy,
                               int i_width, int i_height, const x264_weight_t *weight )
 {
    const int frac_x = mvx&3;
    const int frac_y = mvy&3;
    const int qpel_idx = (frac_y<<2) + frac_x;
    const int wtab_idx = i_width>>2;    /* index into the width dispatch tables */
    const intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( frac_y == 3 )              // explicit if() to force conditional add
        src1 += i_src_stride;
    if( !(qpel_idx & 5) && !weight->weightfn )
    {
        /* nothing to compute: hand back the reference plane itself */
        *i_dst_stride = i_src_stride;
        return src1;
    }
    if( qpel_idx & 5 )
    {
        /* qpel: average two planes, then weight the result in place */
        pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
        pixel_avg_wtab_neon[wtab_idx]( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
        if( weight->weightfn )
            weight->weightfn[wtab_idx]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
    }
    else
        weight->weightfn[wtab_idx]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
    return dst;
 }
 /* Half-pel filter entry points.  Only the prototypes appear here; the
 * x264_template() wrapper is defined elsewhere (presumably it adds the
 * per-bit-depth symbol prefix -- confirm in the common headers). */
 #define x264_hpel_filter_neon x264_template(hpel_filter_neon)
 void x264_hpel_filter_neon( pixel *dsth, pixel *dstv, pixel *dstc,
                            pixel *src, intptr_t stride, int width,
                            int height, int16_t *buf );
 /* The I8MM variant is declared only for 8-bit builds with I8MM support. */
 #if !HIGH_BIT_DEPTH && HAVE_I8MM
 #define x264_hpel_filter_neon_i8mm x264_template(hpel_filter_neon_i8mm)
 void x264_hpel_filter_neon_i8mm( pixel *dsth, pixel *dstv, pixel *dstc,
                                 pixel *src, intptr_t stride, int width,
                                 int height, int16_t *buf );
 #endif // !HIGH_BIT_DEPTH && HAVE_I8MM
 /* NOTE(review): these macros are defined outside this file.  Presumably they
 * expand to the static plane_copy_neon, plane_copy_swap_neon,
 * plane_copy_interleave_neon and mbtree_propagate_list_neon wrappers that
 * x264_mc_init_aarch64 installs below -- confirm against the macro definitions. */
 PLANE_COPY(16, neon)
 PLANE_COPY_SWAP(16, neon)
 PLANE_INTERLEAVE(neon)
 PROPAGATE_LIST(neon)
 /* Installs the aarch64 motion-compensation routines selected by the cpu flag
 * mask into pf.  Entries are assigned in order ARMv8 -> NEON -> SVE -> I8MM,
 * so a later, more specialised block overrides what an earlier one stored. */
 void x264_mc_init_aarch64( uint32_t cpu, x264_mc_functions_t *pf )
 {
    /* Prefetch helpers only need baseline ARMv8. */
    if( cpu & X264_CPU_ARMV8 )
    {
        pf->prefetch_ref      = x264_prefetch_ref_aarch64;
        pf->prefetch_fenc_420 = x264_prefetch_fenc_420_aarch64;
        pf->prefetch_fenc_422 = x264_prefetch_fenc_422_aarch64;
    }
    if( cpu & X264_CPU_NEON )
    {
        /* Motion compensation proper. */
        pf->mc_luma   = mc_luma_neon;
        pf->get_ref   = get_ref_neon;
        pf->mc_chroma = x264_mc_chroma_neon;
        pf->hpel_filter = x264_hpel_filter_neon;
        /* Bi-prediction averaging, one entry per partition size. */
        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
        pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
        pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
        pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
        pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
        pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
        pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
        pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
        pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;
        /* Weighted prediction. */
        pf->weight_cache = weight_cache_neon;
        pf->weight       = mc_wtab_neon;
        pf->offsetadd    = mc_offsetadd_wtab_neon;
        pf->offsetsub    = mc_offsetsub_wtab_neon;
        /* Block copies. */
        pf->copy[PIXEL_16x16]    = x264_mc_copy_w16_neon;
        pf->copy[PIXEL_8x8]      = x264_mc_copy_w8_neon;
        pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_neon;
        pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
        /* Plane copies and chroma (de)interleaving. */
        pf->plane_copy                  = plane_copy_neon;
        pf->plane_copy_swap             = plane_copy_swap_neon;
        pf->plane_copy_interleave       = plane_copy_interleave_neon;
        pf->plane_copy_deinterleave     = x264_plane_copy_deinterleave_neon;
        pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
        pf->store_interleave_chroma       = x264_store_interleave_chroma_neon;
        /* Low-resolution frame and integral images. */
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
        pf->integral_init4h = x264_integral_init4h_neon;
        pf->integral_init4v = x264_integral_init4v_neon;
        pf->integral_init8h = x264_integral_init8h_neon;
        pf->integral_init8v = x264_integral_init8v_neon;
        /* Macroblock-tree helpers. */
        pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
        pf->mbtree_propagate_list = mbtree_propagate_list_neon;
        pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_neon;
        pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_neon;
        /* Aligned memory helpers. */
        pf->memcpy_aligned  = x264_memcpy_aligned_neon;
        pf->memzero_aligned = x264_memzero_aligned_neon;
    }
 #if !HIGH_BIT_DEPTH && HAVE_SVE
    /* 8-bit only: SVE replaces the 4-pixel-wide averaging routines. */
    if( cpu & X264_CPU_SVE )
    {
        pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_sve;
        pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_sve;
        pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_sve;
        pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_sve;
    }
 #endif // !HIGH_BIT_DEPTH && HAVE_SVE
 #if !HIGH_BIT_DEPTH && HAVE_I8MM
    /* 8-bit only: I8MM replaces the half-pel filter. */
    if( cpu & X264_CPU_I8MM )
        pf->hpel_filter = x264_hpel_filter_neon_i8mm;
 #endif // !HIGH_BIT_DEPTH && HAVE_I8MM
 }
--- a/common/aarch64/mc.h
+++ b/common/aarch64/mc.h
@@ -0,0 +1,32 @@
 /*****************************************************************************
 * mc.h: aarch64 motion compensation
 *****************************************************************************
 * Copyright (C) 2014-2025 x264 project
 *
 * Authors: Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 /* Include guard for the aarch64 motion-compensation header. */
 #ifndef X264_AARCH64_MC_H
 #define X264_AARCH64_MC_H
 /* Maps the public name through x264_template(), which is defined elsewhere
 * (presumably it adds a per-bit-depth prefix -- confirm). */
 #define x264_mc_init_aarch64 x264_template(mc_init_aarch64)
 /* Fills pf with the aarch64 routines enabled by the cpu flag mask. */
 void x264_mc_init_aarch64( uint32_t cpu, x264_mc_functions_t *pf );
 #endif
--- a/common/aarch64/pixel-a-common.S
+++ b/common/aarch64/pixel-a-common.S
@@ -0,0 +1,44 @@
 /****************************************************************************
 * pixel-a-common.S: aarch64 pixel metrics
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *          David Chen   <david.chen@myais.com.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 // This file contains the NEON macros and constants that are intended to be used by
 // the SVE/SVE2 functions as well
 // Two rows of eight 16-bit lane masks for use with AND: 0 clears a lane,
 // -1 keeps it. Row 0 clears lanes 0 and 4, row 1 clears lane 0 only.
 // HADAMARD_AC_SVE in pixel-a-sve.S loads the two rows into v30 and v31.
 // NOTE(review): presumably the cleared lanes hold the DC terms of the 4x4
 // and 8x8 transforms -- confirm against the lane layout in the users.
 const mask_ac_4_8
 .short 0, -1, -1, -1,  0, -1, -1, -1
 .short 0, -1, -1, -1, -1, -1, -1, -1
 endconst
 // Applies SUMSUB_AB to two independent pairs: (a, b) producing s1 and d1,
 // and (c, d) producing s2 and d2.
 // SUMSUB_AB is defined outside this file; presumably it writes a+b to its
 // first operand and a-b to its second -- confirm in asm.S.
 .macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
    SUMSUB_AB   \s1, \d1, \a, \b
    SUMSUB_AB   \s2, \d2, \c, \d
 .endm
 // Two chained SUMSUB_ABCD stages over four vectors.
 //   stage 1: pairs (r1, r2) and (r3, r4) are combined into t1..t4
 //   stage 2: pairs (t1, t3) and (t2, t4) are combined back into r1..r4
 // The results end up in r1..r4; t1..t4 are overwritten as scratch.
 .macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
    SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
    SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
 .endm
--- a/common/aarch64/pixel-a-sve.S
+++ b/common/aarch64/pixel-a-sve.S
@@ -0,0 +1,523 @@
 /*****************************************************************************
 * pixel-a-sve.S: aarch64 pixel metrics
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Chen <david.chen@myais.com.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "asm.S"
 #include "pixel-a-common.S"
 ENABLE_SVE
 #if BIT_DEPTH == 8
 // Prologue of the 4-pixel-wide SSD kernel for 8-bit pixels.
 //   x0, x2: pointers to the current row of the two blocks
 //   x1, x3: the matching row strides in bytes
 // Enables the first four halfword lanes of p0, loads row 0 of both blocks
 // (ld1b zero-extends each byte into a 16-bit lane) and takes the difference.
 // Row 1 is then loaded into z16/z17 for the next step, and the 32-bit
 // accumulator v0 is started with the squared differences of row 0.
 .macro SSD_START_SVE_4
    ptrue       p0.h, vl4
    ld1b        {z16.h}, p0/z, [x0]
    ld1b        {z17.h}, p0/z, [x2]
    add         x0, x0, x1
    add         x2, x2, x3
    sub         v2.4h, v16.4h, v17.4h
    ld1b        {z16.h}, p0/z, [x0]
    ld1b        {z17.h}, p0/z, [x2]
    add         x0, x0, x1
    add         x2, x2, x3
    smull       v0.4s,  v2.4h,   v2.4h
 .endm
 // Steady-state step of the 4-wide SSD kernel: takes the difference of the row
 // already held in z16/z17, loads the following row into the same registers,
 // advances both pointers and adds the squared differences to v0.
 .macro SSD_SVE_4
    sub         v2.4h, v16.4h, v17.4h
    ld1b        {z16.h}, p0/z, [x0]
    ld1b        {z17.h}, p0/z, [x2]
    add         x0, x0, x1
    add         x2, x2, x3
    smlal       v0.4s,  v2.4h,   v2.4h
 .endm
 // Epilogue of the 4-wide SSD kernel: consumes the last row held in z16/z17
 // without loading anything further.
 .macro SSD_END_SVE_4
    sub         v2.4h, v16.4h, v17.4h
    smlal       v0.4s,  v2.4h,   v2.4h
 .endm
 // Prologue of the 8-pixel-wide SSD kernel for 8-bit pixels.
 // Same register roles as the 4-wide version (x0/x2 rows, x1/x3 strides).
 // Eight halfword lanes are active; the squared differences of row 0 are
 // widened into v0.4s in two halves (smull for lanes 0-3, smlal2 for lanes
 // 4-7), interleaved with the loads of row 1.
 .macro SSD_START_SVE_8
    ptrue       p0.h, vl8
    ld1b        {z16.h}, p0/z, [x0]
    ld1b        {z17.h}, p0/z, [x2]
    add         x0, x0, x1
    add         x2, x2, x3
    sub         v2.8h, v16.8h, v17.8h
    ld1b        {z16.h}, p0/z, [x0]
    smull       v0.4s,  v2.4h,   v2.4h
    ld1b        {z17.h}, p0/z, [x2]
    smlal2      v0.4s,  v2.8h,   v2.8h
    add         x0, x0, x1
    add         x2, x2, x3
 .endm
 // Steady-state step of the 8-wide SSD kernel: differences of the row held in
 // z16/z17 are squared and accumulated into v0 (low half via smlal, high half
 // via smlal2) while the next row is loaded into the same registers.
 .macro SSD_SVE_8
    sub         v2.8h, v16.8h, v17.8h
    ld1b        {z16.h}, p0/z, [x0]
    smlal       v0.4s,  v2.4h,   v2.4h
    ld1b        {z17.h}, p0/z, [x2]
    smlal2      v0.4s,  v2.8h,   v2.8h
    add         x0, x0, x1
    add         x2, x2, x3
 .endm
 // Epilogue of the 8-wide SSD kernel: accumulates the last loaded row, no
 // further loads.
 .macro SSD_END_SVE_8
    sub         v2.8h,  v16.8h,  v17.8h
    smlal       v0.4s,  v2.4h,   v2.4h
    smlal2      v0.4s,  v2.8h,   v2.8h
 .endm
 // Emits the exported function pixel_ssd_<w>x<h>_sve (8-bit pixels).
 // Arguments per the C prototype in pixel.h: x0 = first block, x1 = its stride,
 // x2 = second block, x3 = its stride. Returns the sum of squared differences
 // in w0.
 // The START macro loads rows 0 and 1, each of the h-2 middle steps loads one
 // more row, and the END macro consumes the final row, so exactly h rows are
 // read. The four 32-bit partial sums in v0 are then reduced with addv.
 .macro SSD_FUNC_SVE w h
 function pixel_ssd_\w\()x\h\()_sve, export=1
    SSD_START_SVE_\w
 .rept \h-2
    SSD_SVE_\w
 .endr
    SSD_END_SVE_\w
    addv        s0,  v0.4s
    mov         w0,  v0.s[0]
    ret
 endfunc
 .endm
 // Loads eight rows of eight 8-bit pixels from both blocks and forms the
 // per-row differences (block at x0 minus block at x2) as 16-bit lanes.
 //   in : x0/x1 = first block and stride, x2/x3 = second block and stride,
 //        p0 must already select eight halfword lanes (set by the caller)
 //   out: v16..v23 = differences of rows 0..7
 //        v0/v1 = SUMSUB_AB of rows 0 and 1, v2/v3 = SUMSUB_AB of rows 2 and 3
 //   x0 and x2 are advanced by eight rows; v4..v7 are used as load scratch.
 // Loads, subtractions and the first butterfly stage are interleaved, so the
 // instruction order matters: v0..v3 are reused as load targets until the
 // differences of rows 4 and 5 have been taken.
 .macro load_diff_fly_sve_8x8
    ld1b        {z1.h}, p0/z, [x2]
    ld1b        {z0.h}, p0/z, [x0]
    add         x2, x2, x3
    add         x0, x0, x1
    ld1b        {z3.h}, p0/z, [x2]
    ld1b        {z2.h}, p0/z, [x0]
    add         x2, x2, x3
    add         x0, x0, x1
    sub         v16.8h, v0.8h,  v1.8h
    sub         v17.8h, v2.8h,  v3.8h
    ld1b        {z5.h}, p0/z, [x2]
    ld1b        {z4.h}, p0/z, [x0]
    add         x2, x2, x3
    add         x0, x0, x1
    ld1b        {z7.h}, p0/z, [x2]
    ld1b        {z6.h}, p0/z, [x0]
    add         x2, x2, x3
    add         x0, x0, x1
    sub         v18.8h, v4.8h,  v5.8h
    sub         v19.8h, v6.8h,  v7.8h
    ld1b        {z1.h}, p0/z, [x2]
    ld1b        {z0.h}, p0/z, [x0]
    add         x2, x2, x3
    add         x0, x0, x1
    ld1b        {z3.h}, p0/z, [x2]
    ld1b        {z2.h}, p0/z, [x0]
    add         x2, x2, x3
    add         x0, x0, x1
    sub         v20.8h, v0.8h,  v1.8h
    sub         v21.8h, v2.8h,  v3.8h
    ld1b        {z5.h}, p0/z, [x2]
    ld1b        {z4.h}, p0/z, [x0]
    add         x2, x2, x3
    add         x0, x0, x1
    ld1b        {z7.h}, p0/z, [x2]
    ld1b        {z6.h}, p0/z, [x0]
    add         x2, x2, x3
    add         x0, x0, x1
    SUMSUB_AB   v0.8h,  v1.8h,  v16.8h, v17.8h
    SUMSUB_AB   v2.8h,  v3.8h,  v18.8h, v19.8h
    sub         v22.8h, v4.8h,  v5.8h
    sub         v23.8h, v6.8h,  v7.8h
 .endm
 // Emits the exported function pixel_var_8x<h>_sve (8-bit pixels).
 //   x0 = block, x1 = stride in bytes (see the prototype in pixel.h)
 // Accumulators: v0.8h holds the per-column pixel sums, v1.4s and v2.4s hold
 // two interleaved sums of squares (merged later in var_end).
 // Rows 0 and 1 are handled before the loop and rows 2 and 3 are preloaded.
 // Each loop iteration consumes four rows and preloads the next two; the two
 // rows left in z18/z19 are consumed after the loop. x2 counts the rows still
 // to be processed by the loop and starts at h - 4.
 // The function ends with a branch to var_end, which packs the return value.
 .macro pixel_var_sve_8 h
 function pixel_var_8x\h\()_sve, export=1
    ptrue           p0.h, vl8
    ld1b            {z16.h}, p0/z, [x0]
    add             x0, x0, x1
    ld1b            {z17.h}, p0/z, [x0]
    add             x0, x0, x1
    mov             x2,  \h - 4
    mul             v1.8h,  v16.8h, v16.8h
    mul             v2.8h,  v17.8h, v17.8h
    add             v0.8h,  v16.8h,  v17.8h
    ld1b            {z18.h}, p0/z, [x0]
    add             x0, x0, x1
    uaddlp          v1.4s,  v1.8h
    uaddlp          v2.4s,  v2.8h
    ld1b            {z19.h}, p0/z, [x0]
    add             x0, x0, x1
 1:  subs            x2,  x2,  #4
    add             v0.8h,  v0.8h,  v18.8h
    mul             v24.8h, v18.8h, v18.8h
    ld1b            {z20.h}, p0/z, [x0]
    add             x0, x0, x1
    add             v0.8h,  v0.8h,  v19.8h
    mul             v25.8h, v19.8h, v19.8h
    uadalp          v1.4s,  v24.8h
    ld1b            {z21.h}, p0/z, [x0]
    add             x0, x0, x1
    add             v0.8h,  v0.8h,  v20.8h
    mul             v26.8h, v20.8h, v20.8h
    uadalp          v2.4s,  v25.8h
    ld1b            {z18.h}, p0/z, [x0]
    add             x0, x0, x1
    add             v0.8h,  v0.8h,  v21.8h
    mul             v27.8h, v21.8h, v21.8h
    uadalp          v1.4s,  v26.8h
    ld1b            {z19.h}, p0/z, [x0]
    add             x0, x0, x1
    uadalp          v2.4s,  v27.8h
    b.gt            1b
    add             v0.8h,  v0.8h,  v18.8h
    mul             v28.8h, v18.8h, v18.8h
    add             v0.8h,  v0.8h,  v19.8h
    mul             v29.8h, v19.8h, v19.8h
    uadalp          v1.4s,  v28.8h
    uadalp          v2.4s,  v29.8h
    b               var_end
 endfunc
 .endm
 // Shared tail of the pixel_var functions (reached by a plain branch, so the
 // final ret returns to the original caller).
 //   in : v0.8h = pixel sums, v1.4s and v2.4s = partial sums of squares
 //   out: x0 = pixel sum in bits 0-31, sum of squares in bits 32-63
 function var_end
    add             v1.4s,  v1.4s,  v2.4s
    uaddlv          s0,  v0.8h
    uaddlv          d1,  v1.4s
    mov             w0,  v0.s[0]
    mov             x1,  v1.d[0]
    orr             x0,  x0,  x1,  lsl #32
    ret
 endfunc
 // Writes a + b to sum and a - b to sub. No widening happens here: in this
 // file the operands are 16-bit lanes already widened by the ld1b loads.
 .macro SUMSUBL_AB_SVE  sum, sub, a, b
    add         \sum,  \a,  \b
    sub         \sub,  \a,  \b
 .endm
 // Exported sa8d 8x8 entry point: int f( pixel *, intptr_t, pixel *, intptr_t )
 // per the prototype in pixel.h. Returns (sum + 1) >> 1 in w0, where sum is the
 // horizontal total of the partial sums returned by the helper in v0 and v1.
 // The link register is parked in x4 because the helper returns with a plain
 // ret through x30.
 // NOTE(review): the bl target has the same textual name as this function.
 // Presumably export=1 gives this entry a prefixed global symbol, so the call
 // resolves to the non-exported helper emitted by the sa8d_satd_sve_8x8 macro
 // -- confirm against the function macro in asm.S.
 function pixel_sa8d_8x8_sve, export=1
    ptrue       p0.h, vl8
    mov         x4,  x30
    bl          pixel_sa8d_8x8_sve
    add         v0.8h,  v0.8h,  v1.8h
    uaddlv      s0,  v0.8h
    mov         w0,  v0.s[0]
    add         w0,  w0,  #1
    lsr         w0,  w0,  #1
    ret         x4
 endfunc
 // Emits the non-exported helper pixel_sa8d_<satd>8x8_sve.
 //   in : x0/x1 and x2/x3 = the two blocks and their strides, p0 = eight
 //        active halfword lanes (both consumed by load_diff_fly_sve_8x8)
 //   out: v0.8h and v1.8h = partial sums that the caller adds and reduces
 // When the macro argument is satd_, the extra section below also leaves two
 // partial sums in v26 and v27 (presumably the 4x4 SATD terms -- confirm at the
 // call site, which is not part of this file section).
 // The helper returns with a plain ret, so a caller that was itself entered
 // with bl must preserve its own x30.
 // transpose is a macro defined outside this file.
 .macro sa8d_satd_sve_8x8 satd=
 function pixel_sa8d_\satd\()8x8_sve
    load_diff_fly_sve_8x8
    SUMSUB_AB   v16.8h, v18.8h, v0.8h,  v2.8h
    SUMSUB_AB   v17.8h, v19.8h, v1.8h,  v3.8h
    HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h,  v1.8h, v2.8h, v3.8h
 .ifc \satd, satd_
    // Extra path, assembled only for the satd_ variant: results in v26/v27.
    transpose   v0.8h,  v1.8h,  v16.8h, v17.8h
    transpose   v2.8h,  v3.8h,  v18.8h, v19.8h
    transpose   v4.8h,  v5.8h,  v20.8h, v21.8h
    transpose   v6.8h,  v7.8h,  v22.8h, v23.8h
    SUMSUB_AB   v24.8h, v25.8h, v0.8h,  v1.8h
    SUMSUB_AB   v26.8h, v27.8h, v2.8h,  v3.8h
    SUMSUB_AB   v0.8h,  v1.8h,  v4.8h,  v5.8h
    SUMSUB_AB   v2.8h,  v3.8h,  v6.8h,  v7.8h
    transpose   v4.4s,  v6.4s,  v24.4s, v26.4s
    transpose   v5.4s,  v7.4s,  v25.4s, v27.4s
    transpose   v24.4s, v26.4s, v0.4s,  v2.4s
    transpose   v25.4s, v27.4s, v1.4s,  v3.4s
    abs         v0.8h,  v4.8h
    abs         v1.8h,  v5.8h
    abs         v2.8h,  v6.8h
    abs         v3.8h,  v7.8h
    abs         v4.8h,  v24.8h
    abs         v5.8h,  v25.8h
    abs         v6.8h,  v26.8h
    abs         v7.8h,  v27.8h
    umax        v0.8h,  v0.8h,  v2.8h
    umax        v1.8h,  v1.8h,  v3.8h
    umax        v2.8h,  v4.8h,  v6.8h
    umax        v3.8h,  v5.8h,  v7.8h
    add         v26.8h, v0.8h,  v1.8h
    add         v27.8h, v2.8h,  v3.8h
 .endif
    // Common path: combine the two row groups, then transform across lanes.
    SUMSUB_AB   v0.8h,  v16.8h, v16.8h, v20.8h
    SUMSUB_AB   v1.8h,  v17.8h, v17.8h, v21.8h
    SUMSUB_AB   v2.8h,  v18.8h, v18.8h, v22.8h
    SUMSUB_AB   v3.8h,  v19.8h, v19.8h, v23.8h
    transpose   v20.8h, v21.8h, v16.8h, v17.8h
    transpose   v4.8h,  v5.8h,  v0.8h,  v1.8h
    transpose   v22.8h, v23.8h, v18.8h, v19.8h
    transpose   v6.8h,  v7.8h,  v2.8h,  v3.8h
    SUMSUB_AB   v2.8h,  v3.8h,  v20.8h, v21.8h
    SUMSUB_AB   v24.8h, v25.8h, v4.8h,  v5.8h
    SUMSUB_AB   v0.8h,  v1.8h,  v22.8h, v23.8h
    SUMSUB_AB   v4.8h,  v5.8h,  v6.8h,  v7.8h
    transpose   v20.4s, v22.4s, v2.4s,  v0.4s
    transpose   v21.4s, v23.4s, v3.4s,  v1.4s
    transpose   v16.4s, v18.4s, v24.4s, v4.4s
    transpose   v17.4s, v19.4s, v25.4s, v5.4s
    SUMSUB_AB   v0.8h,  v2.8h,  v20.8h, v22.8h
    SUMSUB_AB   v1.8h,  v3.8h,  v21.8h, v23.8h
    SUMSUB_AB   v4.8h,  v6.8h,  v16.8h, v18.8h
    SUMSUB_AB   v5.8h,  v7.8h,  v17.8h, v19.8h
    transpose   v16.2d, v20.2d,  v0.2d,  v4.2d
    transpose   v17.2d, v21.2d,  v1.2d,  v5.2d
    transpose   v18.2d, v22.2d,  v2.2d,  v6.2d
    transpose   v19.2d, v23.2d,  v3.2d,  v7.2d
    abs         v16.8h, v16.8h
    abs         v20.8h, v20.8h
    abs         v17.8h, v17.8h
    abs         v21.8h, v21.8h
    abs         v18.8h, v18.8h
    abs         v22.8h, v22.8h
    abs         v19.8h, v19.8h
    abs         v23.8h, v23.8h
    umax        v16.8h, v16.8h, v20.8h
    umax        v17.8h, v17.8h, v21.8h
    umax        v18.8h, v18.8h, v22.8h
    umax        v19.8h, v19.8h, v23.8h
    add         v0.8h,  v16.8h, v17.8h
    add         v1.8h,  v18.8h, v19.8h
    ret
 endfunc
 .endm
 // Emits the exported function pixel_hadamard_ac_<w>x<h>_sve (8-bit pixels).
 //   x0 = block, x1 = stride in bytes (see the prototype in pixel.h)
 //   returns x0 = (sum of v28) >> 1 in bits 0-31 and
 //                (sum of v29) >> 2 in bits 32-63
 // The block is processed as one to four 8x8 tiles by hadamard_ac_8x8_sve,
 // which accumulates into v28/v29 and leaves x0 eight rows further down:
 //   h > 8      : second call handles the tile directly below
 //   w > 8      : step back eight rows and eight bytes right for the next tile
 //   16x16 only : step back sixteen rows for the remaining top-right tile
 // x30 is kept in x4 across the calls; v30/v31 hold the two mask_ac_4_8 rows.
 .macro HADAMARD_AC_SVE w h
 function pixel_hadamard_ac_\w\()x\h\()_sve, export=1
    ptrue       p0.h, vl8
    movrel      x5, mask_ac_4_8
    mov         x4,  x30
    ld1         {v30.8h,v31.8h}, [x5]
    movi        v28.16b, #0
    movi        v29.16b, #0
    bl          hadamard_ac_8x8_sve
 .if \h > 8
    bl          hadamard_ac_8x8_sve
 .endif
 .if \w > 8
    sub         x0,  x0,  x1,  lsl #3
    add         x0,  x0,  #8
    bl          hadamard_ac_8x8_sve
 .endif
 .if \w * \h == 256
    sub         x0,  x0,  x1,  lsl #4
    bl          hadamard_ac_8x8_sve
 .endif
    addv        s1,  v29.4s
    addv        s0,  v28.4s
    mov         w1,  v1.s[0]
    mov         w0,  v0.s[0]
    lsr         w1,  w1,  #2
    lsr         w0,  w0,  #1
    orr         x0,  x0,  x1, lsl #32
    ret         x4
 endfunc
 .endm
 // Helper for the pixel_hadamard_ac functions: processes one 8x8 tile.
 //   in : x0 = tile, x1 = stride in bytes, p0 = eight active halfword lanes
 //   out: x0 advanced by eight rows; results accumulated into v28 and v29
 //   clobbers v0-v7 and v16-v23
 // Register roles across calls:
 // v28: satd  v29: sa8d  v30: mask_ac4  v31: mask_ac8
 // transpose is a macro defined outside this file.
 function hadamard_ac_8x8_sve
    // Load eight rows, widened to 16-bit lanes, interleaved with the first
    // sum/difference stage.
    ld1b        {z16.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z17.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z18.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z19.h}, p0/z, [x0]
    add         x0, x0, x1
    SUMSUBL_AB_SVE  v0.8h,  v1.8h, v16.8h, v17.8h
    ld1b        {z20.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z21.h}, p0/z, [x0]
    add         x0, x0, x1
    SUMSUBL_AB_SVE  v2.8h,  v3.8h, v18.8h, v19.8h
    ld1b        {z22.h}, p0/z, [x0]
    add         x0, x0, x1
    ld1b        {z23.h}, p0/z, [x0]
    add         x0, x0, x1
    SUMSUBL_AB_SVE  v4.8h,  v5.8h, v20.8h, v21.8h
    SUMSUBL_AB_SVE  v6.8h,  v7.8h, v22.8h, v23.8h
    SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h,  v2.8h,  v1.8h,  v3.8h
    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h,  v6.8h,  v5.8h,  v7.8h
    transpose   v0.8h,  v1.8h,  v16.8h,  v17.8h
    transpose   v2.8h,  v3.8h,  v18.8h,  v19.8h
    transpose   v4.8h,  v5.8h,  v20.8h,  v21.8h
    transpose   v6.8h,  v7.8h,  v22.8h,  v23.8h
    SUMSUB_AB   v16.8h, v17.8h, v0.8h,  v1.8h
    SUMSUB_AB   v18.8h, v19.8h, v2.8h,  v3.8h
    SUMSUB_AB   v20.8h, v21.8h, v4.8h,  v5.8h
    SUMSUB_AB   v22.8h, v23.8h, v6.8h,  v7.8h
    transpose   v0.4s,  v2.4s,  v16.4s, v18.4s
    transpose   v1.4s,  v3.4s,  v17.4s, v19.4s
    transpose   v4.4s,  v6.4s,  v20.4s, v22.4s
    transpose   v5.4s,  v7.4s,  v21.4s, v23.4s
    SUMSUB_AB   v16.8h, v18.8h, v0.8h,  v2.8h
    SUMSUB_AB   v17.8h, v19.8h, v1.8h,  v3.8h
    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h,  v6.8h,  v5.8h,  v7.8h
    // First accumulation: absolute values summed into v28, with the lanes
    // selected by the v30 mask cleared beforehand.
    abs         v0.8h,  v16.8h
    abs         v4.8h,  v20.8h
    abs         v1.8h,  v17.8h
    abs         v5.8h,  v21.8h
    abs         v2.8h,  v18.8h
    abs         v6.8h,  v22.8h
    abs         v3.8h,  v19.8h
    abs         v7.8h,  v23.8h
    add         v0.8h,  v0.8h,  v4.8h
    add         v1.8h,  v1.8h,  v5.8h
    and         v0.16b, v0.16b, v30.16b
    add         v2.8h,  v2.8h,  v6.8h
    add         v3.8h,  v3.8h,  v7.8h
    add         v0.8h,  v0.8h,  v2.8h
    add         v1.8h,  v1.8h,  v3.8h
    uadalp      v28.4s, v0.8h
    uadalp      v28.4s, v1.8h
    // Second accumulation: further butterfly stages, then absolute values
    // summed into v29, with the lane selected by the v31 mask cleared.
    SUMSUB_AB   v6.8h,  v7.8h,  v23.8h, v19.8h
    SUMSUB_AB   v4.8h,  v5.8h,  v22.8h, v18.8h
    SUMSUB_AB   v2.8h,  v3.8h,  v21.8h, v17.8h
    SUMSUB_AB   v1.8h,  v0.8h,  v16.8h,  v20.8h
    transpose   v16.2d, v17.2d,  v6.2d,  v7.2d
    transpose   v18.2d, v19.2d,  v4.2d,  v5.2d
    transpose   v20.2d, v21.2d,  v2.2d,  v3.2d
    abs         v16.8h,  v16.8h
    abs         v17.8h,  v17.8h
    abs         v18.8h,  v18.8h
    abs         v19.8h,  v19.8h
    abs         v20.8h,  v20.8h
    abs         v21.8h,  v21.8h
    transpose   v7.2d,  v6.2d,  v1.2d,  v0.2d
    umax        v3.8h,  v16.8h,  v17.8h
    umax        v2.8h,  v18.8h,  v19.8h
    umax        v1.8h,  v20.8h,  v21.8h
    SUMSUB_AB   v4.8h,  v5.8h,  v7.8h,  v6.8h
    add         v2.8h,  v2.8h,  v3.8h
    add         v2.8h,  v2.8h,  v1.8h
    and         v4.16b, v4.16b, v31.16b
    add         v2.8h,  v2.8h,  v2.8h
    abs         v5.8h,  v5.8h
    abs         v4.8h,  v4.8h
    add         v2.8h,  v2.8h,  v5.8h
    add         v2.8h,  v2.8h,  v4.8h
    uadalp      v29.4s, v2.8h
    ret
 endfunc
 // 8-bit instantiations. SSD is generated for the 4-wide and 8-wide sizes
 // listed here only.
 SSD_FUNC_SVE   4, 4
 SSD_FUNC_SVE   4, 8
 SSD_FUNC_SVE   4, 16
 SSD_FUNC_SVE   8, 4
 SSD_FUNC_SVE   8, 8
 // pixel_var_8x8_sve and pixel_var_8x16_sve
 pixel_var_sve_8  8
 pixel_var_sve_8 16
 // Empty argument: emits the non-exported pixel_sa8d_8x8_sve helper.
 sa8d_satd_sve_8x8
 // pixel_hadamard_ac_<w>x<h>_sve for 8x8, 8x16, 16x8 and 16x16
 HADAMARD_AC_SVE  8, 8
 HADAMARD_AC_SVE  8, 16
 HADAMARD_AC_SVE 16, 8
 HADAMARD_AC_SVE 16, 16
 #else /* BIT_DEPTH == 10 */
 // Prologue of the 4-pixel-wide SSD kernel for the high bit depth build.
 //   x0, x2: pointers to the current row of the two blocks
 //   x1, x3: row strides, scaled by two when added (lsl #1)
 // Enables four 32-bit lanes of p0, loads row 0 of both blocks (ld1h
 // zero-extends each halfword into a 32-bit lane), takes the difference,
 // preloads row 1 into z16/z17 and starts the accumulator v0 with the squared
 // differences of row 0.
 .macro SSD_START_SVE_4
    ptrue       p0.s, vl4
    ld1h        {z16.s}, p0/z, [x0]
    ld1h        {z17.s}, p0/z, [x2]
    add         x0, x0, x1, lsl #1
    add         x2, x2, x3, lsl #1
    sub         v2.4s, v16.4s, v17.4s
    ld1h        {z16.s}, p0/z, [x0]
    ld1h        {z17.s}, p0/z, [x2]
    add         x0, x0, x1, lsl #1
    add         x2, x2, x3, lsl #1
    mul         v0.4s, v2.4s, v2.4s
 .endm
 // Steady-state step (high bit depth): difference of the row held in z16/z17,
 // load of the next row, pointer advance, multiply-accumulate into v0.
 .macro SSD_SVE_4
    sub         v2.4s, v16.4s, v17.4s
    ld1h        {z16.s}, p0/z, [x0]
    ld1h        {z17.s}, p0/z, [x2]
    add         x0, x0, x1, lsl #1
    add         x2, x2, x3, lsl #1
    mla         v0.4s, v2.4s, v2.4s
 .endm
 // Epilogue (high bit depth): accumulates the last loaded row, no further loads.
 .macro SSD_END_SVE_4
    sub         v2.4s, v16.4s, v17.4s
    mla         v0.4s,  v2.4s, v2.4s
 .endm
 // Emits the exported function pixel_ssd_<w>x<h>_sve for the high bit depth
 // build. Same structure as the 8-bit version: the START macro reads rows 0
 // and 1, h-2 middle steps read one row each, and the END macro consumes the
 // final row. The four 32-bit partial sums are reduced with addv and returned
 // in w0.
 .macro SSD_FUNC_SVE w h
 function pixel_ssd_\w\()x\h\()_sve, export=1
    SSD_START_SVE_\w
 .rept \h-2
    SSD_SVE_\w
 .endr
    SSD_END_SVE_\w
    addv        s0, v0.4s
    fmov        w0, s0
    ret
 endfunc
 .endm
 // High bit depth instantiations: only the 4-wide SSD sizes are generated here.
 SSD_FUNC_SVE   4, 4
 SSD_FUNC_SVE   4, 8
 SSD_FUNC_SVE   4, 16
 #endif /* BIT_DEPTH == 8 */
--- a/common/aarch64/pixel-a.S
+++ b/common/aarch64/pixel-a.S
--- a/common/aarch64/pixel.h
+++ b/common/aarch64/pixel.h
@@ -0,0 +1,191 @@
 /*****************************************************************************
 * pixel.h: aarch64 pixel metrics
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_AARCH64_PIXEL_H
 #define X264_AARCH64_PIXEL_H
 /* Name mappings for the NEON sad, sad_x3, sad_x4, satd and ssd routines.
 * Each public name is routed through x264_template(), which is defined
 * elsewhere (presumably it adds a per-bit-depth prefix -- confirm). */
 #define x264_pixel_sad_16x16_neon x264_template(pixel_sad_16x16_neon)
 #define x264_pixel_sad_16x8_neon x264_template(pixel_sad_16x8_neon)
 #define x264_pixel_sad_4x16_neon x264_template(pixel_sad_4x16_neon)
 #define x264_pixel_sad_4x4_neon x264_template(pixel_sad_4x4_neon)
 #define x264_pixel_sad_4x8_neon x264_template(pixel_sad_4x8_neon)
 #define x264_pixel_sad_8x16_neon x264_template(pixel_sad_8x16_neon)
 #define x264_pixel_sad_8x4_neon x264_template(pixel_sad_8x4_neon)
 #define x264_pixel_sad_8x8_neon x264_template(pixel_sad_8x8_neon)
 #define x264_pixel_sad_x3_16x16_neon x264_template(pixel_sad_x3_16x16_neon)
 #define x264_pixel_sad_x3_16x8_neon x264_template(pixel_sad_x3_16x8_neon)
 #define x264_pixel_sad_x3_4x4_neon x264_template(pixel_sad_x3_4x4_neon)
 #define x264_pixel_sad_x3_4x8_neon x264_template(pixel_sad_x3_4x8_neon)
 #define x264_pixel_sad_x3_8x16_neon x264_template(pixel_sad_x3_8x16_neon)
 #define x264_pixel_sad_x3_8x4_neon x264_template(pixel_sad_x3_8x4_neon)
 #define x264_pixel_sad_x3_8x8_neon x264_template(pixel_sad_x3_8x8_neon)
 #define x264_pixel_sad_x4_16x16_neon x264_template(pixel_sad_x4_16x16_neon)
 #define x264_pixel_sad_x4_16x8_neon x264_template(pixel_sad_x4_16x8_neon)
 #define x264_pixel_sad_x4_4x4_neon x264_template(pixel_sad_x4_4x4_neon)
 #define x264_pixel_sad_x4_4x8_neon x264_template(pixel_sad_x4_4x8_neon)
 #define x264_pixel_sad_x4_8x16_neon x264_template(pixel_sad_x4_8x16_neon)
 #define x264_pixel_sad_x4_8x4_neon x264_template(pixel_sad_x4_8x4_neon)
 #define x264_pixel_sad_x4_8x8_neon x264_template(pixel_sad_x4_8x8_neon)
 #define x264_pixel_satd_16x16_neon x264_template(pixel_satd_16x16_neon)
 #define x264_pixel_satd_16x8_neon x264_template(pixel_satd_16x8_neon)
 #define x264_pixel_satd_4x16_neon x264_template(pixel_satd_4x16_neon)
 #define x264_pixel_satd_4x4_neon x264_template(pixel_satd_4x4_neon)
 #define x264_pixel_satd_4x8_neon x264_template(pixel_satd_4x8_neon)
 #define x264_pixel_satd_8x16_neon x264_template(pixel_satd_8x16_neon)
 #define x264_pixel_satd_8x4_neon x264_template(pixel_satd_8x4_neon)
 #define x264_pixel_satd_8x8_neon x264_template(pixel_satd_8x8_neon)
 #define x264_pixel_ssd_16x16_neon x264_template(pixel_ssd_16x16_neon)
 #define x264_pixel_ssd_16x8_neon x264_template(pixel_ssd_16x8_neon)
 #define x264_pixel_ssd_4x16_neon x264_template(pixel_ssd_4x16_neon)
 #define x264_pixel_ssd_4x4_neon x264_template(pixel_ssd_4x4_neon)
 #define x264_pixel_ssd_4x8_neon x264_template(pixel_ssd_4x8_neon)
 #define x264_pixel_ssd_8x16_neon x264_template(pixel_ssd_8x16_neon)
 #define x264_pixel_ssd_8x4_neon x264_template(pixel_ssd_8x4_neon)
 #define x264_pixel_ssd_8x8_neon x264_template(pixel_ssd_8x8_neon)
 /* Name mappings for the dot-product variants; only a subset of the block
 * sizes is listed, and only when the build has HAVE_DOTPROD set. */
 #if HAVE_DOTPROD
 #define x264_pixel_sad_16x8_neon_dotprod x264_template(pixel_sad_16x8_neon_dotprod)
 #define x264_pixel_sad_16x16_neon_dotprod x264_template(pixel_sad_16x16_neon_dotprod)
 #define x264_pixel_sad_x3_16x16_neon_dotprod x264_template(pixel_sad_x3_16x16_neon_dotprod)
 #define x264_pixel_sad_x3_16x8_neon_dotprod x264_template(pixel_sad_x3_16x8_neon_dotprod)
 #define x264_pixel_sad_x4_16x16_neon_dotprod x264_template(pixel_sad_x4_16x16_neon_dotprod)
 #define x264_pixel_sad_x4_16x8_neon_dotprod x264_template(pixel_sad_x4_16x8_neon_dotprod)
 #define x264_pixel_ssd_16x16_neon_dotprod x264_template(pixel_ssd_16x16_neon_dotprod)
 #define x264_pixel_ssd_16x8_neon_dotprod x264_template(pixel_ssd_16x8_neon_dotprod)
 #define x264_pixel_ssd_8x16_neon_dotprod x264_template(pixel_ssd_8x16_neon_dotprod)
 #define x264_pixel_ssd_8x4_neon_dotprod x264_template(pixel_ssd_8x4_neon_dotprod)
 #define x264_pixel_ssd_8x8_neon_dotprod x264_template(pixel_ssd_8x8_neon_dotprod)
 #endif // HAVE_DOTPROD
 /* Name mappings for the SVE ssd routines (4-wide and 8-wide sizes only).
 * Unlike the dot-product block, these are not wrapped in a feature check. */
 #define x264_pixel_ssd_4x16_sve x264_template(pixel_ssd_4x16_sve)
 #define x264_pixel_ssd_4x4_sve x264_template(pixel_ssd_4x4_sve)
 #define x264_pixel_ssd_4x8_sve x264_template(pixel_ssd_4x8_sve)
 #define x264_pixel_ssd_8x4_sve x264_template(pixel_ssd_8x4_sve)
 #define x264_pixel_ssd_8x8_sve x264_template(pixel_ssd_8x8_sve)
 /* Declares x264_pixel_<name>_<W>x<H>_<suffix> for the eight block sizes
 * 16x16, 16x8, 8x16, 8x8, 8x4, 4x16, 4x8 and 4x4, each with return type ret
 * and the parenthesised parameter list args. */
 #define DECL_PIXELS( ret, name, suffix, args ) \
    ret x264_pixel_##name##_16x16_##suffix args;\
    ret x264_pixel_##name##_16x8_##suffix args;\
    ret x264_pixel_##name##_8x16_##suffix args;\
    ret x264_pixel_##name##_8x8_##suffix args;\
    ret x264_pixel_##name##_8x4_##suffix args;\
    ret x264_pixel_##name##_4x16_##suffix args;\
    ret x264_pixel_##name##_4x8_##suffix args;\
    ret x264_pixel_##name##_4x4_##suffix args;
 /* Declares the five SVE ssd functions (8x8, 8x4, 4x16, 4x8, 4x4) with return
 * type ret and parameter list args; the 16-wide and 8x16 sizes have no SVE
 * declaration here. */
 #define DECL_PIXELS_SSD_SVE( ret, args ) \
    ret x264_pixel_ssd_8x8_sve args;\
    ret x264_pixel_ssd_8x4_sve args;\
    ret x264_pixel_ssd_4x16_sve args;\
    ret x264_pixel_ssd_4x8_sve args;\
    ret x264_pixel_ssd_4x4_sve args;
 /* DECL_X1: two-block metrics returning int, taking two (pixel *, intptr_t)
 * pairs. */
 #define DECL_X1( name, suffix ) \
    DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) )
 /* Same signature as DECL_X1, restricted to the SVE ssd size list. */
 #define DECL_X1_SSD_SVE( ) \
    DECL_PIXELS_SSD_SVE( int, ( pixel *, intptr_t, pixel *, intptr_t ) )
 /* DECL_X4: declares both the <name>_x3 and <name>_x4 families. They return
 * void and take four or five pixel pointers, one intptr_t and an int pointer
 * (presumably one source block, the candidate blocks, a shared stride and the
 * result array -- confirm against the implementations). */
 #define DECL_X4( name, suffix ) \
    DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\
    DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )
 /* Prototypes generated from the macros above: NEON sad, sad_x3/x4, satd and
 * ssd, followed by the SVE ssd subset. */
 DECL_X1( sad, neon )
 DECL_X4( sad, neon )
 DECL_X1( satd, neon )
 DECL_X1( ssd, neon )
 DECL_X1_SSD_SVE( )
 /* Dot-product prototypes. These expand over all eight DECL_PIXELS sizes,
 * which is more than the subset given a name mapping earlier in this header. */
 #if HAVE_DOTPROD
 DECL_X1( sad, neon_dotprod )
 DECL_X4( sad, neon_dotprod )
 DECL_X1( ssd, neon_dotprod )
 #endif // HAVE_DOTPROD
 /* Prototypes for routines implemented outside this header. The parameters are
 * unnamed here; see the implementations for their meaning. */
 #define x264_pixel_ssd_nv12_core_neon x264_template(pixel_ssd_nv12_core_neon)
 void x264_pixel_ssd_nv12_core_neon( pixel *, intptr_t, pixel *, intptr_t, int, int, uint64_t *, uint64_t * );
 #define x264_pixel_vsad_neon x264_template(pixel_vsad_neon)
 int x264_pixel_vsad_neon( pixel *, intptr_t, int );
 /* Dot-product variant of vsad, declared only when HAVE_DOTPROD is set. */
 #if HAVE_DOTPROD
 #define x264_pixel_vsad_neon_dotprod x264_template(pixel_vsad_neon_dotprod)
 int x264_pixel_vsad_neon_dotprod( pixel *, intptr_t, int );
 #endif // HAVE_DOTPROD
 /* sa8d prototypes: two blocks given as (pixel *, intptr_t) pairs. */
 #define x264_pixel_sa8d_8x8_neon x264_template(pixel_sa8d_8x8_neon)
 int x264_pixel_sa8d_8x8_neon  ( pixel *, intptr_t, pixel *, intptr_t );
 #define x264_pixel_sa8d_16x16_neon x264_template(pixel_sa8d_16x16_neon)
 int x264_pixel_sa8d_16x16_neon( pixel *, intptr_t, pixel *, intptr_t );
 /* Returns a 64-bit value; presumably sa8d and satd packed into the two
 * 32-bit halves -- confirm against the implementation. */
 #define x264_pixel_sa8d_satd_16x16_neon x264_template(pixel_sa8d_satd_16x16_neon)
 uint64_t x264_pixel_sa8d_satd_16x16_neon( pixel *, intptr_t, pixel *, intptr_t );
 /* SVE version of the 8x8 sa8d, implemented in pixel-a-sve.S. */
 #define x264_pixel_sa8d_8x8_sve x264_template(pixel_sa8d_8x8_sve)
 int x264_pixel_sa8d_8x8_sve  ( pixel *, intptr_t, pixel *, intptr_t );
 /* Variance helpers taking one block as (pixel *, intptr_t). The SVE versions
 * return the pixel sum in the low 32 bits and the sum of squares in the high
 * 32 bits (see var_end in pixel-a-sve.S); the NEON versions presumably use
 * the same packing -- confirm. */
 #define x264_pixel_var_8x8_neon x264_template(pixel_var_8x8_neon)
 uint64_t x264_pixel_var_8x8_neon  ( pixel *, intptr_t );
 #define x264_pixel_var_8x16_neon x264_template(pixel_var_8x16_neon)
 uint64_t x264_pixel_var_8x16_neon ( pixel *, intptr_t );
 #define x264_pixel_var_16x16_neon x264_template(pixel_var_16x16_neon)
 uint64_t x264_pixel_var_16x16_neon( pixel *, intptr_t );
 /* var2 takes two pixel pointers and an int pointer; no stride is passed. */
 #define x264_pixel_var2_8x8_neon x264_template(pixel_var2_8x8_neon)
 int x264_pixel_var2_8x8_neon ( pixel *, pixel *, int * );
 #define x264_pixel_var2_8x16_neon x264_template(pixel_var2_8x16_neon)
 int x264_pixel_var2_8x16_neon( pixel *, pixel *, int * );
 #define x264_pixel_var_8x8_sve x264_template(pixel_var_8x8_sve)
 uint64_t x264_pixel_var_8x8_sve  ( pixel *, intptr_t );
 #define x264_pixel_var_8x16_sve x264_template(pixel_var_8x16_sve)
 uint64_t x264_pixel_var_8x16_sve ( pixel *, intptr_t );
 /* hadamard_ac helpers taking one block as (pixel *, intptr_t) and returning
 * two 32-bit results packed into a uint64_t. In the SVE implementation
 * (pixel-a-sve.S) the low half is the v28 total shifted right by one and the
 * high half the v29 total shifted right by two; the NEON versions presumably
 * use the same packing -- confirm. */
 #define x264_pixel_hadamard_ac_8x8_neon x264_template(pixel_hadamard_ac_8x8_neon)
 uint64_t x264_pixel_hadamard_ac_8x8_neon  ( pixel *, intptr_t );
 #define x264_pixel_hadamard_ac_8x16_neon x264_template(pixel_hadamard_ac_8x16_neon)
 uint64_t x264_pixel_hadamard_ac_8x16_neon ( pixel *, intptr_t );
 #define x264_pixel_hadamard_ac_16x8_neon x264_template(pixel_hadamard_ac_16x8_neon)
 uint64_t x264_pixel_hadamard_ac_16x8_neon ( pixel *, intptr_t );
 #define x264_pixel_hadamard_ac_16x16_neon x264_template(pixel_hadamard_ac_16x16_neon)
 uint64_t x264_pixel_hadamard_ac_16x16_neon( pixel *, intptr_t );
 #define x264_pixel_hadamard_ac_8x8_sve x264_template(pixel_hadamard_ac_8x8_sve)
 uint64_t x264_pixel_hadamard_ac_8x8_sve  ( pixel *, intptr_t );
 #define x264_pixel_hadamard_ac_8x16_sve x264_template(pixel_hadamard_ac_8x16_sve)
 uint64_t x264_pixel_hadamard_ac_8x16_sve ( pixel *, intptr_t );
 #define x264_pixel_hadamard_ac_16x8_sve x264_template(pixel_hadamard_ac_16x8_sve)
 uint64_t x264_pixel_hadamard_ac_16x8_sve ( pixel *, intptr_t );
 #define x264_pixel_hadamard_ac_16x16_sve x264_template(pixel_hadamard_ac_16x16_sve)
 uint64_t x264_pixel_hadamard_ac_16x16_sve( pixel *, intptr_t );
 /* SSIM helpers. The core routine takes two const blocks as (pointer, stride)
 * pairs and writes into sums[2][4]; end4 takes two [5][4] sum arrays and a
 * width and returns a float. */
 #define x264_pixel_ssim_4x4x2_core_neon x264_template(pixel_ssim_4x4x2_core_neon)
 void x264_pixel_ssim_4x4x2_core_neon( const pixel *, intptr_t,
                                       const pixel *, intptr_t,
                                       int sums[2][4] );
 #define x264_pixel_ssim_end4_neon x264_template(pixel_ssim_end4_neon)
 float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
 /* asd8: two blocks as (pixel *, intptr_t) pairs plus one int argument
 * (presumably the row count -- confirm against the implementation). */
 #define x264_pixel_asd8_neon x264_template(pixel_asd8_neon)
 int x264_pixel_asd8_neon( pixel *, intptr_t,  pixel *, intptr_t, int );
 #endif
--- a/common/aarch64/predict-a.S
+++ b/common/aarch64/predict-a.S
@@ -0,0 +1,908 @@
 /*****************************************************************************
 * predict.S: aarch64 intra prediction
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Mans Rullgard <mans@mansr.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "asm.S"
 // Halfword weight tables for the planar (_p_) predictors in this file.
 // p8weight: the weights 1..4 twice; loaded by predict_8x8c_p_neon.
 const p8weight, align=4
    .short      1, 2, 3, 4, 1, 2, 3, 4
 endconst
 // p16weight: the weights 1..8; loaded by predict_8x8c_p_neon,
 // predict_8x16c_p_neon and predict_16x16_p_neon.
 const p16weight, align=4
    .short      1, 2, 3, 4, 5, 6, 7, 8
 endconst
 // ldcol.8: gather a column of bytes into consecutive byte lanes of a vector.
 //   vd : destination vector register name (without arrangement suffix)
 //   xn : address register; post-incremented by xm after every byte load
 //   xm : register holding the step between loads
 //   n  : 8 (default) loads lanes 0-7; any other value loads only four lanes
 //   hi : when n != 8, hi=0 (default) loads lanes 0-3 and hi=1 loads lanes 4-7
 // Lanes that are not loaded keep their previous contents.
 .macro ldcol.8  vd,  xn,  xm,  n=8,  hi=0
 .if \n == 8 || \hi == 0
    ld1         {\vd\().b}[0], [\xn], \xm
    ld1         {\vd\().b}[1], [\xn], \xm
    ld1         {\vd\().b}[2], [\xn], \xm
    ld1         {\vd\().b}[3], [\xn], \xm
 .endif
 .if \n == 8 || \hi == 1
    ld1         {\vd\().b}[4], [\xn], \xm
    ld1         {\vd\().b}[5], [\xn], \xm
    ld1         {\vd\().b}[6], [\xn], \xm
    ld1         {\vd\().b}[7], [\xn], \xm
 .endif
 .endm
 // ldcol.16: gather sixteen bytes, one per step, into byte lanes 0-15 of vd.
 // Lanes 0-7 come from ldcol.8 (default arguments); lanes 8-15 are loaded here.
 // xn is post-incremented by xm after each of the sixteen loads.
 .macro ldcol.16  vd,  xn,  xm
    ldcol.8     \vd, \xn, \xm
    ld1         {\vd\().b}[ 8], [\xn], \xm
    ld1         {\vd\().b}[ 9], [\xn], \xm
    ld1         {\vd\().b}[10], [\xn], \xm
    ld1         {\vd\().b}[11], [\xn], \xm
    ld1         {\vd\().b}[12], [\xn], \xm
    ld1         {\vd\().b}[13], [\xn], \xm
    ld1         {\vd\().b}[14], [\xn], \xm
    ld1         {\vd\().b}[15], [\xn], \xm
 .endm
 // predict_4x4_h_aarch64: 4x4 horizontal prediction using general registers only.
 //   x0 : pointer to the top-left byte of the 4x4 block; rows are FDEC_STRIDE apart
 // For each of the four rows the byte immediately left of the row (offset -1)
 // is replicated into all four bytes of the row.
 function predict_4x4_h_aarch64, export=1
    ldurb   w1,  [x0, #0*FDEC_STRIDE-1]
    // multiplying a byte by 0x01010101 copies it into all four bytes of the word
    mov     w5,  #0x01010101
    ldrb    w2,  [x0, #1*FDEC_STRIDE-1]
    ldrb    w3,  [x0, #2*FDEC_STRIDE-1]
    mul     w1,  w1,  w5
    ldrb    w4,  [x0, #3*FDEC_STRIDE-1]
    mul     w2,  w2,  w5
    str     w1,  [x0, #0*FDEC_STRIDE]
    mul     w3,  w3,  w5
    str     w2,  [x0, #1*FDEC_STRIDE]
    mul     w4,  w4,  w5
    str     w3,  [x0, #2*FDEC_STRIDE]
    str     w4,  [x0, #3*FDEC_STRIDE]
    ret
 endfunc
 // predict_4x4_v_aarch64: 4x4 vertical prediction using general registers only.
 //   x0 : pointer to the top-left byte of the 4x4 block; rows are FDEC_STRIDE apart
 // The four bytes of the row directly above the block are copied into rows 0-3.
 function predict_4x4_v_aarch64, export=1
    ldur    w1,  [x0, #0 - 1 * FDEC_STRIDE]
 .irp row, 0,1,2,3
    str     w1,  [x0, #0 + \row * FDEC_STRIDE]
 .endr
    ret
 endfunc
 // predict_4x4_dc_neon: 4x4 DC prediction from the top and left neighbours.
 //   x0 : pointer to the top-left byte of the 4x4 block; rows are FDEC_STRIDE apart
 // Fills the block with (sum of 4 top bytes + sum of 4 left bytes + 4) >> 3.
 function predict_4x4_dc_neon, export=1
    sub         x1,  x0,  #FDEC_STRIDE
    // left neighbours of rows 0-3, summed in general registers
    ldurb       w4,  [x0, #-1 + 0 * FDEC_STRIDE]
    ldrb        w5,  [x0, #-1 + 1 * FDEC_STRIDE]
    ldrb        w6,  [x0, #-1 + 2 * FDEC_STRIDE]
    ldrb        w7,  [x0, #-1 + 3 * FDEC_STRIDE]
    add         w4,  w4,  w5
    // scalar load of the 4 top bytes; the remaining lanes of v0 are zero, so the
    // 8-lane uaddlv below adds only those four bytes
    ldr         s0, [x1]
    add         w6,  w6,  w7
    uaddlv      h0,  v0.8b
    add         w4,  w4,  w6
    dup         v0.4h,  v0.h[0]
    dup         v1.4h,  w4
    add         v0.4h,  v0.4h,  v1.4h
    // rounding narrow: (top + left + 4) >> 3 in each of the low four bytes
    rshrn       v0.8b,  v0.8h,  #3
    str         s0,  [x0]
    str         s0,  [x0, #1 * FDEC_STRIDE]
    str         s0,  [x0, #2 * FDEC_STRIDE]
    str         s0,  [x0, #3 * FDEC_STRIDE]
    ret
 endfunc
 // predict_4x4_dc_top_neon: 4x4 DC prediction from the top neighbours only.
 //   x0 : pointer to the top-left byte of the 4x4 block; rows are FDEC_STRIDE apart
 // Fills the block with (sum of the 4 bytes above the block + 2) >> 2.
 function predict_4x4_dc_top_neon, export=1
    sub         x1,  x0,  #FDEC_STRIDE
    // scalar load of the 4 top bytes; the remaining lanes of v0 are zero, so the
    // 8-lane uaddlv below adds only those four bytes
    ldr         s0, [x1]
    uaddlv      h0,  v0.8b
    dup         v0.4h,  v0.h[0]
    // rounding narrow: (sum + 2) >> 2 in each of the low four bytes
    rshrn       v0.8b,  v0.8h,  #2
    str         s0,  [x0]
    str         s0,  [x0, #1 * FDEC_STRIDE]
    str         s0,  [x0, #2 * FDEC_STRIDE]
    str         s0,  [x0, #3 * FDEC_STRIDE]
    ret
 endfunc
 // predict_4x4_ddr_neon: 4x4 diagonal-down-right prediction.
 //   x0 : pointer to the top-left byte of the 4x4 block; rows are FDEC_STRIDE apart
 // Builds an edge vector (left column bottom-to-top, top-left corner, top row)
 // by shifting one left-neighbour byte at a time in front of the top row, then
 // applies the 3-tap filter (a + 2*b + c + 2) >> 2 along it. Row r is the
 // 4-byte window starting at lane 3-r of the filtered vector.
 function predict_4x4_ddr_neon, export=1
    sub         x1,  x0,  #FDEC_STRIDE+1
    mov         x7,  #FDEC_STRIDE
    ld1         {v0.8b}, [x1], x7            // # -FDEC_STRIDE-1
    ld1r        {v1.8b}, [x1], x7            // #0*FDEC_STRIDE-1
    ld1r        {v2.8b}, [x1], x7            // #1*FDEC_STRIDE-1
    ext         v0.8b,  v1.8b,  v0.8b,  #7
    ld1r        {v3.8b}, [x1], x7            // #2*FDEC_STRIDE-1
    ext         v0.8b,  v2.8b,  v0.8b,  #7  // a
    ld1r        {v4.8b}, [x1], x7            // #3*FDEC_STRIDE-1
    ext         v1.8b,  v3.8b,  v0.8b,  #7  // b
    ext         v2.8b,  v4.8b,  v1.8b,  #7  // c
    // (a + b) + (b + c), widened to 16 bits, then rounded and narrowed by >> 2
    uaddl       v0.8h,  v0.8b,  v1.8b
    uaddl       v1.8h,  v1.8b,  v2.8b
    add         v0.8h,  v0.8h,  v1.8h
    rshrn       v0.8b,  v0.8h,  #2
    ext         v3.8b,  v0.8b, v0.8b,  #3
    ext         v2.8b,  v0.8b, v0.8b,  #2
    ext         v1.8b,  v0.8b, v0.8b,  #1
    // x0 is advanced row by row by the post-indexed stores
    str         s3,  [x0], #FDEC_STRIDE
    str         s2,  [x0], #FDEC_STRIDE
    str         s1,  [x0], #FDEC_STRIDE
    str         s0,  [x0]
    ret
 endfunc
 // predict_4x4_ddl_neon: 4x4 diagonal-down-left prediction.
 //   x0 : pointer to the top-left byte of the 4x4 block; rows are FDEC_STRIDE apart
 // Reads the 8 bytes starting directly above the block, filters them with the
 // 3-tap (a + 2*b + c + 2) >> 2 kernel (uhadd of the outer taps followed by
 // urhadd with the centre tap), and writes row r as the 4-byte window starting
 // at lane r of the filtered vector.
 function predict_4x4_ddl_neon, export=1
    // step up one row; the post-indexed load below brings x0 back to row 0
    sub         x0,  x0,  #FDEC_STRIDE
    mov         x7,  #FDEC_STRIDE
    ld1         {v0.8b}, [x0],  x7
    // v3 = last top byte replicated, used to pad the end of the edge
    dup         v3.8b,  v0.b[7]
    ext         v1.8b,  v0.8b,  v0.8b,  #1
    ext         v2.8b,  v0.8b,  v3.8b,  #2
    uhadd       v0.8b,  v0.8b,  v2.8b
    urhadd      v0.8b,  v0.8b,  v1.8b
    str         s0,  [x0], #FDEC_STRIDE
    ext         v1.8b,  v0.8b,  v0.8b,  #1
    ext         v2.8b,  v0.8b,  v0.8b,  #2
    str         s1,  [x0], #FDEC_STRIDE
    ext         v3.8b,  v0.8b,  v0.8b,  #3
    str         s2,  [x0], #FDEC_STRIDE
    str         s3,  [x0]
    ret
 endfunc
 // predict_8x8_dc_neon: 8x8 DC prediction.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 //   x1 : second argument, read as a byte buffer
 //        (presumably the pre-filtered neighbour edge array - confirm against
 //        the x264_predict8x8_t prototype)
 // Sums buffer bytes 7..14 and 16..23, then fills the block with (sum + 8) >> 4.
 // NOTE(review): bytes 7..14 look like the left column and 16..23 the top row;
 // the buffer layout is not defined in this file.
 function predict_8x8_dc_neon, export=1
    mov         x7,  #FDEC_STRIDE
    ld1         {v0.16b}, [x1], #16
    ld1         {v1.8b},  [x1]
    // rotate so that bytes 7..14 land in the low eight lanes
    ext         v0.16b, v0.16b, v0.16b, #7
    uaddlv      h1,  v1.8b
    uaddlv      h0,  v0.8b
    add         v0.8h,  v0.8h,  v1.8h
    dup         v0.8h,  v0.h[0]
    rshrn       v0.8b,  v0.8h,  #4
 .rept 8
    st1         {v0.8b}, [x0], x7
 .endr
    ret
 endfunc
 // predict_8x8_h_neon: 8x8 horizontal prediction.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 //   x1 : second argument, read as a byte buffer
 //        (presumably the pre-filtered neighbour edge array - confirm)
 // Row r (0..7) is filled with byte 14-r of the buffer; the dup for the next
 // row is interleaved with the store of the previous one.
 function predict_8x8_h_neon, export=1
    mov         x7,  #FDEC_STRIDE
    ld1         {v16.16b}, [x1]
    dup         v0.8b, v16.b[14]
    dup         v1.8b, v16.b[13]
    st1         {v0.8b}, [x0], x7
    dup         v2.8b, v16.b[12]
    st1         {v1.8b}, [x0], x7
    dup         v3.8b, v16.b[11]
    st1         {v2.8b}, [x0], x7
    dup         v4.8b, v16.b[10]
    st1         {v3.8b}, [x0], x7
    dup         v5.8b, v16.b[9]
    st1         {v4.8b}, [x0], x7
    dup         v6.8b, v16.b[8]
    st1         {v5.8b}, [x0], x7
    dup         v7.8b, v16.b[7]
    st1         {v6.8b}, [x0], x7
    st1         {v7.8b}, [x0], x7
    ret
 endfunc
 // predict_8x8_v_neon: 8x8 vertical prediction.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 //   x1 : second argument, read as a byte buffer
 //        (presumably the pre-filtered neighbour edge array - confirm)
 // Copies buffer bytes 16..23 into each of the eight rows.
 function predict_8x8_v_neon, export=1
    add         x1,  x1,  #16
    mov         x7,  #FDEC_STRIDE
    ld1         {v0.8b}, [x1]
 .rept 8
    st1         {v0.8b}, [x0], x7
 .endr
    ret
 endfunc
 // predict_8x8_ddl_neon: 8x8 diagonal-down-left prediction.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 //   x1 : second argument, read as a byte buffer
 //        (presumably the pre-filtered neighbour edge array - confirm)
 // Loads buffer bytes 16..31, applies the 3-tap (a + 2*b + c + 2) >> 2 filter
 // (uhadd of previous/next followed by urhadd with the centre), and stores
 // row r as the 8-byte window starting at lane r+1 of the filtered vector.
 function predict_8x8_ddl_neon, export=1
    add         x1,  x1,  #16
    mov         x7,  #FDEC_STRIDE
    ld1         {v0.16b}, [x1]
    movi        v3.16b, #0
    // v2 = last byte replicated: pads the "next" vector at lane 15
    dup         v2.16b, v0.b[15]
    // v4 = previous neighbours (lane 0 is zero; that lane is never stored)
    ext         v4.16b, v3.16b, v0.16b, #15
    ext         v2.16b, v0.16b, v2.16b, #1
    uhadd       v4.16b, v4.16b, v2.16b
    urhadd      v0.16b, v0.16b, v4.16b
    ext         v1.16b, v0.16b, v0.16b, #1
    ext         v2.16b, v0.16b, v0.16b, #2
    st1         {v1.8b}, [x0], x7
    ext         v3.16b, v0.16b, v0.16b, #3
    st1         {v2.8b}, [x0], x7
    ext         v4.16b, v0.16b, v0.16b, #4
    st1         {v3.8b}, [x0], x7
    ext         v5.16b, v0.16b, v0.16b, #5
    st1         {v4.8b}, [x0], x7
    ext         v6.16b, v0.16b, v0.16b, #6
    st1         {v5.8b}, [x0], x7
    ext         v7.16b, v0.16b, v0.16b, #7
    st1         {v6.8b}, [x0], x7
    ext         v0.16b, v0.16b, v0.16b, #8
    st1         {v7.8b}, [x0], x7
    st1         {v0.8b}, [x0], x7
    ret
 endfunc
 // predict_8x8_ddr_neon: 8x8 diagonal-down-right prediction.
 //   x0 : destination block, rows FDEC_STRIDE apart
 //   x1 : second argument, read as a byte buffer
 //        (presumably the pre-filtered neighbour edge array - confirm)
 // Loads 32 buffer bytes and applies the 3-tap (a + 2*b + c + 2) >> 2 filter
 // centred on bytes 8..23 (neighbours taken from bytes 7..22 and 9..24).
 // Rows are written bottom-up: row 7 gets lanes 0..7 of the filtered vector,
 // and each row above it starts one lane later.
 function predict_8x8_ddr_neon, export=1
    ld1         {v0.16b,v1.16b}, [x1]
    ext         v2.16b, v0.16b, v1.16b, #7
    ext         v4.16b, v0.16b, v1.16b, #9
    ext         v3.16b, v0.16b, v1.16b, #8
    uhadd       v2.16b, v2.16b, v4.16b
    urhadd      v7.16b, v3.16b, v2.16b
    // start at the last row and walk upwards with a negative stride
    add         x0,  x0,  #7*FDEC_STRIDE
    mov         x7,  #-1*FDEC_STRIDE
    ext         v6.16b, v7.16b, v7.16b, #1
    st1         {v7.8b},  [x0], x7
    ext         v5.16b, v7.16b, v7.16b, #2
    st1         {v6.8b},  [x0], x7
    ext         v4.16b, v7.16b, v7.16b, #3
    st1         {v5.8b},  [x0], x7
    ext         v3.16b, v7.16b, v7.16b, #4
    st1         {v4.8b},  [x0], x7
    ext         v2.16b, v7.16b, v7.16b, #5
    st1         {v3.8b},  [x0], x7
    ext         v1.16b, v7.16b, v7.16b, #6
    st1         {v2.8b},  [x0], x7
    ext         v0.16b, v7.16b, v7.16b, #7
    st1         {v1.8b},  [x0], x7
    st1         {v0.8b},  [x0], x7
    ret
 endfunc
 // predict_8x8_vl_neon: 8x8 vertical-left prediction.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 //   x1 : second argument, read as a byte buffer
 //        (presumably the pre-filtered neighbour edge array - confirm)
 // Loads buffer bytes 16..31 and derives two vectors:
 //   v3 = rounded 2-tap average of each byte and its successor (even rows)
 //   v0 = 3-tap (a + 2*b + c + 2) >> 2 filter (odd rows)
 // Rows 2k and 2k+1 are 8-byte windows of v3 / v0 starting at lane k / k+1.
 function predict_8x8_vl_neon, export=1
    add         x1,  x1,  #16
    mov         x7, #FDEC_STRIDE
    ld1         {v0.16b}, [x1]
    // v1 and v2 are read here without being initialised; the stale data only
    // reaches lane 0 of v1 and lane 15 of v2, and no stored window includes
    // the result lanes derived from them
    ext         v1.16b, v1.16b, v0.16b, #15
    ext         v2.16b, v0.16b, v2.16b, #1
    uhadd       v1.16b, v1.16b, v2.16b
    urhadd      v3.16b, v0.16b, v2.16b
    urhadd      v0.16b, v0.16b, v1.16b
    ext         v4.16b, v0.16b, v0.16b, #1
    st1         {v3.8b}, [x0], x7
    ext         v5.16b, v3.16b, v3.16b, #1
    st1         {v4.8b}, [x0], x7
    ext         v6.16b, v0.16b, v0.16b, #2
    st1         {v5.8b}, [x0], x7
    ext         v7.16b, v3.16b, v3.16b, #2
    st1         {v6.8b}, [x0], x7
    ext         v4.16b, v0.16b, v0.16b, #3
    st1         {v7.8b}, [x0], x7
    ext         v5.16b, v3.16b, v3.16b, #3
    st1         {v4.8b}, [x0], x7
    ext         v6.16b, v0.16b, v0.16b, #4
    st1         {v5.8b}, [x0], x7
    st1         {v6.8b}, [x0], x7
    ret
 endfunc
 // predict_8x8_vr_neon: 8x8 vertical-right prediction.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 //   x1 : second argument, read as a byte buffer
 //        (presumably the pre-filtered neighbour edge array - confirm)
 // Loads buffer bytes 8..23, forms a rounded 2-tap average (v2) and a 3-tap
 // (a + 2*b + c + 2) >> 2 filter (v0) of neighbouring bytes, and emits rows in
 // pairs. Rows 0/1 are the upper halves of the 2-tap / 3-tap vectors; each
 // later pair shifts one more byte in from the de-interleaved low half of the
 // 3-tap vector (v3 = odd lanes, v2 = even lanes).
 function predict_8x8_vr_neon, export=1
    add         x1,  x1,  #8
    mov         x7,  #FDEC_STRIDE
    ld1         {v2.16b}, [x1]
    // v1 / v0 = input rotated by two / one lanes (predecessor bytes)
    ext         v1.16b, v2.16b, v2.16b, #14
    ext         v0.16b, v2.16b, v2.16b, #15
    uhadd       v3.16b, v2.16b, v1.16b
    urhadd      v2.16b, v2.16b, v0.16b
    urhadd      v0.16b, v0.16b, v3.16b
    ext         v1.16b, v2.16b, v2.16b, #8
    uzp1        v2.8b,  v0.8b,  v0.8b
    uzp2        v3.8b,  v0.8b,  v0.8b
    ext         v0.16b, v0.16b, v0.16b, #8
    st1         {v1.8b}, [x0], x7
    st1         {v0.8b}, [x0], x7
    ext         v4.8b, v3.8b, v1.8b, #7
    ext         v5.8b, v2.8b, v0.8b, #7
    st1         {v4.8b}, [x0], x7
    st1         {v5.8b}, [x0], x7
    ext         v6.8b, v3.8b, v1.8b, #6
    ext         v7.8b, v2.8b, v0.8b, #6
    st1         {v6.8b}, [x0], x7
    st1         {v7.8b}, [x0], x7
    ext         v1.8b, v3.8b, v1.8b, #5
    ext         v0.8b, v2.8b, v0.8b, #5
    st1         {v1.8b}, [x0], x7
    st1         {v0.8b}, [x0], x7
    ret
 endfunc
 // predict_8x8_hd_neon: 8x8 horizontal-down prediction.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 //   x1 : second argument, read as a byte buffer
 //        (presumably the pre-filtered neighbour edge array - confirm)
 // Loads buffer bytes 7..22, forms a rounded 2-tap average (v4) and a 3-tap
 // (a + 2*b + c + 2) >> 2 filter (v0), interleaves their low halves into
 // v16/v17, and writes each row as an 8-byte window that moves two bytes per
 // row through the sequence v16, v17, upper half of the 3-tap vector (v7).
 function predict_8x8_hd_neon, export=1
    add         x1,  x1,  #7
    mov         x7, #FDEC_STRIDE
    ld1         {v1.16b}, [x1]
    ext         v3.16b, v1.16b, v1.16b, #1
    ext         v2.16b, v1.16b, v1.16b, #2
    urhadd      v4.16b, v1.16b, v3.16b
    uhadd       v1.16b, v1.16b, v2.16b
    urhadd      v0.16b, v1.16b, v3.16b
    zip1        v16.8b, v4.8b,  v0.8b
    zip2        v17.8b, v4.8b,  v0.8b
    ext         v7.16b, v0.16b, v0.16b, #8
    ext         v0.8b,  v17.8b, v7.8b,  #6
    ext         v1.8b,  v17.8b, v7.8b,  #4
    st1         {v0.8b},  [x0], x7
    ext         v2.8b,  v17.8b, v7.8b,  #2
    st1         {v1.8b},  [x0], x7
    st1         {v2.8b},  [x0], x7
    ext         v3.8b,  v16.8b, v17.8b, #6
    st1         {v17.8b}, [x0], x7
    ext         v4.8b,  v16.8b, v17.8b, #4
    st1         {v3.8b},  [x0], x7
    ext         v5.8b,  v16.8b, v17.8b, #2
    st1         {v4.8b},  [x0], x7
    st1         {v5.8b},  [x0], x7
    st1         {v16.8b}, [x0], x7
    ret
 endfunc
 // predict_8x8_hu_neon: 8x8 horizontal-up prediction.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 //   x1 : second argument, read as a byte buffer
 //        (presumably the pre-filtered neighbour edge array - confirm)
 // Loads buffer bytes 7..14, reverses their order and pads the end with byte 7
 // replicated. A rounded 2-tap average (v0) and a 3-tap
 // (a + 2*b + c + 2) >> 2 filter (v1) are interleaved into v16/v17; each row
 // is an 8-byte window moving two bytes per row, with the tail padded by the
 // last halfword of v17 replicated (v18).
 function predict_8x8_hu_neon, export=1
    add         x1,  x1,  #7
    mov         x7,  #FDEC_STRIDE
    ld1         {v7.8b}, [x1]
    dup         v6.8b,  v7.b[0]
    rev64       v7.8b,  v7.8b
    ext         v4.8b,  v7.8b,  v6.8b,  #2
    ext         v2.8b,  v7.8b,  v6.8b,  #1
    uhadd       v5.8b,  v7.8b,  v4.8b
    urhadd      v0.8b,  v2.8b,  v7.8b
    urhadd      v1.8b,  v5.8b,  v2.8b
    zip1        v16.8b, v0.8b,  v1.8b
    zip2        v17.8b, v0.8b,  v1.8b
    dup         v18.4h, v17.h[3]
    ext         v0.8b,  v16.8b, v17.8b, #2
    ext         v1.8b,  v16.8b, v17.8b, #4
    ext         v2.8b,  v16.8b, v17.8b, #6
    st1         {v16.8b}, [x0], x7
    st1         {v0.8b},  [x0], x7
    st1         {v1.8b},  [x0], x7
    st1         {v2.8b},  [x0], x7
    ext         v4.8b,  v17.8b, v18.8b, #2
    ext         v5.8b,  v17.8b, v18.8b, #4
    ext         v6.8b,  v17.8b, v18.8b, #6
    st1         {v17.8b}, [x0], x7
    st1         {v4.8b},  [x0], x7
    st1         {v5.8b},  [x0], x7
    st1         {v6.8b},  [x0]
    ret
 endfunc
 // predict_8x8c_dc_top_neon: 8x8 chroma DC prediction from the top row only.
 //   x0 : destination block, rows FDEC_STRIDE apart
 // Computes (sum + 2) >> 2 separately for top bytes 0..3 and 4..7, then tail-
 // branches to pred8x8c_dc_end (inside predict_8x8c_dc_neon), which expects
 // x1 = FDEC_STRIDE, v0 = pattern for rows 0-3 and v1 = pattern for rows 4-7.
 function predict_8x8c_dc_top_neon, export=1
    sub         x2,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    ld1         {v0.8b},  [x2]
    // pairwise adds: lane 0 = sum of top[0..3], lane 1 = sum of top[4..7]
    uaddlp      v0.4h,  v0.8b
    addp        v0.4h,  v0.4h,  v0.4h
    rshrn       v0.8b,  v0.8h,  #2
    dup         v3.8b,  v0.b[1]
    dup         v2.8b,  v0.b[0]
    // transpose is a macro defined outside this file; presumably it leaves the
    // left DC in the low word and the right DC in the high word of v0 and v1
    // - confirm against asm.S
    transpose   v0.2s,  v1.2s,  v2.2s,  v3.2s
    b           pred8x8c_dc_end
 endfunc
 // predict_8x8c_dc_left_neon: 8x8 chroma DC prediction from the left column only.
 //   x0 : destination block, rows FDEC_STRIDE apart
 // Rows 0-3 are filled with (sum of left bytes of rows 0..3 + 2) >> 2 and
 // rows 4-7 with the same expression over rows 4..7. The stores are done by
 // the shared tail pred8x8c_dc_end (inside predict_8x8c_dc_neon), which expects
 // x1 = FDEC_STRIDE, v0 = pattern for rows 0-3 and v1 = pattern for rows 4-7.
 function predict_8x8c_dc_left_neon, export=1
    ldurb       w2,  [x0, #0 * FDEC_STRIDE - 1]
    ldrb        w3,  [x0, #1 * FDEC_STRIDE - 1]
    ldrb        w4,  [x0, #2 * FDEC_STRIDE - 1]
    ldrb        w5,  [x0, #3 * FDEC_STRIDE - 1]
    mov         x1,  #FDEC_STRIDE
    add         w2,  w2,  w3
    add         w3,  w4,  w5
    ldrb        w6,  [x0, #4 * FDEC_STRIDE - 1]
    ldrb        w7,  [x0, #5 * FDEC_STRIDE - 1]
    ldrb        w8,  [x0, #6 * FDEC_STRIDE - 1]
    ldrb        w9,  [x0, #7 * FDEC_STRIDE - 1]
    add         w6,  w6,  w7
    add         w7,  w8,  w9
    // w2 = sum of left[0..3], w6 = sum of left[4..7]
    add         w2,  w2,  w3
    add         w6,  w6,  w7
    dup         v0.8h,  w2
    dup         v1.8h,  w6
    rshrn       v0.8b,  v0.8h,  #2
    rshrn       v1.8b,  v1.8h,  #2
    b           pred8x8c_dc_end
 endfunc
 // predict_8x8c_dc_neon: 8x8 chroma DC prediction from top and left neighbours.
 //   x0 : destination block, rows FDEC_STRIDE apart
 // With s0/s1 = sums of top bytes 0..3 / 4..7 and s2/s3 = sums of the left
 // bytes of rows 0..3 / 4..7, the four 4x4 quadrants are filled with:
 //   top-left     (s0 + s2 + 4) >> 3      top-right    (s1 + 2) >> 2
 //   bottom-left  (s3 + 2) >> 2           bottom-right (s1 + s3 + 4) >> 3
 // The label pred8x8c_dc_end is a shared store tail that the dc_top and
 // dc_left variants branch into.
 function predict_8x8c_dc_neon, export=1
    mov         x1,  #FDEC_STRIDE
    sub         x2,  x0,  #FDEC_STRIDE
    ldurb       w10, [x0, #0 * FDEC_STRIDE - 1]
    ldrb        w11, [x0, #1 * FDEC_STRIDE - 1]
    ldrb        w12, [x0, #2 * FDEC_STRIDE - 1]
    ldrb        w13, [x0, #3 * FDEC_STRIDE - 1]
    add         w10, w10, w11
    ldrb        w4,  [x0, #4 * FDEC_STRIDE - 1]
    ldrb        w5,  [x0, #5 * FDEC_STRIDE - 1]
    add         w12, w12, w13
    ldrb        w6,  [x0, #6 * FDEC_STRIDE - 1]
    ldrb        w7,  [x0, #7 * FDEC_STRIDE - 1]
    add         w4,  w4,  w5
    add         w6,  w6,  w7
    // pack the four 2-row left sums into 16-bit fields of x10 so that a single
    // move places them in the four halfword lanes of v1
    add         w10, w10, w12, lsl #16
    add         w4,  w4,  w6,  lsl #16
    ld1         {v0.8b},  [x2]
    add         x10, x10, x4,  lsl #32
    uaddlp      v0.4h,  v0.8b  // s0, s1
    mov         v1.d[0],  x10  // s2, s3
    add         v3.4h,  v0.4h,  v1.4h
    addp        v0.4h,  v0.4h,  v1.4h // s0, s1, s2, s3
    addp        v1.4h,  v3.4h,  v3.4h // s0+s2, s1+s3, s0+s2, s1+s3
    uzp2        v0.4h,  v0.4h,  v0.4h // s1,    s3,    s1,    s3
    uzp1        v1.2d,  v1.2d,  v1.2d
    uzp1        v0.2d,  v0.2d,  v0.2d
    rshrn       v3.8b,  v1.8h,  #3
    rshrn       v2.8b,  v0.8h,  #2
    uzp1        v0.8b,  v3.8b,  v2.8b
    uzp2        v1.8b,  v2.8b,  v3.8b
 pred8x8c_dc_end:
    // shared tail: expects x1 = FDEC_STRIDE, v0 = byte pattern for rows 0-3,
    // v1 = byte pattern for rows 4-7; rows are written in the order
    // 0, 2, 1, 3, 4, 6, 5, 7 through four row pointers
    add         x2,  x0,  #2 * FDEC_STRIDE
    add         x4,  x0,  #4 * FDEC_STRIDE
    add         x5,  x0,  #6 * FDEC_STRIDE
    st1         {v0.8b}, [x0], x1
    st1         {v0.8b}, [x2], x1
    st1         {v0.8b}, [x0]
    st1         {v0.8b}, [x2]
    st1         {v1.8b}, [x4], x1
    st1         {v1.8b}, [x5], x1
    st1         {v1.8b}, [x4]
    st1         {v1.8b}, [x5]
    ret
 endfunc
 // predict_8x8c_h_neon: 8x8 chroma horizontal prediction.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 // Each of the eight rows is filled with the byte immediately to its left.
 // Two rows are handled per .rept iteration: two replicating loads, two stores.
 function predict_8x8c_h_neon, export=1
    // x1 walks down the column one byte left of the block
    sub         x1,  x0,  #1
    mov         x7,  #FDEC_STRIDE
 .rept 4
    ld1r        {v0.8b}, [x1], x7
    ld1r        {v1.8b}, [x1], x7
    st1         {v0.8b}, [x0], x7
    st1         {v1.8b}, [x0], x7
 .endr
    ret
 endfunc
 // predict_8x8c_v_aarch64: 8x8 chroma vertical prediction, general registers only.
 //   x0 : destination block, rows FDEC_STRIDE apart
 // Loads the eight bytes directly above the block as one 64-bit word and
 // stores that word to rows 0-7.
 function predict_8x8c_v_aarch64, export=1
    ldur        x1,  [x0, #-FDEC_STRIDE]
 .irp c, 0,1,2,3,4,5,6,7
    str         x1,  [x0, #\c * FDEC_STRIDE]
 .endr
    ret
 endfunc
 // predict_8x8c_p_neon: 8x8 chroma planar prediction.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 // H and V are weighted (1..4, from p8weight) sums of differences between
 // neighbours on either side of the middle of the top row / left column.
 // b and c are (17 * H + 16) >> 5 and (17 * V + 16) >> 5; a is 16 * (last top
 // byte + last left byte). The pixel at column x, row y is
 // (a - 3 * (b + c) + 16 + x * b + y * c) >> 5, saturated to 8 bits.
 function predict_8x8c_p_neon, export=1
    sub         x3,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    add         x2,  x3,  #4
    sub         x3,  x3,  #1
    // v0 low word: top[-1..2]; v2 low word: top[4..7]
    ld1         {v0.s}[0], [x3]
    ld1         {v2.s}[0], [x2], x1
    // v0 lanes 4-7: top-left corner and left[0..2]; v3 lanes 0-3: left[4..7]
    ldcol.8     v0,  x3,  x1,  4,  hi=1
    add         x3,  x3,  x1
    ldcol.8     v3,  x3,  x1,  4
    movrel      x4,  p8weight
    movrel      x5,  p16weight
    uaddl       v4.8h,  v2.8b,  v3.8b
    rev32       v0.8b,  v0.8b
    trn1        v2.2s,  v2.2s,  v3.2s
    ld1         {v7.8h}, [x4]
    usubl       v2.8h,  v2.8b,  v0.8b
    mul         v2.8h,  v2.8h,  v7.8h
    ld1         {v0.8h}, [x5]
    saddlp      v2.4s,  v2.8h
    addp        v2.4s,  v2.4s,  v2.4s
    // 17 * {H, V}
    shl         v3.2s,  v2.2s,  #4
    add         v2.2s,  v2.2s,  v3.2s
    rshrn       v5.4h,  v2.4s,  #5    // b, c, x, x
    addp        v2.4h,  v5.4h,  v5.4h
    shl         v3.4h,  v2.4h,  #2
    sub         v3.4h,  v3.4h,  v2.4h // 3 * (b + c)
    rev64       v4.4h,  v4.4h
    // lane 0 of p16weight is 1, which becomes the +16 rounding term after the shift
    add         v4.4h,  v4.4h,  v0.4h
    shl         v2.4h,  v4.4h,  #4              // a
    sub         v2.4h,  v2.4h,  v3.4h           // a - 3 * (b + c) + 16
    // turn the 1..8 weights into 0..7 column indices
    ext         v0.16b, v0.16b, v0.16b, #14
    mov         v0.h[0],  wzr
    mul         v0.8h,  v0.8h,  v5.h[0]         // 0,1,2,3,4,5,6,7 * b
    dup         v1.8h,  v2.h[0]                 // pix
    dup         v2.8h,  v5.h[1]                 // c
    add         v1.8h,  v1.8h,  v0.8h           // pix + x*b
    mov         x3,  #8
 1:
    // one row per iteration: narrow with saturation, then step down by c
    subs        x3,  x3,  #1
    sqshrun     v0.8b,  v1.8h,  #5
    add         v1.8h,  v1.8h,  v2.8h
    st1         {v0.8b}, [x0], x1
    b.ne        1b
    ret
 endfunc
 // loadsum4: sum the left-neighbour bytes of four consecutive rows.
 //   wd         : destination register; receives the sum of the four bytes
 //   t1, t2, t3 : scratch registers (clobbered)
 //   x          : block base pointer; rows are FDEC_STRIDE apart
 //   idx        : index of the first of the four rows
 // Row 0 has a negative offset (-1), so it uses the unscaled ldurb form; all
 // other rows use ldrb.
 .macro loadsum4 wd, t1, t2, t3, x, idx
  .if \idx == 0
    ldurb       \wd,  [\x, #(\idx + 0) * FDEC_STRIDE - 1]
  .else
    ldrb        \wd,  [\x, #(\idx + 0) * FDEC_STRIDE - 1]
  .endif
    ldrb        \t1,  [\x, #(\idx + 1) * FDEC_STRIDE - 1]
    ldrb        \t2,  [\x, #(\idx + 2) * FDEC_STRIDE - 1]
    ldrb        \t3,  [\x, #(\idx + 3) * FDEC_STRIDE - 1]
    add         \wd,  \wd,  \t1
    add         \t1,  \t2,  \t3
    add         \wd,  \wd,  \t1
 .endm
 // predict_8x16c_h_neon: 8x16 chroma horizontal prediction.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 // Each of the sixteen rows is filled with the byte immediately to its left.
 // Even and odd rows use separate source (x2/x3) and destination (x0/x1)
 // pointers, each stepping by two rows; four rows are handled per iteration.
 function predict_8x16c_h_neon, export=1
    sub         x2,  x0,  #1
    add         x3,  x0,  #FDEC_STRIDE - 1
    mov         x7,  #2 * FDEC_STRIDE
    add         x1,  x0,  #FDEC_STRIDE
 .rept 4
    ld1r        {v0.8b}, [x2], x7
    ld1r        {v1.8b}, [x3], x7
    ld1r        {v2.8b}, [x2], x7
    ld1r        {v3.8b}, [x3], x7
    st1         {v0.8b}, [x0], x7
    st1         {v1.8b}, [x1], x7
    st1         {v2.8b}, [x0], x7
    st1         {v3.8b}, [x1], x7
 .endr
    ret
 endfunc
 // predict_8x16c_v_neon: 8x16 chroma vertical prediction.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 // Copies the eight bytes directly above the block into all sixteen rows.
 function predict_8x16c_v_neon, export=1
    sub         x1,  x0,  #FDEC_STRIDE
    mov         x2,  #2 * FDEC_STRIDE
    // the post-increment by two rows leaves x1 pointing at row 1, so x0 covers
    // the even rows and x1 the odd rows
    ld1         {v0.8b}, [x1], x2
 .rept 8
    st1         {v0.8b}, [x0], x2
    st1         {v0.8b}, [x1], x2
 .endr
    ret
 endfunc
 // predict_8x16c_p_neon: 8x16 chroma planar prediction.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 // H is a weighted sum over the top row and V over the 16-row left column
 // (weights from p16weight). b = (17 * H + 16) >> 5, c = (5 * V + 32) >> 6,
 // a = 16 * (last top byte + last left byte). The pixel at column x, row y is
 // (a - 3*b - 7*c + x*b + y*c) rounded, shifted right by 5 and saturated to
 // 8 bits.
 function predict_8x16c_p_neon, export=1
    movrel      x4,  p16weight
    ld1         {v17.8h}, [x4]
    sub         x3,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    add         x2,  x3,  #4
    sub         x3,  x3,  #1
    // v0: 8 bytes starting at the top-left corner; v2: 8 bytes starting at top[4]
    ld1         {v0.8b}, [x3]
    ld1         {v2.8b}, [x2], x1
    // v1: corner + left[0..6]; v3: left[8..15] (the add skips the left[7] row)
    ldcol.8     v1,  x3,  x1
    add         x3,  x3,  x1
    ldcol.8     v3,  x3,  x1
    ext         v4.8b,  v2.8b,  v2.8b,  #3
    ext         v5.8b,  v3.8b,  v3.8b,  #7
    rev32       v0.8b,  v0.8b
    rev64       v1.8b,  v1.8b
    uaddl       v4.8h,  v5.8b,  v4.8b // a * 1/16
    usubl       v2.8h,  v2.8b,  v0.8b
    mul         v2.8h,  v2.8h,  v17.8h
    saddlp      v2.4s,  v2.8h
    addp        v2.4s,  v2.4s,  v2.4s  // H
    usubl       v3.8h,  v3.8b,  v1.8b
    mul         v3.8h,  v3.8h,  v17.8h
    saddlp      v3.4s,  v3.8h
    addp        v3.4s,  v3.4s,  v3.4s
    addp        v3.4s,  v3.4s,  v3.4s  // V
    // rotate the 1..8 weights; with lane 0 cleared below they become 0..7
    ext         v17.16b, v17.16b, v17.16b, #14
    shl         v4.4h,  v4.4h,  #4     // a
    shl         v6.2s,  v2.2s,  #4     // 16 * H
    shl         v7.2s,  v3.2s,  #2     // 4 * V
    add         v2.2s,  v2.2s,  v6.2s  // 17 * H
    add         v3.2s,  v3.2s,  v7.2s  // 5 * V
    rshrn       v2.4h,  v2.4s,  #5     // b
    rshrn       v3.4h,  v3.4s,  #6     // c
    mov         v17.h[0],  wzr
    sub         v4.4h,  v4.4h,  v2.4h  // a - b
    shl         v6.4h,  v2.4h,  #1     // 2 * b
    add         v4.4h,  v4.4h,  v3.4h  // a - b + c
    shl         v7.4h,  v3.4h,  #3     // 8 * c
    sub         v4.4h,  v4.4h,  v6.4h  // a - 3b + c
    sub         v4.4h,  v4.4h,  v7.4h  // a - 3b - 7c
    mul         v0.8h,  v17.8h, v2.h[0]         // 0,1,2,3,4,5,6,7 * b
    dup         v1.8h,  v4.h[0]                 // i00
    dup         v2.8h,  v3.h[0]                 // c
    add         v1.8h,  v1.8h,  v0.8h           // pix + {0..7}*b
    mov         x3,  #16
 1:
    // two rows per iteration; sqrshrun supplies the rounding (+16) itself
    subs        x3,  x3,  #2
    sqrshrun    v4.8b,  v1.8h,  #5
    add         v1.8h,  v1.8h,  v2.8h
    sqrshrun    v5.8b,  v1.8h,  #5
    st1         {v4.8b}, [x0], x1
    add         v1.8h,  v1.8h,  v2.8h
    st1         {v5.8b}, [x0], x1
    b.ne        1b
    ret
 endfunc
 // predict_8x16c_dc_neon: 8x16 chroma DC prediction from top and left neighbours.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 // s0/s1 are the sums of top bytes 0..3 / 4..7; s2..s5 are the sums of the left
 // bytes of rows 0..3, 4..7, 8..11 and 12..15. For each group of four rows the
 // left and right 4-pixel halves are filled with:
 //   rows  0-3 : (s0 + s2 + 4) >> 3   |  (s1 + 2) >> 2
 //   rows  4-7 : (s3 + 2) >> 2        |  (s1 + s3 + 4) >> 3
 //   rows  8-11: (s4 + 2) >> 2        |  (s1 + s4 + 4) >> 3
 //   rows 12-15: (s5 + 2) >> 2        |  (s1 + s5 + 4) >> 3
 // The single-neighbour cases are computed as (2*s + 4) >> 3.
 function predict_8x16c_dc_neon, export=1
    mov         x1,  #FDEC_STRIDE
    sub         x10, x0,  #FDEC_STRIDE
    loadsum4    w2, w3, w4, w5, x0, 0
    ld1         {v6.8b}, [x10]
    loadsum4    w6, w7, w8, w9, x0, 4
    uaddlp      v6.4h,  v6.8b
    dup         v22.8h, w2              // s2
    dup         v23.8h, w6              // s3
    loadsum4    w2, w3, w4, w5, x0, 8
    addp        v6.4h,  v6.4h,  v6.4h   // s0, s1
    loadsum4    w6, w7, w8, w9, x0, 12
    dup         v20.8h, v6.h[0]         // s0
    dup         v21.8h, v6.h[1]         // s1
    dup         v24.8h, w2              // s4
    dup         v25.8h, w6              // s5
    // each ext builds {left-side sum x4, s1 x4} as eight halfwords
    ext         v16.16b, v20.16b, v21.16b, #8
    ext         v17.16b, v22.16b, v21.16b, #8
    ext         v1.16b,  v23.16b, v21.16b, #8
    ext         v2.16b,  v24.16b, v21.16b, #8
    ext         v3.16b,  v25.16b, v21.16b, #8
    add         v0.8h,  v16.8h, v17.8h
    add         v1.8h,  v1.8h,  v23.8h
    add         v2.8h,  v2.8h,  v24.8h
    add         v3.8h,  v3.8h,  v25.8h
    rshrn       v0.8b,  v0.8h,  #3
    rshrn       v1.8b,  v1.8h,  #3
    rshrn       v2.8b,  v2.8h,  #3
    rshrn       v3.8b,  v3.8h,  #3
    // four row pointers, one per group of four rows
    add         x11, x0,  #4  * FDEC_STRIDE
    add         x12, x0,  #8  * FDEC_STRIDE
    add         x13, x0,  #12 * FDEC_STRIDE
 .rept 4
    st1         {v0.8b}, [x0],  x1
    st1         {v1.8b}, [x11], x1
    st1         {v2.8b}, [x12], x1
    st1         {v3.8b}, [x13], x1
 .endr
    ret
 endfunc
 // predict_8x16c_dc_left_neon: 8x16 chroma DC prediction from the left column only.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 // For each group of four rows (0-3, 4-7, 8-11, 12-15) all eight pixels of
 // every row are set to (sum of that group's four left bytes + 2) >> 2.
 // Loads, scalar adds and vector ops for the different groups are interleaved.
 function predict_8x16c_dc_left_neon, export=1
    mov         x1,  #FDEC_STRIDE
    ldurb       w2,  [x0, # 0 * FDEC_STRIDE - 1]
    ldrb        w3,  [x0, # 1 * FDEC_STRIDE - 1]
    ldrb        w4,  [x0, # 2 * FDEC_STRIDE - 1]
    ldrb        w5,  [x0, # 3 * FDEC_STRIDE - 1]
    add         w2,  w2,  w3
    ldrb        w6,  [x0, # 4 * FDEC_STRIDE - 1]
    add         w4,  w4,  w5
    ldrb        w7,  [x0, # 5 * FDEC_STRIDE - 1]
    add         w2,  w2,  w4
    ldrb        w8,  [x0, # 6 * FDEC_STRIDE - 1]
    ldrb        w9,  [x0, # 7 * FDEC_STRIDE - 1]
    // v0 = DC of rows 0-3
    dup         v0.8h,  w2
    add         w6,  w6,  w7
    rshrn       v0.8b,  v0.8h,  #2
    add         w8,  w8,  w9
    ldrb        w10, [x0, # 8 * FDEC_STRIDE - 1]
    ldrb        w11, [x0, # 9 * FDEC_STRIDE - 1]
    add         w6,  w6,  w8
    ldrb        w12, [x0, #10 * FDEC_STRIDE - 1]
    ldrb        w13, [x0, #11 * FDEC_STRIDE - 1]
    // v1 = DC of rows 4-7
    dup         v1.8h,  w6
    add         w10,  w10,  w11
    rshrn       v1.8b,  v1.8h,  #2
    add         w12,  w12,  w13
    ldrb        w2,  [x0, #12 * FDEC_STRIDE - 1]
    ldrb        w3,  [x0, #13 * FDEC_STRIDE - 1]
    add         w10,  w10,  w12
    ldrb        w4,  [x0, #14 * FDEC_STRIDE - 1]
    ldrb        w5,  [x0, #15 * FDEC_STRIDE - 1]
    // v2 = DC of rows 8-11
    dup         v2.8h,  w10
    add         w2,  w2,  w3
    rshrn       v2.8b,  v2.8h,  #2
    add         w4,  w4,  w5
    // all left-column loads are done (they use x0 as base), so the stores may
    // now advance x0
    st1         {v0.8b}, [x0], x1
    st1         {v0.8b}, [x0], x1
    add         w2,  w2,  w4
    st1         {v0.8b}, [x0], x1
    // v3 = DC of rows 12-15
    dup         v3.8h,  w2
    st1         {v0.8b}, [x0], x1
    rshrn       v3.8b,  v3.8h,  #2
    // rows 4-15: four stores each of v1, v2, v3
 .irp  idx, 1, 2, 3
 .rept 4
    st1         {v\idx\().8b}, [x0], x1
 .endr
 .endr
    ret
 endfunc
 // predict_8x16c_dc_top_neon: 8x16 chroma DC prediction from the top row only.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 // The left four pixels of every row get (sum of top[0..3] + 2) >> 2 and the
 // right four get (sum of top[4..7] + 2) >> 2.
 function predict_8x16c_dc_top_neon, export=1
    sub         x2,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    ld1         {v0.8b}, [x2]
    uaddlp      v0.4h,  v0.8b
    addp        v0.4h,  v0.4h,  v0.4h
    rshrn       v4.8b,  v0.8h,  #2
    dup         v0.8b,  v4.b[0]
    dup         v1.8b,  v4.b[1]
    // low word = left DC x4, high word = right DC x4
    ext         v0.8b,  v0.8b,  v1.8b,  #4
 .rept 16
    st1         {v0.8b}, [x0], x1
 .endr
    ret
 endfunc
 // predict_16x16_dc_top_neon: 16x16 DC prediction from the top row only.
 //   x0 : destination block, rows FDEC_STRIDE apart
 // Computes (sum of the 16 bytes above the block + 8) >> 4, replicates it and
 // branches to pred16x16_dc_end (inside predict_16x16_dc_neon), which expects
 // x1 = FDEC_STRIDE and the fill pattern in v0.16b.
 function predict_16x16_dc_top_neon, export=1
    sub         x2,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    ld1         {v0.16b}, [x2]
    uaddlv      h0,     v0.16b
    rshrn       v0.8b,  v0.8h,  #4
    dup         v0.16b, v0.b[0]
    b           pred16x16_dc_end
 endfunc
 // predict_16x16_dc_left_neon: 16x16 DC prediction from the left column only.
 //   x0 : destination block, rows FDEC_STRIDE apart
 // Gathers the 16 bytes left of the block with ldcol.16, computes
 // (sum + 8) >> 4, replicates it and branches to pred16x16_dc_end (inside
 // predict_16x16_dc_neon), which expects x1 = FDEC_STRIDE and the fill
 // pattern in v0.16b.
 function predict_16x16_dc_left_neon, export=1
    sub         x2,  x0,  #1
    mov         x1,  #FDEC_STRIDE
    ldcol.16    v0,  x2,  x1
    uaddlv      h0,     v0.16b
    rshrn       v0.8b,  v0.8h,  #4
    dup         v0.16b, v0.b[0]
    b           pred16x16_dc_end
 endfunc
 // predict_16x16_dc_neon: 16x16 DC prediction from top and left neighbours.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 // Fills the block with (sum of 16 top bytes + sum of 16 left bytes + 16) >> 5.
 // The label pred16x16_dc_end is a shared store tail that the dc_top and
 // dc_left variants branch into.
 function predict_16x16_dc_neon, export=1
    sub         x3,  x0,  #FDEC_STRIDE
    sub         x2,  x0,  #1
    mov         x1,  #FDEC_STRIDE
    ld1         {v0.16b}, [x3]
    ldcol.16    v1,  x2,  x1
    uaddlv      h0,     v0.16b
    uaddlv      h1,     v1.16b
    add         v0.4h,  v0.4h,  v1.4h
    rshrn       v0.8b,  v0.8h,  #5
    dup         v0.16b, v0.b[0]
 pred16x16_dc_end:
    // shared tail: expects x1 = FDEC_STRIDE and the fill pattern in v0.16b
 .rept 16
    st1         {v0.16b}, [x0], x1
 .endr
    ret
 endfunc
 // predict_16x16_h_neon: 16x16 horizontal prediction.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 // Each of the sixteen rows is filled with the byte immediately to its left.
 // Two rows are handled per .rept iteration.
 function predict_16x16_h_neon, export=1
    // x1 walks down the column one byte left of the block
    sub         x1,  x0,  #1
    mov         x7, #FDEC_STRIDE
 .rept 8
    ld1r        {v0.16b}, [x1], x7
    ld1r        {v1.16b}, [x1], x7
    st1         {v0.16b}, [x0], x7
    st1         {v1.16b}, [x0], x7
 .endr
    ret
 endfunc
 // predict_16x16_v_neon: 16x16 vertical prediction.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 // Copies the sixteen bytes directly above the block into all sixteen rows.
 function predict_16x16_v_neon, export=1
    // step up one row; the post-indexed load brings x0 back to row 0
    sub         x0,  x0,  #FDEC_STRIDE
    mov         x7,  #FDEC_STRIDE
    ld1         {v0.16b}, [x0], x7
 .rept 16
    st1         {v0.16b}, [x0], x7
 .endr
    ret
 endfunc
 // predict_16x16_p_neon: 16x16 planar prediction.
 //   x0 : destination block, rows FDEC_STRIDE apart (advanced by the stores)
 // H and V are weighted (1..8, from p16weight) sums of differences between
 // neighbours on either side of the middle of the top row / left column.
 // b = (5 * H + 32) >> 6, c = (5 * V + 32) >> 6, a = 16 * (last top byte +
 // last left byte). The pixel at column x, row y is
 // (a - 7 * (b + c) + 16 + x * b + y * c) >> 5, saturated to 8 bits.
 function predict_16x16_p_neon, export=1
    sub         x3,  x0,  #FDEC_STRIDE
    mov         x1,  #FDEC_STRIDE
    add         x2,  x3,  #8
    sub         x3,  x3,  #1
    // v0: corner + top[0..6]; v2: top[8..15]
    ld1         {v0.8b}, [x3]
    ld1         {v2.8b}, [x2], x1
    // v1: corner + left[0..6]; v3: left[8..15] (the add skips the left[7] row)
    ldcol.8     v1,  x3,  x1
    add         x3,  x3,  x1
    ldcol.8     v3,  x3,  x1
    rev64       v0.8b,  v0.8b
    rev64       v1.8b,  v1.8b
    movrel      x4,  p16weight
    uaddl       v4.8h,  v2.8b,  v3.8b
    ld1         {v7.8h}, [x4]
    usubl       v2.8h,  v2.8b,  v0.8b
    usubl       v3.8h,  v3.8b,  v1.8b
    mul         v2.8h,  v2.8h,  v7.8h
    mul         v3.8h,  v3.8h,  v7.8h
    saddlp      v2.4s,  v2.8h
    saddlp      v3.4s,  v3.8h
    addp        v2.4s,  v2.4s,  v3.4s
    addp        v2.4s,  v2.4s,  v2.4s
    // 5 * {H, V}
    shl         v3.2s,  v2.2s,  #2
    add         v2.2s,  v2.2s,  v3.2s
    rshrn       v5.4h,  v2.4s,  #6    // b, c, x, x
    addp        v2.4h,  v5.4h,  v5.4h
    shl         v3.4h,  v2.4h,  #3
    sub         v3.4h,  v3.4h,  v2.4h // 7 * (b + c)
    // bring top[15] + left[15] into lane 0
    ext         v4.16b, v4.16b, v4.16b, #14
    // lane 0 of p16weight is 1, which becomes the +16 rounding term after the shift
    add         v4.4h,  v4.4h,  v7.4h
    shl         v2.4h,  v4.4h,  #4              // a
    sub         v2.4h,  v2.4h,  v3.4h           // a - 7 * (b + c) + 16
    // turn the 1..8 weights into 0..7 column indices
    ext         v7.16b, v7.16b, v7.16b, #14
    mov         v7.h[0],  wzr
    dup         v3.8h,  v5.h[0]
    mul         v0.8h,  v7.8h,  v5.h[0]         // 0,1,2,3,4,5,6,7 * b
    dup         v1.8h,  v2.h[0]                 // pix
    dup         v2.8h,  v5.h[1]                 // c
    // v3 = 8 * b: offset of columns 8..15 relative to columns 0..7
    shl         v3.8h,  v3.8h,  #3
    add         v1.8h,  v1.8h,  v0.8h           // pix + x*b
    add         v3.8h,  v3.8h,  v1.8h           // pix + x{8-15}*b
    mov         x3,  #16
 1:
    // one row per iteration: narrow both halves with saturation, step down by c
    subs        x3,  x3,  #1
    sqshrun     v0.8b,  v1.8h,  #5
    add         v1.8h,  v1.8h,  v2.8h
    sqshrun2    v0.16b, v3.8h,  #5
    add         v3.8h,  v3.8h,  v2.8h
    st1         {v0.16b}, [x0], x1
    b.ne        1b
    ret
 endfunc
--- a/common/aarch64/predict-c.c
+++ b/common/aarch64/predict-c.c
@@ -0,0 +1,116 @@
 /*****************************************************************************
 * predict.c: aarch64 intra prediction
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "common/common.h"
 #include "predict.h"
 #include "pixel.h"
 /* Install the aarch64 4x4 intra predictors into pf[] according to the CPU
  * capability bits in cpu. Entries are only replaced in 8-bit builds
  * (!HIGH_BIT_DEPTH); otherwise pf[] is left untouched. */
 void x264_predict_4x4_init_aarch64( uint32_t cpu, x264_predict_t pf[12] )
 {
 #if !HIGH_BIT_DEPTH
    const int have_armv8 = !!( cpu & X264_CPU_ARMV8 );
    const int have_neon  = !!( cpu & X264_CPU_NEON );
    /* H and V are gated on the ARMV8 flag alone */
    if( have_armv8 )
    {
        pf[I_PRED_4x4_H] = x264_predict_4x4_h_aarch64;
        pf[I_PRED_4x4_V] = x264_predict_4x4_v_aarch64;
    }
    /* the remaining modes additionally require NEON */
    if( have_neon )
    {
        pf[I_PRED_4x4_DC]     = x264_predict_4x4_dc_neon;
        pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon;
        pf[I_PRED_4x4_DDL]    = x264_predict_4x4_ddl_neon;
        pf[I_PRED_4x4_DDR]    = x264_predict_4x4_ddr_neon;
    }
 #endif // !HIGH_BIT_DEPTH
 }
 /* Install the aarch64 8x8 chroma intra predictors into pf[] according to the
  * CPU capability bits in cpu. Entries are only replaced in 8-bit builds
  * (!HIGH_BIT_DEPTH); otherwise pf[] is left untouched. */
 void x264_predict_8x8c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] )
 {
 #if !HIGH_BIT_DEPTH
    /* vertical prediction is gated on the ARMV8 flag alone */
    if( cpu & X264_CPU_ARMV8 )
        pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_aarch64;
    /* everything else needs NEON */
    if( cpu & X264_CPU_NEON )
    {
        pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
        pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
        pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
        pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_neon;
        pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_neon;
    }
 #endif // !HIGH_BIT_DEPTH
 }
 /* Install the NEON 8x16 chroma intra predictors into pf[] when cpu reports
  * NEON. Entries are only replaced in 8-bit builds (!HIGH_BIT_DEPTH). */
 void x264_predict_8x16c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] )
 {
    if( cpu & X264_CPU_NEON )
    {
 #if !HIGH_BIT_DEPTH
        pf[I_PRED_CHROMA_V]       = x264_predict_8x16c_v_neon;
        pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_neon;
        pf[I_PRED_CHROMA_DC]      = x264_predict_8x16c_dc_neon;
        pf[I_PRED_CHROMA_P]       = x264_predict_8x16c_p_neon;
        pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x16c_dc_left_neon;
        pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x16c_dc_top_neon;
 #endif // !HIGH_BIT_DEPTH
    }
 }
 /* Install the NEON 8x8 intra predictors into pf[] when cpu reports NEON.
  * Entries are only replaced in 8-bit builds (!HIGH_BIT_DEPTH).
  * predict_filter is accepted but not modified here. */
 void x264_predict_8x8_init_aarch64( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
 {
    if( cpu & X264_CPU_NEON )
    {
 #if !HIGH_BIT_DEPTH
        pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon;
        pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
        pf[I_PRED_8x8_VL]  = x264_predict_8x8_vl_neon;
        pf[I_PRED_8x8_VR]  = x264_predict_8x8_vr_neon;
        pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
        pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
        pf[I_PRED_8x8_HD]  = x264_predict_8x8_hd_neon;
        pf[I_PRED_8x8_HU]  = x264_predict_8x8_hu_neon;
        pf[I_PRED_8x8_V]   = x264_predict_8x8_v_neon;
 #endif // !HIGH_BIT_DEPTH
    }
 }
 /* Install the NEON 16x16 intra predictors into pf[] when cpu reports NEON.
  * Entries are only replaced in 8-bit builds (!HIGH_BIT_DEPTH). */
 void x264_predict_16x16_init_aarch64( uint32_t cpu, x264_predict_t pf[7] )
 {
    if( cpu & X264_CPU_NEON )
    {
 #if !HIGH_BIT_DEPTH
        pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_neon;
        pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_neon;
        pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_neon;
        pf[I_PRED_16x16_H]       = x264_predict_16x16_h_neon;
        pf[I_PRED_16x16_V]       = x264_predict_16x16_v_neon;
        pf[I_PRED_16x16_P]       = x264_predict_16x16_p_neon;
 #endif // !HIGH_BIT_DEPTH
    }
 }
--- a/common/aarch64/predict.h
+++ b/common/aarch64/predict.h
@@ -0,0 +1,119 @@
 /*****************************************************************************
 * predict.h: aarch64 intra prediction
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_AARCH64_PREDICT_H
 #define X264_AARCH64_PREDICT_H
 // Predictors exported with an _aarch64 suffix rather than _neon.  Each takes a
 // pointer to the block to predict into; x264_template() (defined elsewhere)
 // wraps the symbol name.
 // NOTE(review): presumably these are implemented without NEON instructions,
 // hence the different suffix -- confirm against predict-a.S.
 #define x264_predict_4x4_h_aarch64 x264_template(predict_4x4_h_aarch64)
 void x264_predict_4x4_h_aarch64( uint8_t *src );
 #define x264_predict_4x4_v_aarch64 x264_template(predict_4x4_v_aarch64)
 void x264_predict_4x4_v_aarch64( uint8_t *src );
 #define x264_predict_8x8c_v_aarch64 x264_template(predict_8x8c_v_aarch64)
 void x264_predict_8x8c_v_aarch64( uint8_t *src );
 // for the merged 4x4 intra sad/satd which expects unified suffix
 // (the _neon names below are plain aliases of the _aarch64 symbols above)
 #define x264_predict_4x4_h_neon x264_predict_4x4_h_aarch64
 #define x264_predict_4x4_v_neon x264_predict_4x4_v_aarch64
 #define x264_predict_8x8c_v_neon x264_predict_8x8c_v_aarch64
 // NEON predictor prototypes.  Every symbol is routed through x264_template()
 // (defined elsewhere).  The 8x8 predictors take an extra edge[36] array; all
 // other predictors take only the block pointer.

 // 4x4 predictors
 #define x264_predict_4x4_dc_top_neon x264_template(predict_4x4_dc_top_neon)
 void x264_predict_4x4_dc_top_neon( uint8_t *src );
 #define x264_predict_4x4_ddr_neon x264_template(predict_4x4_ddr_neon)
 void x264_predict_4x4_ddr_neon( uint8_t *src );
 #define x264_predict_4x4_ddl_neon x264_template(predict_4x4_ddl_neon)
 void x264_predict_4x4_ddl_neon( uint8_t *src );
 // 8x8 chroma predictors (installed under I_PRED_CHROMA_* by the init code)
 #define x264_predict_8x8c_dc_top_neon x264_template(predict_8x8c_dc_top_neon)
 void x264_predict_8x8c_dc_top_neon( uint8_t *src );
 #define x264_predict_8x8c_dc_left_neon x264_template(predict_8x8c_dc_left_neon)
 void x264_predict_8x8c_dc_left_neon( uint8_t *src );
 #define x264_predict_8x8c_p_neon x264_template(predict_8x8c_p_neon)
 void x264_predict_8x8c_p_neon( uint8_t *src );
 // 8x16 chroma predictors (installed under I_PRED_CHROMA_* by the init code)
 #define x264_predict_8x16c_dc_left_neon x264_template(predict_8x16c_dc_left_neon)
 void x264_predict_8x16c_dc_left_neon( uint8_t *src );
 #define x264_predict_8x16c_dc_top_neon x264_template(predict_8x16c_dc_top_neon)
 void x264_predict_8x16c_dc_top_neon( uint8_t *src );
 #define x264_predict_8x16c_p_neon x264_template(predict_8x16c_p_neon)
 void x264_predict_8x16c_p_neon( uint8_t *src );
 // 8x8 predictors (block pointer plus edge[36])
 #define x264_predict_8x8_ddl_neon x264_template(predict_8x8_ddl_neon)
 void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
 #define x264_predict_8x8_ddr_neon x264_template(predict_8x8_ddr_neon)
 void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
 #define x264_predict_8x8_vl_neon x264_template(predict_8x8_vl_neon)
 void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
 #define x264_predict_8x8_vr_neon x264_template(predict_8x8_vr_neon)
 void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
 #define x264_predict_8x8_hd_neon x264_template(predict_8x8_hd_neon)
 void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
 #define x264_predict_8x8_hu_neon x264_template(predict_8x8_hu_neon)
 void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
 // 16x16 predictors
 #define x264_predict_16x16_dc_top_neon x264_template(predict_16x16_dc_top_neon)
 void x264_predict_16x16_dc_top_neon( uint8_t *src );
 #define x264_predict_16x16_dc_left_neon x264_template(predict_16x16_dc_left_neon)
 void x264_predict_16x16_dc_left_neon( uint8_t *src );
 #define x264_predict_16x16_p_neon x264_template(predict_16x16_p_neon)
 void x264_predict_16x16_p_neon( uint8_t *src );
 // Second pass over the same block sizes: DC / V / H variants
 #define x264_predict_4x4_dc_neon x264_template(predict_4x4_dc_neon)
 void x264_predict_4x4_dc_neon( uint8_t *src );
 #define x264_predict_8x8_v_neon x264_template(predict_8x8_v_neon)
 void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
 #define x264_predict_8x8_h_neon x264_template(predict_8x8_h_neon)
 void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
 #define x264_predict_8x8_dc_neon x264_template(predict_8x8_dc_neon)
 void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
 #define x264_predict_8x8c_dc_neon x264_template(predict_8x8c_dc_neon)
 void x264_predict_8x8c_dc_neon( uint8_t *src );
 #define x264_predict_8x8c_h_neon x264_template(predict_8x8c_h_neon)
 void x264_predict_8x8c_h_neon( uint8_t *src );
 #define x264_predict_8x16c_v_neon x264_template(predict_8x16c_v_neon)
 void x264_predict_8x16c_v_neon( uint8_t *src );
 #define x264_predict_8x16c_h_neon x264_template(predict_8x16c_h_neon)
 void x264_predict_8x16c_h_neon( uint8_t *src );
 #define x264_predict_8x16c_dc_neon x264_template(predict_8x16c_dc_neon)
 void x264_predict_8x16c_dc_neon( uint8_t *src );
 #define x264_predict_16x16_v_neon x264_template(predict_16x16_v_neon)
 void x264_predict_16x16_v_neon( uint8_t *src );
 #define x264_predict_16x16_h_neon x264_template(predict_16x16_h_neon)
 void x264_predict_16x16_h_neon( uint8_t *src );
 #define x264_predict_16x16_dc_neon x264_template(predict_16x16_dc_neon)
 void x264_predict_16x16_dc_neon( uint8_t *src );
 // Table initialisers: each receives the CPU capability flags in cpu and a
 // caller-owned function-pointer table pf to fill in.  The 8x8 variant also
 // receives a predict_filter pointer.
 #define x264_predict_4x4_init_aarch64 x264_template(predict_4x4_init_aarch64)
 void x264_predict_4x4_init_aarch64( uint32_t cpu, x264_predict_t pf[12] );
 #define x264_predict_8x8_init_aarch64 x264_template(predict_8x8_init_aarch64)
 void x264_predict_8x8_init_aarch64( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
 #define x264_predict_8x8c_init_aarch64 x264_template(predict_8x8c_init_aarch64)
 void x264_predict_8x8c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] );
 #define x264_predict_8x16c_init_aarch64 x264_template(predict_8x16c_init_aarch64)
 void x264_predict_8x16c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] );
 #define x264_predict_16x16_init_aarch64 x264_template(predict_16x16_init_aarch64)
 void x264_predict_16x16_init_aarch64( uint32_t cpu, x264_predict_t pf[7] );
 #endif /* X264_AARCH64_PREDICT_H */
--- a/common/aarch64/quant-a.S
+++ b/common/aarch64/quant-a.S
--- a/common/aarch64/quant.h
+++ b/common/aarch64/quant.h
@@ -0,0 +1,95 @@
 /*****************************************************************************
 * quant.h: aarch64 quantization and level-run
 *****************************************************************************
 * Copyright (C) 2005-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_AARCH64_QUANT_H
 #define X264_AARCH64_QUANT_H
 // Prototypes for the aarch64 quantization helpers.  Every symbol is routed
 // through x264_template() (defined elsewhere).

 // Quantization: coefficients in dct are modified using mf and bias.
 // Note that the _aarch64 2x2 variant is declared on int16_t, the _neon
 // variants on dctcoef.
 #define x264_quant_2x2_dc_aarch64 x264_template(quant_2x2_dc_aarch64)
 int x264_quant_2x2_dc_aarch64( int16_t dct[4], int mf, int bias );
 #define x264_quant_2x2_dc_neon x264_template(quant_2x2_dc_neon)
 int x264_quant_2x2_dc_neon( dctcoef dct[4], int mf, int bias );
 #define x264_quant_4x4_dc_neon x264_template(quant_4x4_dc_neon)
 int x264_quant_4x4_dc_neon( dctcoef dct[16], int mf, int bias );
 #define x264_quant_4x4_neon x264_template(quant_4x4_neon)
 int x264_quant_4x4_neon( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
 #define x264_quant_4x4x4_neon x264_template(quant_4x4x4_neon)
 int x264_quant_4x4x4_neon( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
 #define x264_quant_8x8_neon x264_template(quant_8x8_neon)
 int x264_quant_8x8_neon( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
 // Dequantization: dequant_mf has six rows and is used together with i_qp.
 #define x264_dequant_4x4_dc_neon x264_template(dequant_4x4_dc_neon)
 void x264_dequant_4x4_dc_neon( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
 #define x264_dequant_4x4_neon x264_template(dequant_4x4_neon)
 void x264_dequant_4x4_neon( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
 #define x264_dequant_8x8_neon x264_template(dequant_8x8_neon)
 void x264_dequant_8x8_neon( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
 // Decimation scores for 15, 16 and 64 coefficient blocks.
 #define x264_decimate_score15_neon x264_template(decimate_score15_neon)
 int x264_decimate_score15_neon( dctcoef * );
 #define x264_decimate_score16_neon x264_template(decimate_score16_neon)
 int x264_decimate_score16_neon( dctcoef * );
 #define x264_decimate_score64_neon x264_template(decimate_score64_neon)
 int x264_decimate_score64_neon( dctcoef * );
 // coeff_last: BIT_DEPTH = 8
 #define x264_coeff_last4_aarch64 x264_template(coeff_last4_aarch64)
 int x264_coeff_last4_aarch64( dctcoef * );
 #define x264_coeff_last8_aarch64 x264_template(coeff_last8_aarch64)
 int x264_coeff_last8_aarch64( dctcoef * );
 // coeff_last: BIT_DEPTH = 10
 #define x264_coeff_last4_neon x264_template(coeff_last4_neon)
 int x264_coeff_last4_neon( dctcoef * );
 #define x264_coeff_last8_neon x264_template(coeff_last8_neon)
 int x264_coeff_last8_neon( dctcoef * );
 #define x264_coeff_last15_neon x264_template(coeff_last15_neon)
 int x264_coeff_last15_neon( dctcoef * );
 #define x264_coeff_last16_neon x264_template(coeff_last16_neon)
 int x264_coeff_last16_neon( dctcoef * );
 #define x264_coeff_last64_neon x264_template(coeff_last64_neon)
 int x264_coeff_last64_neon( dctcoef * );
 // coeff_level_run: BIT_DEPTH = 8
 #define x264_coeff_level_run4_aarch64 x264_template(coeff_level_run4_aarch64)
 int x264_coeff_level_run4_aarch64( dctcoef *, x264_run_level_t * );
 // coeff_level_run: BIT_DEPTH = 10
 #define x264_coeff_level_run4_neon x264_template(coeff_level_run4_neon)
 int x264_coeff_level_run4_neon( dctcoef *, x264_run_level_t * );
 #define x264_coeff_level_run8_neon x264_template(coeff_level_run8_neon)
 int x264_coeff_level_run8_neon( dctcoef *, x264_run_level_t * );
 #define x264_coeff_level_run15_neon x264_template(coeff_level_run15_neon)
 int x264_coeff_level_run15_neon( dctcoef *, x264_run_level_t * );
 #define x264_coeff_level_run16_neon x264_template(coeff_level_run16_neon)
 int x264_coeff_level_run16_neon( dctcoef *, x264_run_level_t * );
 // DCT denoising
 #define x264_denoise_dct_neon x264_template(denoise_dct_neon)
 void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
 #endif
--- a/common/arm/asm.S
+++ b/common/arm/asm.S
@@ -0,0 +1,263 @@
 /*****************************************************************************
 * asm.S: arm utility macros
 *****************************************************************************
 * Copyright (C) 2008-2025 x264 project
 *
 * Authors: Mans Rullgard <mans@mansr.com>
 *          David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "config.h"
 // Unified ARM/Thumb syntax is used by all assembly that includes this file.
 .syntax unified
 // ELF targets only: select ARMv7-A with the NEON FPU for the assembler.
 #ifdef __ELF__
 .arch armv7-a
 .fpu neon
 #endif
 // Two-step token paste so that arguments which are macros are expanded first.
 #define GLUE(a, b) a ## b
 #define JOIN(a, b) GLUE(a, b)
 // PREFIX builds put a leading underscore on symbol names.
 #ifdef PREFIX
 #   define BASE _x264_
 #   define SYM_PREFIX _
 #else
 #   define BASE x264_
 #   define SYM_PREFIX
 #endif
 // EXTERN_ASM is the prefix of exported functions: BASE followed by the value
 // of BIT_DEPTH and an underscore when BIT_DEPTH is defined, plain BASE otherwise.
 #ifdef BIT_DEPTH
 #   define EXTERN_ASM JOIN(JOIN(BASE, BIT_DEPTH), _)
 #else
 #   define EXTERN_ASM BASE
 #endif
 // X(s):    s with the EXTERN_ASM prefix
 // X264(s): s with the BASE prefix
 // EXT(s):  s with only SYM_PREFIX
 #define X(s) JOIN(EXTERN_ASM, s)
 #define X264(s) JOIN(BASE, s)
 #define EXT(s) JOIN(SYM_PREFIX, s)
 // Line-prefix switches.  Each expands to nothing when it applies and to "@"
 // otherwise; "@" starts an assembler comment, so the rest of a line carrying
 // the prefix is dropped on targets where the prefix does not apply.
 #ifdef __ELF__
 #   define ELF
 #else
 #   define ELF @
 #endif
 #ifdef __MACH__
 #   define MACH
 #   define NONMACH @
 #else
 #   define MACH @
 #   define NONMACH
 #endif
 #if HAVE_AS_FUNC
 #   define FUNC
 #else
 #   define FUNC @
 #endif
 // Only Linux and OpenBSD builds place relocatable constants in .data.rel.ro
 // (consumed by the const macro).
 #if SYS_LINUX || SYS_OPENBSD
 #define HAVE_SECTION_DATA_REL_RO 1
 #else
 #define HAVE_SECTION_DATA_REL_RO 0
 #endif
 // require8 [val]: emit EABI build attribute 24 with the given value
 // (default 1).  ELF targets only.
 // NOTE(review): tag 24 is Tag_ABI_align_needed in the ARM EABI addenda -- confirm.
 .macro require8, val=1
 ELF     .eabi_attribute 24, \val
 .endm
 // preserve8 [val]: emit EABI build attribute 25 with the given value
 // (default 1).  ELF targets only.
 // NOTE(review): tag 25 is Tag_ABI_align_preserved in the ARM EABI addenda -- confirm.
 .macro preserve8, val=1
 ELF     .eabi_attribute 25, \val
 .endm
 // function name [, export=1]
 // Opens a function in .text and defines a one-shot "endfunc" macro that
 // closes it again.
 //   export == 1: the label is name prefixed with EXTERN_ASM, declared .global
 //                (and .hidden / typed as a function on ELF).
 //   otherwise:   the label is name as written; no .global is emitted.
 .macro function name, export=1
    // endfunc: emit the ELF symbol size for the label chosen below, close the
    // .func debug scope when FUNC is active, then remove itself so the next
    // function can define a fresh endfunc.
    .macro endfunc
 .if \export
 ELF     .size   EXTERN_ASM\name, . - EXTERN_ASM\name
 .else
 ELF     .size   \name, . - \name
 .endif
 FUNC    .endfunc
        .purgem endfunc
    .endm
        .text
        .align  2
 .if \export == 1
        .global EXTERN_ASM\name
 ELF     .hidden EXTERN_ASM\name
 ELF     .type   EXTERN_ASM\name, %function
 FUNC    .func   EXTERN_ASM\name
 EXTERN_ASM\name:
 .else
 ELF     .hidden \name
 ELF     .type   \name, %function
 FUNC    .func   \name
 \name:
 .endif
 .endm
 // const name [, align=2] [, relocate=0]
 // Opens a read-only data object labelled name and defines a one-shot
 // "endconst" macro that records its ELF size.
 //   relocate != 0 on targets with HAVE_SECTION_DATA_REL_RO: .data.rel.ro
 //   otherwise: .rodata (non-Mach-O) or .const_data (Mach-O)
 // align is passed straight to the .align directive.
 .macro const name, align=2, relocate=0
    .macro endconst
 ELF     .size   \name, . - \name
        .purgem endconst
    .endm
 .if HAVE_SECTION_DATA_REL_RO && \relocate
        .section        .data.rel.ro
 .else
 NONMACH .section        .rodata
 MACH    .const_data
 .endif
        .align          \align
 \name:
 .endm
 // movrel rd, val: load the address of symbol val into register rd.
 //   PIC:          load a pc-relative offset from an inline literal and add pc.
 //   HAVE_ARMV6T2: movw/movt pair with the absolute address.
 //   otherwise:    literal-pool load (ldr rd, =val).
 .macro movrel rd, val
 #if defined(PIC)
        ldr             \rd,  1f
        b               2f
 1:
@ FIXME: thumb
        // Offset of val from the value pc has when read by the add at 2:.
        // The +8 matches the ARM-state pc read-ahead, which is what the
        // FIXME above refers to for Thumb.
        .word           \val - (2f + 8)
 2:
        add             \rd,  \rd,  pc
 #elif HAVE_ARMV6T2
        movw            \rd, #:lower16:\val
        movt            \rd, #:upper16:\val
 #else
        ldr             \rd, =\val
 #endif
 .endm
 // movrelx rd, val, got: load the address of symbol val into rd, going through
 // an indirection table in PIC builds.
 //   PIC + ELF:   got is clobbered with the GOT address; rd is loaded from the
 //                GOT slot of val.
 //   PIC + Apple: rd is loaded through a non-lazy symbol pointer emitted
 //                inline; got is not used.
 //   otherwise:   plain movrel; got is not used.
 .macro movrelx rd, val, got
 #if defined(PIC) && defined(__ELF__)
        ldr             \got, 2f
        ldr             \rd,  1f
        b               3f
 1:
@ FIXME: thumb
        // GOT-relative offset of the slot holding the address of val
        .word \val(GOT)
 2:
        // pc-relative offset of the GOT itself (+8: ARM-state pc read-ahead)
        .word _GLOBAL_OFFSET_TABLE_ - (3f + 8)
 3:
        add             \got, \got, pc
        ldr             \rd, [\got, \rd]
 #elif defined(PIC) && defined(__APPLE__)
        ldr             \rd,  1f
        b               2f
 1:
@ FIXME: thumb
        .word           3f - (2f + 8)
 2:
        ldr             \rd, [pc, \rd]
        // The pointer slot itself lives in the non-lazy symbol pointer
        // section; switch back to .text afterwards.
        .non_lazy_symbol_pointer
 3:
        .indirect_symbol \val
        .word           0
        .text
 #else
        movrel          \rd, \val
 #endif
 .endm
 // movconst rd, val: load the constant val into rd.
 // With HAVE_ARMV6T2 this is a movw, followed by a movt only when val has bits
 // set above bit 15; otherwise it is a literal-pool load.
 .macro movconst rd, val
 #if HAVE_ARMV6T2
    movw        \rd, #:lower16:\val
 .if \val >> 16
    movt        \rd, #:upper16:\val
 .endif
 #else
    ldr         \rd, =\val
 #endif
 .endm
 // Row strides of the two pixel buffers.  The DCT code loads them into
 // registers and uses them as the post-increment of its row loads.
 #define FENC_STRIDE 16
 #define FDEC_STRIDE 32
 // HORIZ_ADD dest, a [, b]
 // If b is given, first a += b (u16 lanes).  Then a is pairwise widened in
 // place from u16 to u32 lanes, and pairwise widened again from u32 into dest.
 // a is clobbered.
 .macro HORIZ_ADD dest, a, b
 .ifnb \b
    vadd.u16    \a, \a, \b
 .endif
    vpaddl.u16  \a, \a
    vpaddl.u32  \dest, \a
 .endm
 // SUMSUB_AB sum, diff, a, b: butterfly on s16 lanes.
 //   sum = a + b, diff = a - b
 // sum is written before diff is computed, so sum must not alias a or b.
 .macro SUMSUB_AB sum, diff, a, b
    vadd.s16    \sum,  \a, \b
    vsub.s16    \diff, \a, \b
 .endm
 // SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d: two butterflies back to back.
 //   s1 = a + b, d1 = a - b, then s2 = c + d, d2 = c - d
 .macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
    SUMSUB_AB   \s1, \d1, \a, \b
    SUMSUB_AB   \s2, \d2, \c, \d
 .endm
 // ABS2 a b: replace both registers by their lane-wise s16 absolute value.
 .macro ABS2 a b
    vabs.s16 \a, \a
    vabs.s16 \b, \b
 .endm
 // dist = distance in elements (0 for vertical pass, 1/2 for horizontal passes)
 // op = sumsub/amax (sum and diff / maximum of absolutes)
 // d1/2 = destination registers
 // s1/2 = source registers
 //
 // Notes on this implementation:
 //  - s1/s2 are always interleaved in place first: vtrn.16 when dist == 1,
 //    vtrn.32 for every other value of dist (including 0).
 //  - any op other than "sumsub" takes the amax path, which overwrites s1/s2
 //    with their absolute values and writes only d1; d2 is left untouched.
 .macro HADAMARD dist, op, d1, d2, s1, s2
 .if \dist == 1
    vtrn.16     \s1, \s2
 .else
    vtrn.32     \s1, \s2
 .endif
 .ifc \op, sumsub
    SUMSUB_AB   \d1, \d2, \s1, \s2
 .else
    vabs.s16    \s1, \s1
    vabs.s16    \s2, \s2
    vmax.s16    \d1, \s1, \s2
 .endif
 .endm
 // TRANSPOSE8x8 r0..r7: in-place transpose built from three rounds of vtrn at
 // 32-, 16- and 8-bit granularity, pairing registers 4, 2 and 1 apart.
 // NOTE(review): this reads as an 8x8 byte transpose with one row per 64-bit
 // d register -- confirm against the call sites.
 .macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7
    vtrn.32         \r0, \r4
    vtrn.32         \r1, \r5
    vtrn.32         \r2, \r6
    vtrn.32         \r3, \r7
    vtrn.16         \r0, \r2
    vtrn.16         \r1, \r3
    vtrn.16         \r4, \r6
    vtrn.16         \r5, \r7
    vtrn.8          \r0, \r1
    vtrn.8          \r2, \r3
    vtrn.8          \r4, \r5
    vtrn.8          \r6, \r7
 .endm
 // TRANSPOSE4x4 r0..r3: in-place transpose of 4x4 groups of bytes, using vtrn
 // at 16-bit granularity (registers 2 apart) and then at 8-bit granularity
 // (adjacent registers).
 .macro TRANSPOSE4x4 r0 r1 r2 r3
    vtrn.16         \r0, \r2
    vtrn.16         \r1, \r3
    vtrn.8          \r0, \r1
    vtrn.8          \r2, \r3
 .endm
 // TRANSPOSE4x4_16 d0..d3: in-place transpose of 4x4 groups of 16-bit
 // elements, using vtrn at 32-bit granularity (registers 2 apart) and then at
 // 16-bit granularity (adjacent registers).  The DCT code invokes it with both
 // d and q registers; with q registers each 64-bit half is transposed
 // independently.
 .macro TRANSPOSE4x4_16  d0 d1 d2 d3
    vtrn.32     \d0, \d2
    vtrn.32     \d1, \d3
    vtrn.16     \d0, \d1
    vtrn.16     \d2, \d3
 .endm
--- a/common/arm/bitstream-a.S
+++ b/common/arm/bitstream-a.S
@@ -0,0 +1,84 @@
 /*****************************************************************************
 * bitstream-a.S: arm bitstream functions
 *****************************************************************************
 * Copyright (C) 2014-2025 x264 project
 *
 * Authors: Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "asm.S"
 // nal_escape_neon: copy bytes from src up to end into dst, inserting a 0x03
 // byte before every byte <= 3 that follows two zero bytes in the output.
 //   r0 = dst (returned advanced past the last byte written)
 //   r1 = src
 //   r2 = end
 // Register roles: q0 = previous 16 bytes (lanes 14/15 carry the last two
 // output bytes), q8 = constant 4, r3 = constant 3, lr = negative byte offset
 // or scratch, r5 = history of recent output bytes, r4/ip = scratch.
 function nal_escape_neon
    push        {r4-r5,lr}
    // History starts as 0xff so that nothing before src counts as a zero.
    vmov.u8     q0,  #0xff
    vmov.u8     q8,  #4
    mov         r3,  #3
    // lr = src - end; zero means there is nothing to copy.
    subs        lr,  r1,  r2
    beq         99f
 0:
    // At least 16 bytes left (lr < -15): take the vector path.  Otherwise
    // point r1 at end and let the scalar loop walk the tail through lr.
    cmn         lr,  #15
    blt         16f
    mov         r1,  r2
    b           100f
 16:
    vld1.8      {q1}, [r1]!
    // q2 / q3 = the byte two / one positions before each lane of q1
    vext.8      q2,  q0,  q1, #14
    vext.8      q3,  q0,  q1, #15
    // q9 = lanes where byte < 4 and both preceding bytes are zero
    vcgt.u8     q11, q8,  q1
    vceq.u8     q9,  q2,  #0
    vceq.u8     q10, q3,  #0
    vand        q9,  q9,  q11
    vand        q9,  q9,  q10
    // Narrow the mask to 64 bits and test it in core registers
    // (this overwrites lr).
    vshrn.u16   d22, q9,  #4
    vmov        ip,  lr,  d22
    orrs        ip,  ip,  lr
    // No lane needs escaping: plain 16-byte copy at the second label 16.
    beq         16f
    // Otherwise redo the 16 bytes just loaded one at a time
    // (r1 has already been advanced past them).
    mov         lr,  #-16
 100:
    // Seed the scalar history with the last two bytes kept in q0.
    vmov.u8     r5,  d1[6]
    vmov.u8     r4,  d1[7]
    orr         r5,  r4,  r5, lsl #8
 101:
    ldrb        r4,  [r1, lr]
    // ip = (two previous output bytes << 16) | current byte; a value <= 3
    // means 00 00 0x was found.
    orr         ip,  r4,  r5, lsl #16
    cmp         ip,  #3
    bhi         102f
    // Emit the escape byte and record it in the history.
    strb        r3,  [r0], #1
    orr         r5,  r3,  r5, lsl #8
 102:
    // The flags set by adds are consumed by the blt below; the strb and orr
    // in between do not modify them.
    adds        lr,  lr,  #1
    strb        r4,  [r0], #1
    orr         r5,  r4,  r5, lsl #8
    blt         101b
    // Write the last two output bytes back into q0 and continue while
    // src < end.
    subs        lr,  r1,  r2
    lsr         ip,  r5,  #8
    vmov.u8     d1[6],  ip
    vmov.u8     d1[7],  r5
    blt         0b
    pop         {r4-r5,pc}
 16:
    // Fast path: store the 16 bytes unchanged and make them the new history.
    subs        lr,  r1,  r2
    vst1.8      {q1}, [r0]!
    vmov        q0, q1
    blt         0b
 99:
    pop         {r4-r5,pc}
 endfunc
--- a/common/arm/bitstream.h
+++ b/common/arm/bitstream.h
@@ -0,0 +1,32 @@
 /*****************************************************************************
 * bitstream.h: arm bitstream functions
 *****************************************************************************
 * Copyright (C) 2017-2025 x264 project
 *
 * Authors: Anton Mitrofanov <BugMaster@narod.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_ARM_BITSTREAM_H
 #define X264_ARM_BITSTREAM_H
 // NEON NAL escaping routine (implemented in bitstream-a.S): copies the bytes
 // in [src, end) to dst, inserting escape bytes, and returns a pointer into dst.
 #define x264_nal_escape_neon x264_template(nal_escape_neon)
 uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end );
 #endif
--- a/common/arm/cpu-a.S
+++ b/common/arm/cpu-a.S
@@ -0,0 +1,108 @@
 /*****************************************************************************
 * cpu-a.S: arm cpu detection
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "asm.S"
 .align 2
 // done in gas because .fpu neon overrides the refusal to assemble
 // instructions the selected -march/-mcpu doesn't support
 //
 // Executes a single NEON instruction and returns.  q0 is doubled in place;
 // no core register is written.
 function cpu_neon_test
    vadd.i16    q0, q0, q0
    bx          lr
 endfunc
 // return: 0 on success
 //         1 if counters were already enabled
 //         9 if lo-res counters were already enabled
 //
 // Local helper (not exported).  Clobbers r0 and r2.
 function cpu_enable_armv7_counter, export=0
    mrc         p15, 0, r2, c9, c12, 0      // read PMNC
    // r0 = 0 if the enable bit was clear, otherwise PMNC bits 0 and 3
    ands        r0, r2, #1
    andne       r0, r2, #9
    orr         r2, r2, #1                  // enable counters
    bic         r2, r2, #8                  // full resolution
    // PMNC is only rewritten when the counters were not already enabled
    mcreq       p15, 0, r2, c9, c12, 0      // write PMNC
    mov         r2, #1 << 31                // enable cycle counter
    mcr         p15, 0, r2, c9, c12, 1      // write CNTENS
    bx          lr
 endfunc
 // Clears the enable bit (bit 0) of PMNC.  Local helper (not exported);
 // clobbers r0.
 function cpu_disable_armv7_counter, export=0
    mrc         p15, 0, r0, c9, c12, 0      // read PMNC
    bic         r0, r0, #1                  // disable counters
    mcr         p15, 0, r0, c9, c12, 0      // write PMNC
    bx          lr
 endfunc
 // READ_TIME r: read coprocessor register p15, c9, c13, 0 into r.
 // NOTE(review): this is the counter enabled through CNTENS bit 31 above,
 // i.e. the cycle counter -- confirm against the ARMv7 PMU documentation.
 .macro READ_TIME r
    mrc         p15, 0, \r, c9, c13, 0
 .endm
 // return: 0 if neon -> arm transfers take more than 10 cycles
 //         nonzero otherwise
 //
 // Also returns 0 immediately when user-mode access to the performance
 // counters is not enabled.  Register roles inside the loop: r3 = sum of
 // accepted samples, ip = accepted samples still wanted, r6 = attempt budget,
 // r5 = inner iterations per sample, r1/r2 = start/end timestamps.
 function cpu_fast_neon_mrc_test
    // check for user access to performance counters
    mrc         p15, 0, r0, c9, c14, 0
    cmp         r0, #0
    bxeq        lr
    push        {r4-r6,lr}
    bl          cpu_enable_armv7_counter
    // Test bit 3 of the helper result (lo-res counters were already enabled).
    ands        r1, r0, #8
    mov         r3, #0
    mov         ip, #4
    mov         r6, #4
    // One inner iteration per sample in full resolution, 64 in lo-res mode.
    // NOTE(review): 64 presumably matches the lo-res counter prescaler -- confirm.
    moveq       r5, #1
    movne       r5, #64
 average_loop:
    mov         r4, r5
    READ_TIME   r1
 1:  subs        r4, r4, #1
    // Eight dependent NEON -> core transfers per inner iteration; the vmov
    // and add do not touch the flags set by the subs above.
 .rept 8
    vmov.u32    lr, d0[0]
    add         lr, lr, lr
 .endr
    bgt         1b
    READ_TIME   r2
    subs        r6, r6, #1
    sub         r2, r2, r1
    cmpgt       r2, #30 << 3    // assume context switch if it took over 30 cycles
    // Accept the sample (add it to r3, count it in ip) unless the compare
    // above rejected it.
    addle       r3, r3, r2
    subsle      ip, ip, #1
    bgt         average_loop
    // disable counters if we enabled them
    ands        r0, r0, #1
    bleq        cpu_disable_armv7_counter
    // r0 = accumulated time / 32; report 0 when that exceeds 10.
    lsr         r0, r3, #5
    cmp         r0, #10
    movgt       r0, #0
    pop         {r4-r6,pc}
 endfunc
--- a/common/arm/dct-a.S
+++ b/common/arm/dct-a.S
@@ -0,0 +1,764 @@
 /****************************************************************************
 * dct-a.S: arm transform and zigzag
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Martin Storsjo <martin@martin.st>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "asm.S"
 // 32-entry byte table.  The entries come in consecutive even/odd pairs, so
 // each pair addresses one 16-bit element.
 // NOTE(review): presumably used as table-lookup indices by the 4x4 frame
 // zigzag scan, which is not visible in this part of the file -- confirm.
 const scan4x4_frame, align=4
 .byte    0,1,   8,9,   2,3,   4,5
 .byte    2,3,   8,9,  16,17, 10,11
 .byte   12,13,  6,7,  14,15, 20,21
 .byte   10,11, 12,13,  6,7,  14,15
 endconst
 .text
 // sum = a + (b>>shift)   sub = (a>>shift) - b
 // s16 lanes, arithmetic shift.  t0 and t1 are scratch registers; both shifts
 // are taken before sum or sub is written, so sum/sub may alias a/b.
 .macro SUMSUB_SHR shift sum sub a b t0 t1
    vshr.s16    \t0,  \b, #\shift
    vshr.s16    \t1,  \a, #\shift
    vadd.s16    \sum, \a, \t0
    vsub.s16    \sub, \t1, \b
 .endm
 // sum = (a>>shift) + b   sub = a - (b>>shift)
 // s16 lanes, arithmetic shift.  t0 and t1 are scratch registers; both shifts
 // are taken before sum or sub is written, so sum/sub may alias a/b.
 .macro SUMSUB_SHR2 shift sum sub a b t0 t1
    vshr.s16    \t0,  \a, #\shift
    vshr.s16    \t1,  \b, #\shift
    vadd.s16    \sum, \t0, \b
    vsub.s16    \sub, \a, \t1
 .endm
 // a += 1.5*ma   b -= 1.5*mb
 // 1.5*x is formed as x + (x>>1) on s16 lanes.  t0 and t1 are scratch.
 .macro SUMSUB_15 a b ma mb t0 t1
    vshr.s16    \t0, \ma, #1
    vshr.s16    \t1, \mb, #1
    vadd.s16    \t0, \t0, \ma
    vadd.s16    \t1, \t1, \mb
    vadd.s16    \a,  \a,  \t0
    vsub.s16    \b,  \b,  \t1
 .endm
 // dct4x4dc_neon: in-place transform of the sixteen 16-bit values at r0
 // (the address must be 128-bit aligned).  Two rounds of add/sub butterflies
 // are applied, interleaved by transposes, and the last round halves its
 // results with rounding.  Clobbers d0-d7, d16, d17 and d31.
 function dct4x4dc_neon
    vld1.64         {d0-d3}, [r0,:128]
    SUMSUB_ABCD     d4, d5, d6, d7, d0, d1, d2, d3
    SUMSUB_ABCD     d0, d2, d3, d1, d4, d6, d5, d7
    // d31 = 1 in every lane, used as the rounding term of the halving subs
    vmov.s16        d31, #1
    HADAMARD        1, sumsub, q2, q3, q0, q1
    vtrn.32         d4,  d5
    vadd.s16        d16, d4,  d31
    vtrn.32         d6,  d7
    vadd.s16        d17, d6,  d31
    // (a + b + 1) >> 1 via vrhadd, (a + 1 - b) >> 1 via vhsub on the
    // pre-incremented copy
    vrhadd.s16      d0,  d4,  d5
    vhsub.s16       d1,  d16, d5
    vhsub.s16       d2,  d17, d7
    vrhadd.s16      d3,  d6,  d7
    vst1.64         {d0-d3}, [r0,:128]
    bx              lr
 endfunc
 // idct4x4dc_neon: in-place transform of the sixteen 16-bit values at r0
 // (the address must be 128-bit aligned).  Same butterfly structure as
 // dct4x4dc_neon but without the final halving.  Clobbers d0-d7.
 function idct4x4dc_neon
    vld1.64         {d0-d3}, [r0,:128]
    SUMSUB_ABCD     d4, d5, d6, d7, d0, d1, d2, d3
    SUMSUB_ABCD     d0, d2, d3, d1, d4, d6, d5, d7
    HADAMARD        1, sumsub, q2, q3, q0, q1
    HADAMARD        2, sumsub, d0, d1, d4, d5
    HADAMARD        2, sumsub, d3, d2, d6, d7
    vst1.64         {d0-d3}, [r0,:128]
    bx              lr
 endfunc
 // DCT_1D out0 out1 out2 out3  in0 in1 in2 in3 (parameters d0-d3 / d4-d7)
 // One 4-point pass on s16 lanes.  With s03 = in0+in3, d03 = in0-in3,
 // s12 = in1+in2, d12 = in1-in2:
 //   out0 = s03 + s12       out1 = 2*d03 + d12
 //   out2 = s03 - s12       out3 = d03 - 2*d12
 // All four input registers are overwritten with intermediates.
 .macro DCT_1D d0 d1 d2 d3  d4 d5 d6 d7
    SUMSUB_AB       \d1, \d6, \d5, \d6
    SUMSUB_AB       \d3, \d7, \d4, \d7
    vadd.s16        \d0, \d3, \d1
    vadd.s16        \d4, \d7, \d7
    vadd.s16        \d5, \d6, \d6
    vsub.s16        \d2, \d3, \d1
    vadd.s16        \d1, \d4, \d6
    vsub.s16        \d3, \d7, \d5
 .endm
 // sub4x4_dct_neon: 4x4 transform of the difference of two pixel blocks.
 //   r0 = output, sixteen 16-bit coefficients (128-bit aligned)
 //   r1 = first block, 4 bytes per row, rows FENC_STRIDE apart
 //   r2 = second block, 4 bytes per row, rows FDEC_STRIDE apart
 // The difference is first - second, widened from u8 to 16 bits.
 // r1 and r2 are advanced by four rows; r3 and ip are clobbered.
 function sub4x4_dct_neon
    mov             r3, #FENC_STRIDE
    mov             ip, #FDEC_STRIDE
    // Loads are interleaved with the widening subtractions.
    vld1.32         {d0[]}, [r1,:32], r3
    vld1.32         {d1[]}, [r2,:32], ip
    vld1.32         {d2[]}, [r1,:32], r3
    vsubl.u8        q8,  d0,  d1
    vld1.32         {d3[]}, [r2,:32], ip
    vld1.32         {d4[]}, [r1,:32], r3
    vsubl.u8        q9,  d2,  d3
    vld1.32         {d5[]}, [r2,:32], ip
    vld1.32         {d6[]}, [r1,:32], r3
    vsubl.u8        q10, d4,  d5
    vld1.32         {d7[]}, [r2,:32], ip
    vsubl.u8        q11, d6,  d7
    // First pass, transpose, second pass.
    DCT_1D          d0, d1, d2, d3, d16, d18, d20, d22
    TRANSPOSE4x4_16 d0, d1, d2, d3
    DCT_1D          d4, d5, d6, d7, d0, d1, d2, d3
    vst1.64         {d4-d7}, [r0,:128]
    bx              lr
 endfunc
 // sub8x4_dct_neon (local helper): transform an 8-wide, 4-row difference
 // block as two side-by-side 4x4 blocks.
 //   r0 = output; 64 bytes are written and r0 is advanced past them
 //   r1 / r2 = the two pixel blocks (64-bit aligned rows), advanced by 4 rows
 //   r3 / ip = row strides for r1 / r2; the caller must set them
 // The left 4x4 block is stored first, then the right one.
 function sub8x4_dct_neon, export=0
    vld1.64         {d0}, [r1,:64], r3
    vld1.64         {d1}, [r2,:64], ip
    vsubl.u8        q8,  d0,  d1
    vld1.64         {d2}, [r1,:64], r3
    vld1.64         {d3}, [r2,:64], ip
    vsubl.u8        q9,  d2,  d3
    vld1.64         {d4}, [r1,:64], r3
    vld1.64         {d5}, [r2,:64], ip
    vsubl.u8        q10, d4,  d5
    vld1.64         {d6}, [r1,:64], r3
    vld1.64         {d7}, [r2,:64], ip
    vsubl.u8        q11, d6,  d7
    // First pass on all eight columns, then transpose each 4x4 half.
    DCT_1D          q0, q1, q2, q3,  q8, q9, q10, q11
    TRANSPOSE4x4_16 q0, q1, q2, q3
    // Second pass written out by hand: the butterflies run on full q
    // registers, the final adds/subs on d halves so that each 4x4 block
    // ends up in consecutive d registers for the stores.
    SUMSUB_AB       q8,  q12, q0,  q3
    SUMSUB_AB       q9,  q10, q1,  q2
    vadd.i16        q13, q12, q12
    vadd.i16        q11, q10, q10
    vadd.i16        d0,  d16, d18
    vadd.i16        d1,  d26, d20
    vsub.i16        d2,  d16, d18
    vsub.i16        d3,  d24, d22
    vst1.64         {d0-d1}, [r0,:128]!
    vadd.i16        d4,  d17, d19
    vadd.i16        d5,  d27, d21
    vst1.64         {d2-d3}, [r0,:128]!
    vsub.i16        d6,  d17, d19
    vsub.i16        d7,  d25, d23
    vst1.64         {d4-d5}, [r0,:128]!
    vst1.64         {d6-d7}, [r0,:128]!
    bx              lr
 endfunc
 // sub8x8_dct_neon: 8x8 difference block as two stacked 8x4 halves.
 //   r0 = output, r1 / r2 = the two pixel blocks (as for sub8x4_dct_neon)
 // The second half is reached with a tail branch, so sub8x4_dct_neon returns
 // directly to the caller of this function.
 function sub8x8_dct_neon
    push            {lr}
    mov             r3, #FENC_STRIDE
    mov             ip, #FDEC_STRIDE
    bl              sub8x4_dct_neon
    pop             {lr}
    b               sub8x4_dct_neon
 endfunc
 // sub16x16_dct_neon: 16x16 difference block as eight 8x4 calls.
 //   r0 = output, r1 / r2 = the two pixel blocks (as for sub8x4_dct_neon)
 // The 8x8 quadrants are visited in the order top-left, top-right,
 // bottom-left, bottom-right; the pointer adjustments between pairs of calls
 // move r1/r2 to the next quadrant.  The last call is a tail branch.
 function sub16x16_dct_neon
    push            {lr}
    mov             r3, #FENC_STRIDE
    mov             ip, #FDEC_STRIDE
    bl              sub8x4_dct_neon
    bl              sub8x4_dct_neon
    // back up 8 rows and step 8 columns right: top-right quadrant
    sub             r1, r1, #8*FENC_STRIDE-8
    sub             r2, r2, #8*FDEC_STRIDE-8
    bl              sub8x4_dct_neon
    bl              sub8x4_dct_neon
    // already 8 rows down; step 8 columns left: bottom-left quadrant
    sub             r1, r1, #8
    sub             r2, r2, #8
    bl              sub8x4_dct_neon
    bl              sub8x4_dct_neon
    // back up 8 rows and step 8 columns right: bottom-right quadrant
    sub             r1, r1, #8*FENC_STRIDE-8
    sub             r2, r2, #8*FDEC_STRIDE-8
    bl              sub8x4_dct_neon
    pop             {lr}
    b               sub8x4_dct_neon
 endfunc
 // DCT8_1D type: one 8-point pass on s16 lanes.
 // Inputs are the eight registers q8-q15, the results are left in q8-q15,
 // and q0-q3 are used as scratch.  The type argument is not referenced in the
 // macro body.
 .macro DCT8_1D type
    SUMSUB_AB       q2,  q1,  q11, q12  // s34/d34
    SUMSUB_AB       q3,  q11, q10, q13  // s25/d25
    SUMSUB_AB       q13, q10, q9,  q14  // s16/d16
    SUMSUB_AB       q14, q8,  q8,  q15  // s07/d07
    SUMSUB_AB       q9,  q2,  q14, q2   // a0/a2
    SUMSUB_AB       q12, q14, q13, q3   // a1/a3
    SUMSUB_AB       q3,  q13, q8,  q1   // a6/a5
    // subtract 1.5*d16 from a6 and 1.5*d25 from a5 (1.5*x = x + (x>>1))
    vshr.s16        q0,  q10, #1
    vshr.s16        q15, q11, #1
    vadd.s16        q0,  q0,  q10
    vadd.s16        q15, q15, q11
    vsub.s16        q3,  q3,  q0
    vsub.s16        q13, q13, q15
    SUMSUB_AB       q0,  q15, q10, q11  // a4/a7
    // add 1.5*d07 to a4 and 1.5*d34 to a7
    vshr.s16        q10, q8,  #1
    vshr.s16        q11, q1,  #1
    vadd.s16        q10, q10, q8
    vadd.s16        q11, q11, q1
    vadd.s16        q10, q0,  q10
    vadd.s16        q15, q15, q11
    // final butterflies producing the eight outputs
    SUMSUB_AB       q8,  q12, q9,  q12
    SUMSUB_SHR      2, q9,  q15, q10, q15,  q0, q1
    SUMSUB_SHR      1, q10, q14, q2,  q14,  q0, q1
    SUMSUB_SHR2     2, q11, q13, q3,  q13,  q0, q1
 .endm
 // void sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 )
 // r0 = dct, r1 = pix1 (stepped by FENC_STRIDE), r2 = pix2 (stepped by
 // FDEC_STRIDE). Computes the 8x8 transform of the residual pix1 - pix2.
 // On return r1/r2 have advanced eight rows and r0 by 128 bytes, which
 // sub16x16_dct8_neon relies on. Clobbers r3, ip, q0-q3 and q8-q15.
 function sub8x8_dct8_neon
    mov             r3, #FENC_STRIDE
    mov             ip, #FDEC_STRIDE
    // Load eight rows of eight pixels from each source and widen the
    // difference to signed 16 bit: row n ends up in q(8+n).
    vld1.64         {d16}, [r1,:64], r3
    vld1.64         {d17}, [r2,:64], ip
    vsubl.u8        q8,  d16, d17
    vld1.64         {d18}, [r1,:64], r3
    vld1.64         {d19}, [r2,:64], ip
    vsubl.u8        q9,  d18, d19
    vld1.64         {d20}, [r1,:64], r3
    vld1.64         {d21}, [r2,:64], ip
    vsubl.u8        q10, d20, d21
    vld1.64         {d22}, [r1,:64], r3
    vld1.64         {d23}, [r2,:64], ip
    vsubl.u8        q11, d22, d23
    vld1.64         {d24}, [r1,:64], r3
    vld1.64         {d25}, [r2,:64], ip
    vsubl.u8        q12, d24, d25
    vld1.64         {d26}, [r1,:64], r3
    vld1.64         {d27}, [r2,:64], ip
    vsubl.u8        q13, d26, d27
    vld1.64         {d28}, [r1,:64], r3
    vld1.64         {d29}, [r2,:64], ip
    vsubl.u8        q14, d28, d29
    vld1.64         {d30}, [r1,:64], r3
    vld1.64         {d31}, [r2,:64], ip
    vsubl.u8        q15, d30, d31
    DCT8_1D         row
    // Transpose the 8x8 matrix of 16-bit values held in q8-q15:
    // swap 64-bit halves, then 32-bit pairs, then 16-bit pairs.
    vswp            d17, d24    // 8, 12
    vswp            d21, d28    // 10,14
    vtrn.32         q8,  q10
    vtrn.32         q12, q14
    vswp            d19, d26    // 9, 13
    vswp            d23, d30    // 11,15
    vtrn.32         q9,  q11
    vtrn.32         q13, q15
    vtrn.16         q10, q11
    vtrn.16         q12, q13
    vtrn.16         q8,  q9
    vtrn.16         q14, q15
    DCT8_1D         col
    // Store all 64 coefficients (4 x 32 bytes), post-incrementing r0.
    vst1.64         {d16-d19}, [r0,:128]!
    vst1.64         {d20-d23}, [r0,:128]!
    vst1.64         {d24-d27}, [r0,:128]!
    vst1.64         {d28-d31}, [r0,:128]!
    bx              lr
 endfunc
 // void sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
 // r0 = dct, r1 = pix1, r2 = pix2. Four 8x8 transforms in the order
 // top-left, top-right, bottom-left, bottom-right. Each helper call leaves
 // r1/r2 eight rows further down and r0 at the next 64-coefficient block;
 // the helper loads the strides into r3/ip itself.
 function sub16x16_dct8_neon
    push            {lr}
    bl              X(sub8x8_dct8_neon)
    // Up eight rows, right eight pixels: top-right quadrant.
    sub             r1,  r1,  #FENC_STRIDE*8 - 8
    sub             r2,  r2,  #FDEC_STRIDE*8 - 8
    bl              X(sub8x8_dct8_neon)
    // Same row, left eight pixels: bottom-left quadrant.
    sub             r1,  r1,  #8
    sub             r2,  r2,  #8
    bl              X(sub8x8_dct8_neon)
    // Restore lr before the tail call so the helper returns to our caller.
    pop             {lr}
    // Up eight rows, right eight pixels: bottom-right quadrant.
    sub             r1,  r1,  #FENC_STRIDE*8 - 8
    sub             r2,  r2,  #FDEC_STRIDE*8 - 8
    b               X(sub8x8_dct8_neon)
 endfunc
 // First half of a 1-D 4-point inverse transform. The callers in this file
 // finish the pass with SUMSUB_AB on the four outputs.
 // Inputs d0-d3 (macro arguments 5-8), outputs d4-d7 (arguments 1-4):
 //   d4, d5 = SUMSUB_AB of d0 and d2 (helper macro defined outside this
 //            excerpt; presumably sum and difference)
 //   d7     = (d1 >> 1) - d3
 //   d6     = (d3 >> 1) + d1
 .macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
    SUMSUB_AB       \d4, \d5, \d0, \d2
    vshr.s16        \d7, \d1, #1
    vshr.s16        \d6, \d3, #1
    vsub.s16        \d7, \d7, \d3
    vadd.s16        \d6, \d6, \d1
 .endm
 // void add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] )
 // r0 = p_dst (row stride FDEC_STRIDE), r1 = dct.
 // Inverse-transforms the 16 coefficients, rounds with a shift by 6, adds the
 // result to the 4x4 destination block and stores it back with unsigned
 // saturation. The destination loads are interleaved with the arithmetic.
 function add4x4_idct_neon
    mov             r2, #FDEC_STRIDE
    vld1.64         {d0-d3}, [r1,:128]
    // First 1-D pass.
    IDCT_1D         d4, d5, d6, d7, d0, d1, d2, d3
    vld1.32         {d30[0]}, [r0,:32], r2
    SUMSUB_AB       q0, q1, q2, q3
    TRANSPOSE4x4_16 d0, d1, d3, d2
    // Second 1-D pass on the transposed data.
    IDCT_1D         d4, d5, d6, d7, d0, d1, d3, d2
    vld1.32         {d30[1]}, [r0,:32], r2
    SUMSUB_AB       q0, q1, q2, q3
    vrshr.s16       q0, q0, #6
    // Destination rows 2 and 3 go into d31[1] and d31[0] respectively; the
    // stores below use the same swapped lane order for d2.
    vld1.32         {d31[1]}, [r0,:32], r2
    vrshr.s16       q1, q1, #6
    vld1.32         {d31[0]}, [r0,:32], r2
    // Rewind r0 by the four rows just read.
    sub             r0, r0, r2, lsl #2
    vaddw.u8        q0, q0, d30
    vaddw.u8        q1, q1, d31
    // Narrow to 8 bit with unsigned saturation.
    vqmovun.s16     d0, q0
    vqmovun.s16     d2, q1
    vst1.32         {d0[0]}, [r0,:32], r2
    vst1.32         {d0[1]}, [r0,:32], r2
    vst1.32         {d2[1]}, [r0,:32], r2
    vst1.32         {d2[0]}, [r0,:32], r2
    bx              lr
 endfunc
 // Local helper (export=0): inverse transform of two 4x4 coefficient blocks
 // added to an 8-pixel-wide, 4-row destination area.
 // In:  r0 = destination, r1 = coefficients (64 bytes are consumed),
 //      r2 = destination stride (the callers in this file set FDEC_STRIDE).
 // Out: r0 advanced by four rows, r1 advanced by 64 bytes.
 // Uses only r0-r2 and NEON registers in the code visible here, so callers
 // keep their return address in ip across the call.
 function add8x4_idct_neon, export=0
    vld1.64         {d0-d3}, [r1,:128]!
    IDCT_1D         d16, d18, d20, d22, d0, d1, d2, d3
    vld1.64         {d4-d7}, [r1,:128]!
    IDCT_1D         d17, d19, d21, d23, d4, d5, d6, d7
    // Finish the first pass for both blocks at once on full q registers.
    SUMSUB_AB       q0,  q3,  q8,  q10
    SUMSUB_AB       q1,  q2,  q9,  q11
    TRANSPOSE4x4_16 q0,  q1,  q2,  q3
    // Second pass.
    IDCT_1D         q8,  q9,  q10, q11, q0, q1, q2, q3
    SUMSUB_AB       q0,  q3,  q8,  q10
    SUMSUB_AB       q1,  q2,  q9,  q11
    // Round (shift by 6) while loading the four destination rows.
    vrshr.s16       q0,  q0,  #6
    vld1.32         {d28}, [r0,:64], r2
    vrshr.s16       q1,  q1,  #6
    vld1.32         {d29}, [r0,:64], r2
    vrshr.s16       q2,  q2,  #6
    vld1.32         {d30}, [r0,:64], r2
    vrshr.s16       q3,  q3,  #6
    vld1.32         {d31}, [r0,:64], r2
    // Rewind r0 by the four rows just read.
    sub             r0,  r0,  r2,  lsl #2
    vaddw.u8        q0,  q0,  d28
    vaddw.u8        q1,  q1,  d29
    vaddw.u8        q2,  q2,  d30
    vaddw.u8        q3,  q3,  d31
    // Narrow with unsigned saturation and store, leaving r0 four rows down.
    vqmovun.s16     d0,  q0
    vqmovun.s16     d1,  q1
    vst1.32         {d0}, [r0,:64], r2
    vqmovun.s16     d2,  q2
    vst1.32         {d1}, [r0,:64], r2
    vqmovun.s16     d3,  q3
    vst1.32         {d2}, [r0,:64], r2
    vst1.32         {d3}, [r0,:64], r2
    bx              lr
 endfunc
 // void add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] )
 // r0 = p_dst, r1 = dct. Two calls to the 8x4 helper: the first handles
 // rows 0-3, the second (a tail call) rows 4-7.
 function add8x8_idct_neon
    mov             r2, #FDEC_STRIDE
    // Keep the return address in ip instead of on the stack.
    mov             ip, lr
    bl              add8x4_idct_neon
    mov             lr, ip
    b               add8x4_idct_neon
 endfunc
 // void add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] )
 // r0 = p_dst, r1 = dct. Eight calls to the 8x4 helper, two per 8x8 quadrant,
 // in the order top-left, top-right, bottom-left, bottom-right. Each helper
 // call moves r0 down four rows and r1 forward 64 bytes, so only r0 needs
 // adjusting between quadrants.
 function add16x16_idct_neon
    mov             r2, #FDEC_STRIDE
    // Keep the return address in ip instead of on the stack.
    mov             ip, lr
    bl              add8x4_idct_neon
    bl              add8x4_idct_neon
    // Up eight rows, right eight pixels.
    sub             r0, r0, #8*FDEC_STRIDE-8
    bl              add8x4_idct_neon
    bl              add8x4_idct_neon
    // Same row, left eight pixels.
    sub             r0, r0, #8
    bl              add8x4_idct_neon
    bl              add8x4_idct_neon
    // Up eight rows, right eight pixels.
    sub             r0, r0, #8*FDEC_STRIDE-8
    bl              add8x4_idct_neon
    mov             lr, ip
    // Tail call for the final 8x4 block.
    b               add8x4_idct_neon
 endfunc
 // One 1-D pass of the 8-point inverse transform over q8-q15, with results
 // left in q8-q15 and q0-q3 used as scratch. "type" selects extra work that
 // is interleaved with the arithmetic:
 //   row: q14/q15 (the last 32 bytes of coefficients) are loaded from r1
 //        part-way through, and the first vtrn.16 of the following transpose
 //        is issued near the end.
 //   col: three vswp instructions are issued early; together with the vswp in
 //        the caller they appear to complete the 8x8 transpose started after
 //        the row pass.
 // SUMSUB_AB / SUMSUB_SHR / SUMSUB_SHR2 / SUMSUB_15 are helper macros defined
 // outside this excerpt.
 .macro IDCT8_1D type
 .ifc \type, col
    vswp            d21, d28
 .endif
    SUMSUB_AB       q0,  q1,  q8,  q12              // a0/a2
 .ifc \type, row
    vld1.64         {d28-d31}, [r1,:128]!
 .else
    vswp            d19, d26
 .endif
    SUMSUB_SHR      1, q2,  q3,  q10, q14,  q8, q12    // a6/a4
 .ifc \type, col
    vswp            d23, d30
 .endif
    SUMSUB_AB       q8,  q10, q13, q11
    SUMSUB_15       q8,  q10, q9,  q15,  q12, q14   // a7/a1
    SUMSUB_AB       q14, q15, q15, q9
    SUMSUB_15       q15, q14, q13, q11,  q12, q9    // a5/a3
    SUMSUB_SHR      2, q13, q14, q14, q15,  q11, q9    // b3/b5
    SUMSUB_SHR2     2, q12, q15, q8,  q10,  q11, q9    // b1/b7
    SUMSUB_AB       q10, q2,  q0,  q2               // b0/b6
    SUMSUB_AB       q11, q3,  q1,  q3               // b2/b4
    // Final butterflies combining the even (b0/b2/b4/b6) and odd
    // (b1/b3/b5/b7) halves into the eight outputs.
    SUMSUB_AB       q8,  q15, q10, q15
    SUMSUB_AB       q9,  q14, q11, q14
    SUMSUB_AB       q10, q13, q3,  q13
 .ifc \type, row
    vtrn.16         q8,  q9
 .endif
    SUMSUB_AB       q11, q12, q2,  q12
 .endm
 // void add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] )
 // r0 = p_dst (row stride FDEC_STRIDE), r1 = dct.
 // 8x8 inverse transform, rounded with a shift by 6, added to the destination
 // block and stored back with unsigned saturation.
 // On return r0 has advanced eight rows and r1 by 128 bytes, which
 // add16x16_idct8_neon relies on.
 function add8x8_idct8_neon
    mov             r2,  #FDEC_STRIDE
    // Only the first 96 bytes are loaded here; the row pass of IDCT8_1D
    // fetches the remaining 32 bytes into q14/q15 itself.
    vld1.64         {d16-d19}, [r1,:128]!
    vld1.64         {d20-d23}, [r1,:128]!
    vld1.64         {d24-d27}, [r1,:128]!
    IDCT8_1D        row
    // 8x8 transpose of 16-bit values. The vtrn.16 of q8/q9 was already issued
    // inside the row pass and the remaining vswp steps are issued inside the
    // col pass.
    vtrn.16         q10, q11
    vtrn.16         q12, q13
    vtrn.16         q14, q15
    vtrn.32         q8,  q10
    vtrn.32         q9,  q11
    vtrn.32         q12, q14
    vtrn.32         q13, q15
    vswp            d17, d24
    IDCT8_1D        col
    // Load the eight destination rows into d0-d7 while rounding the results.
    vld1.64         {d0}, [r0,:64], r2
    vrshr.s16       q8,  q8,  #6
    vld1.64         {d1}, [r0,:64], r2
    vrshr.s16       q9,  q9,  #6
    vld1.64         {d2}, [r0,:64], r2
    vrshr.s16       q10, q10, #6
    vld1.64         {d3}, [r0,:64], r2
    vrshr.s16       q11, q11, #6
    vld1.64         {d4}, [r0,:64], r2
    vrshr.s16       q12, q12, #6
    vld1.64         {d5}, [r0,:64], r2
    vrshr.s16       q13, q13, #6
    vld1.64         {d6}, [r0,:64], r2
    vrshr.s16       q14, q14, #6
    vld1.64         {d7}, [r0,:64], r2
    vrshr.s16       q15, q15, #6
    // Rewind r0 by the eight rows just read.
    sub             r0,  r0,  r2,  lsl #3
    // Add, narrow with unsigned saturation and store, row by row; each dN is
    // reused for the narrowed output of the row it held as input.
    vaddw.u8        q8,  q8,  d0
    vaddw.u8        q9,  q9,  d1
    vaddw.u8        q10, q10, d2
    vqmovun.s16     d0,  q8
    vqmovun.s16     d1,  q9
    vqmovun.s16     d2,  q10
    vaddw.u8        q11, q11, d3
    vst1.64         {d0}, [r0,:64], r2
    vaddw.u8        q12, q12, d4
    vst1.64         {d1}, [r0,:64], r2
    vaddw.u8        q13, q13, d5
    vst1.64         {d2}, [r0,:64], r2
    vqmovun.s16     d3,  q11
    vqmovun.s16     d4,  q12
    vaddw.u8        q14, q14, d6
    vaddw.u8        q15, q15, d7
    vst1.64         {d3}, [r0,:64], r2
    vqmovun.s16     d5,  q13
    vst1.64         {d4}, [r0,:64], r2
    vqmovun.s16     d6,  q14
    vqmovun.s16     d7,  q15
    vst1.64         {d5}, [r0,:64], r2
    vst1.64         {d6}, [r0,:64], r2
    vst1.64         {d7}, [r0,:64], r2
    bx              lr
 endfunc
 // void add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] )
 // r0 = p_dst, r1 = dct. Four 8x8 inverse transforms in the order top-left,
 // top-right, bottom-left, bottom-right. Each helper call leaves r0 eight
 // rows down and r1 at the next 64-coefficient block; the helper sets the
 // stride in r2 itself.
 function add16x16_idct8_neon
    // Keep the return address in ip; the helper as written does not use ip.
    mov             ip,  lr
    bl              X(add8x8_idct8_neon)
    // Up eight rows, right eight pixels.
    sub             r0,  r0,  #8*FDEC_STRIDE-8
    bl              X(add8x8_idct8_neon)
    // Same row, left eight pixels.
    sub             r0,  r0,  #8
    bl              X(add8x8_idct8_neon)
    // Up eight rows, right eight pixels.
    sub             r0,  r0,  #8*FDEC_STRIDE-8
    mov             lr,  ip
    // Tail call for the last quadrant.
    b               X(add8x8_idct8_neon)
 endfunc
 // void add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] )
 // r0 = p_dst (row stride FDEC_STRIDE), r1 = four DC values.
 // Each DC is rounded with a shift by 6 and added to one 4x4 quadrant of the
 // 8x8 destination: dc[0]/dc[1] cover the left/right halves of rows 0-3,
 // dc[2]/dc[3] the left/right halves of rows 4-7.
 // The signed add is done on 8-bit data as a saturating add of the positive
 // part followed by a saturating subtract of the negated part.
 function add8x8_idct_dc_neon
    mov             r2,  #FDEC_STRIDE
    vld1.64         {d16}, [r1,:64]
    vrshr.s16       d16, d16, #6
    vld1.64         {d0}, [r0,:64], r2
    // q15 = 0, used below to negate the DC values.
    vmov.i16        q15, #0
    vld1.64         {d1}, [r0,:64], r2
    vld1.64         {d2}, [r0,:64], r2
    // Broadcast each DC to four 16-bit lanes: q10 = dc0,dc1  q11 = dc2,dc3.
    vdup.16         d20, d16[0]
    vld1.64         {d3}, [r0,:64], r2
    vdup.16         d21, d16[1]
    vld1.64         {d4}, [r0,:64], r2
    vdup.16         d22, d16[2]
    vld1.64         {d5}, [r0,:64], r2
    vdup.16         d23, d16[3]
    vld1.64         {d6}, [r0,:64], r2
    vsub.s16        q12, q15, q10
    vld1.64         {d7}, [r0,:64], r2
    vsub.s16        q13, q15, q11
    // Rewind r0 by the eight rows just read.
    sub             r0,  r0,  #8*FDEC_STRIDE
    // Unsigned-saturating narrow: d20/d22 keep max(dc, 0), d24/d26 keep
    // max(-dc, 0), each as one 8-byte row pattern.
    vqmovun.s16     d20, q10
    vqmovun.s16     d22, q11
    vqmovun.s16     d24, q12
    vqmovun.s16     d26, q13
    // Duplicate each row pattern into both halves of its q register so one
    // q-wide operation covers two destination rows.
    vmov            d21, d20
    vqadd.u8        q0,  q0,  q10
    vmov            d23, d22
    vqadd.u8        q1,  q1,  q10
    vmov            d25, d24
    vqadd.u8        q2,  q2,  q11
    vmov            d27, d26
    vqadd.u8        q3,  q3,  q11
    vqsub.u8        q0,  q0,  q12
    vqsub.u8        q1,  q1,  q12
    vqsub.u8        q2,  q2,  q13
    vst1.64         {d0}, [r0,:64], r2
    vqsub.u8        q3,  q3,  q13
    vst1.64         {d1}, [r0,:64], r2
    vst1.64         {d2}, [r0,:64], r2
    vst1.64         {d3}, [r0,:64], r2
    vst1.64         {d4}, [r0,:64], r2
    vst1.64         {d5}, [r0,:64], r2
    vst1.64         {d6}, [r0,:64], r2
    vst1.64         {d7}, [r0,:64], r2
    bx              lr
 endfunc
 // Adds four DC values to a 16-pixel-wide, 4-row strip.
 // dc:  d register holding four already-rounded 16-bit DC values, one per
 //      4-pixel-wide column group.
 // In:  r0 = read pointer, r2 = write pointer, r3 = row stride, q15 = 0.
 // Out: r0 and r2 each advanced by four rows.
 // Clobbers q2, q3, q8-q13.
 .macro ADD16x4_IDCT_DC dc
    vld1.64         {d16-d17}, [r0,:128], r3
    vld1.64         {d18-d19}, [r0,:128], r3
    // Broadcast each DC to four 16-bit lanes: q2 = dc0,dc1  q3 = dc2,dc3.
    vdup.16         d4,  \dc[0]
    vdup.16         d5,  \dc[1]
    vld1.64         {d20-d21}, [r0,:128], r3
    vdup.16         d6,  \dc[2]
    vdup.16         d7,  \dc[3]
    vld1.64         {d22-d23}, [r0,:128], r3
    // q12/q13 = negated DC values (q15 holds zero).
    vsub.s16        q12, q15, q2
    vsub.s16        q13, q15, q3
    // Unsigned-saturating narrow: q2 becomes the 16-byte row of max(dc, 0),
    // q3 the 16-byte row of max(-dc, 0).
    vqmovun.s16     d4,  q2
    vqmovun.s16     d5,  q3
    vqmovun.s16     d6,  q12
    vqmovun.s16     d7,  q13
    // Saturating add of the positive part, then subtract of the negative.
    vqadd.u8        q8,  q8,  q2
    vqadd.u8        q9,  q9,  q2
    vqadd.u8        q10, q10, q2
    vqadd.u8        q11, q11, q2
    vqsub.u8        q8,  q8,  q3
    vqsub.u8        q9,  q9,  q3
    vqsub.u8        q10, q10, q3
    vst1.64         {d16-d17}, [r2,:128], r3
    vqsub.u8        q11, q11, q3
    vst1.64         {d18-d19}, [r2,:128], r3
    vst1.64         {d20-d21}, [r2,:128], r3
    vst1.64         {d22-d23}, [r2,:128], r3
 .endm
 // void add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] )
 // r0 = p_dst, r1 = sixteen DC values. Each DC is rounded with a shift by 6
 // and added, with saturation, to one 4x4 block of the 16x16 destination.
 // Four DCs (one d register) are consumed per 4-row strip.
 function add16x16_idct_dc_neon
    // Separate write pointer: the macro reads through r0 and writes through r2.
    mov             r2,  r0
    mov             r3,  #FDEC_STRIDE
    // The macro expects q15 = 0.
    vmov.i16        q15, #0
    vld1.64         {d0-d3}, [r1,:64]
    vrshr.s16       q0, #6
    vrshr.s16       q1, #6
    ADD16x4_IDCT_DC d0
    ADD16x4_IDCT_DC d1
    ADD16x4_IDCT_DC d2
    ADD16x4_IDCT_DC d3
    bx              lr
 endfunc
 // void sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
 // r0 = dct, r1 = pix1 (stepped by FENC_STRIDE), r2 = pix2 (stepped by
 // FDEC_STRIDE). Writes four values derived from the sums of the residual
 // pix1 - pix2 over the four 4x4 quadrants. With TL, TR, BL, BR denoting
 // those quadrant sums, the stores are:
 //   dct[0] = TL + TR + BL + BR      dct[1] = TL + TR - BL - BR
 //   dct[2] = TL - TR + BL - BR      dct[3] = TL - TR - BL + BR
 function sub8x8_dct_dc_neon
    mov             r3,  #FENC_STRIDE
    mov             ip,  #FDEC_STRIDE
    // Rows 0-3: widen the differences and accumulate column sums in q0.
    vld1.64         {d16}, [r1,:64], r3
    vld1.64         {d17}, [r2,:64], ip
    vsubl.u8        q8,  d16, d17
    vld1.64         {d18}, [r1,:64], r3
    vld1.64         {d19}, [r2,:64], ip
    vsubl.u8        q9,  d18, d19
    vld1.64         {d20}, [r1,:64], r3
    vld1.64         {d21}, [r2,:64], ip
    vsubl.u8        q10, d20, d21
    vld1.64         {d22}, [r1,:64], r3
    vadd.s16        q0,  q8,  q9
    vld1.64         {d23}, [r2,:64], ip
    vsubl.u8        q11, d22, d23
    vld1.64         {d24}, [r1,:64], r3
    vadd.s16        q0,  q0,  q10
    vld1.64         {d25}, [r2,:64], ip
    vsubl.u8        q12, d24, d25
    vld1.64         {d26}, [r1,:64], r3
    vadd.s16        q0,  q0,  q11
    vld1.64         {d27}, [r2,:64], ip
    // Rows 4-7: same, accumulating column sums in q1.
    vsubl.u8        q13, d26, d27
    vld1.64         {d28}, [r1,:64], r3
    vld1.64         {d29}, [r2,:64], ip
    vsubl.u8        q14, d28, d29
    vld1.64         {d30}, [r1,:64], r3
    vadd.s16        q1,  q12, q13
    vld1.64         {d31}, [r2,:64], ip
    vsubl.u8        q15, d30, d31
    vadd.s16        q1,  q1,  q14
    // Horizontal butterfly: left half +/- right half of the column sums.
    vadd.s16        d4,  d0,  d1
    vadd.s16        q1,  q1,  q15
    vsub.s16        d5,  d0,  d1
    vadd.s16        d6,  d2,  d3
    vsub.s16        d7,  d2,  d3
    // Vertical butterfly: top +/- bottom.
    vadd.s16        q0,  q2,  q3
    vsub.s16        q1,  q2,  q3
    // Pairwise adds collapse each group of four lanes to a single value.
    vpadd.s16       d0,  d0,  d2
    vpadd.s16       d1,  d1,  d3
    vpadd.s16       d0,  d0,  d1
    vst1.64         {d0}, [r0,:64]
    bx              lr
 endfunc
 // void sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 )
 // r0 = dct, r1 = pix1 (stepped by FENC_STRIDE), r2 = pix2 (stepped by
 // FDEC_STRIDE). Processes an 8-wide, 16-row residual pix1 - pix2:
 // column sums of each group of four rows are accumulated in q0, q1, q2 and
 // q3, combined by the butterflies annotated below, reduced with pairwise
 // adds and stored as eight 16-bit values.
 function sub8x16_dct_dc_neon
    mov             r3,  #FENC_STRIDE
    mov             ip,  #FDEC_STRIDE
    // Rows 0-3 -> q0.
    vld1.64         {d16}, [r1,:64], r3
    vld1.64         {d17}, [r2,:64], ip
    vsubl.u8        q8,  d16, d17
    vld1.64         {d18}, [r1,:64], r3
    vld1.64         {d19}, [r2,:64], ip
    vsubl.u8        q9,  d18, d19
    vld1.64         {d20}, [r1,:64], r3
    vld1.64         {d21}, [r2,:64], ip
    vsubl.u8        q10, d20, d21
    vld1.64         {d22}, [r1,:64], r3
    vadd.s16        q0,  q8,  q9
    vld1.64         {d23}, [r2,:64], ip
    vsubl.u8        q11, d22, d23
    vld1.64         {d24}, [r1,:64], r3
    vadd.s16        q0,  q0,  q10
    vld1.64         {d25}, [r2,:64], ip
    // Rows 4-7 -> q1.
    vsubl.u8        q12, d24, d25
    vld1.64         {d26}, [r1,:64], r3
    vadd.s16        q0,  q0,  q11
    vld1.64         {d27}, [r2,:64], ip
    vsubl.u8        q13, d26, d27
    vld1.64         {d28}, [r1,:64], r3
    vld1.64         {d29}, [r2,:64], ip
    vsubl.u8        q14, d28, d29
    vld1.64         {d30}, [r1,:64], r3
    vadd.s16        q1,  q12, q13
    vld1.64         {d31}, [r2,:64], ip
    vsubl.u8        q15, d30, d31
    // Rows 8-11 -> q2. q8-q11 are reused for the second half of the block
    // while the q1 accumulation is still being finished from q14/q15.
    vld1.64         {d16}, [r1,:64], r3
    vadd.s16        q1,  q1,  q14
    vld1.64         {d17}, [r2,:64], ip
    vadd.s16        q1,  q1,  q15
    vld1.64         {d18}, [r1,:64], r3
    vsubl.u8        q8,  d16, d17
    vld1.64         {d19}, [r2,:64], ip
    vsubl.u8        q9,  d18, d19
    vld1.64         {d20}, [r1,:64], r3
    vld1.64         {d21}, [r2,:64], ip
    vsubl.u8        q10, d20, d21
    vld1.64         {d22}, [r1,:64], r3
    vadd.s16        q2,  q8,  q9
    vld1.64         {d23}, [r2,:64], ip
    vsubl.u8        q11, d22, d23
    vld1.64         {d24}, [r1,:64], r3
    vadd.s16        q2,  q2,  q10
    vld1.64         {d25}, [r2,:64], ip
    // Rows 12-15 -> q3.
    vsubl.u8        q12, d24, d25
    vld1.64         {d26}, [r1,:64], r3
    vadd.s16        q2,  q2,  q11
    vld1.64         {d27}, [r2,:64], ip
    vsubl.u8        q13, d26, d27
    vld1.64         {d28}, [r1,:64], r3
    vld1.64         {d29}, [r2,:64], ip
    vsubl.u8        q14, d28, d29
    vld1.64         {d30}, [r1,:64], r3
    vadd.s16        q3,  q12, q13
    vld1.64         {d31}, [r2,:64], ip
    vsubl.u8        q15, d30, d31
    vadd.s16        q3,  q3,  q14
    // Horizontal butterflies: left half +/- right half of each row group.
    vadd.s16        d16, d0,  d1  @ b0
    vadd.s16        q3,  q3,  q15
    vsub.s16        d17, d0,  d1  @ b4
    vadd.s16        d18, d2,  d3  @ b1
    vsub.s16        d19, d2,  d3  @ b5
    vadd.s16        d20, d4,  d5  @ b2
    vsub.s16        d21, d4,  d5  @ b6
    vadd.s16        d22, d6,  d7  @ b3
    vsub.s16        d23, d6,  d7  @ b7
    // Vertical butterflies across the four row groups.
    vadd.s16        q0,  q8,  q9  @ b0 + b1, b4 + b5; a0, a2
    vsub.s16        q1,  q8,  q9  @ b0 - b1, b4 - b5; a4, a6
    vadd.s16        q2,  q10, q11 @ b2 + b3, b6 + b7; a1, a3
    vsub.s16        q3,  q10, q11 @ b2 - b3, b6 - b7; a5, a7
    vadd.s16        q8,  q0,  q2  @ a0 + a1, a2 + a3
    vsub.s16        q9,  q0,  q2  @ a0 - a1, a2 - a3
    vsub.s16        q10, q1,  q3  @ a4 - a5, a6 - a7
    vadd.s16        q11, q1,  q3  @ a4 + a5, a6 + a7
    // Pairwise adds collapse each group of four lanes to a single value.
    vpadd.s16       d0,  d16, d17
    vpadd.s16       d1,  d18, d19
    vpadd.s16       d2,  d20, d21
    vpadd.s16       d3,  d22, d23
    vpadd.s16       d0,  d0,  d1
    vpadd.s16       d1,  d2,  d3
    vst1.64         {q0}, [r0,:64]
    bx              lr
 endfunc
 // void zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] )
 // r0 = level (output), r1 = dct (input).
 // Reorders the sixteen 16-bit coefficients with four byte-wise table
 // lookups. scan4x4_frame is a 32-byte index table defined outside this
 // excerpt; each 8-byte row of it indexes into the register window given to
 // the matching vtbl, so its indices are relative to that window rather than
 // to the start of the block.
 function zigzag_scan_4x4_frame_neon
    movrel      r2, scan4x4_frame
    vld1.64     {d0-d3},   [r1,:128]
    vld1.64     {d16-d19}, [r2,:128]
    vtbl.8      d4, {d0-d1}, d16
    vtbl.8      d5, {d1-d3}, d17
    vtbl.8      d6, {d0-d2}, d18
    vtbl.8      d7, {d2-d3}, d19
    vst1.64     {d4-d7},   [r0,:128]
    bx          lr
 endfunc
--- a/common/arm/dct.h
+++ b/common/arm/dct.h
@@ -0,0 +1,70 @@
 /*****************************************************************************
 * dct.h: arm transform and zigzag
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_ARM_DCT_H
 #define X264_ARM_DCT_H
 /* Prototypes for the NEON transform routines implemented in dct-a.S.
  * Every public name is routed through x264_template() (defined elsewhere). */

 /* 4x4 DC transforms operating on a single 16-coefficient array
  * (presumably in place; the implementations are not part of this header). */
 #define x264_dct4x4dc_neon x264_template(dct4x4dc_neon)
 void x264_dct4x4dc_neon( int16_t d[16] );
 #define x264_idct4x4dc_neon x264_template(idct4x4dc_neon)
 void x264_idct4x4dc_neon( int16_t d[16] );

 /* Forward 4x4-based transforms of the residual pix1 - pix2. The NEON code
  * walks pix1 with FENC_STRIDE and pix2 with FDEC_STRIDE; one 16-coefficient
  * block is written per 4x4 sub-block. */
 #define x264_sub4x4_dct_neon x264_template(sub4x4_dct_neon)
 void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
 #define x264_sub8x8_dct_neon x264_template(sub8x8_dct_neon)
 void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
 #define x264_sub16x16_dct_neon x264_template(sub16x16_dct_neon)
 void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );

 /* Inverse 4x4-based transforms: the reconstructed residual is added to
  * p_dst (row stride FDEC_STRIDE) with unsigned saturation. */
 #define x264_add4x4_idct_neon x264_template(add4x4_idct_neon)
 void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
 #define x264_add8x8_idct_neon x264_template(add8x8_idct_neon)
 void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
 #define x264_add16x16_idct_neon x264_template(add16x16_idct_neon)
 void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );

 /* DC-only inverse: one rounded DC value is added to each 4x4 block. */
 #define x264_add8x8_idct_dc_neon x264_template(add8x8_idct_dc_neon)
 void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
 #define x264_add16x16_idct_dc_neon x264_template(add16x16_idct_dc_neon)
 void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );

 /* DC-only forward: outputs are derived from per-4x4-block sums of the
  * residual pix1 - pix2 (8x8 -> 4 values, 8x16 -> 8 values). */
 #define x264_sub8x8_dct_dc_neon x264_template(sub8x8_dct_dc_neon)
 void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
 #define x264_sub8x16_dct_dc_neon x264_template(sub8x16_dct_dc_neon)
 void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 );

 /* Forward and inverse 8x8 transforms; 64 coefficients per 8x8 block. */
 #define x264_sub8x8_dct8_neon x264_template(sub8x8_dct8_neon)
 void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
 #define x264_sub16x16_dct8_neon x264_template(sub16x16_dct8_neon)
 void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
 #define x264_add8x8_idct8_neon x264_template(add8x8_idct8_neon)
 void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
 #define x264_add16x16_idct8_neon x264_template(add16x16_idct8_neon)
 void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );

 /* Reorders the 16 coefficients of dct into level using a lookup table. */
 #define x264_zigzag_scan_4x4_frame_neon x264_template(zigzag_scan_4x4_frame_neon)
 void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
 #endif
--- a/common/arm/deblock-a.S
+++ b/common/arm/deblock-a.S
@@ -0,0 +1,795 @@
 /*****************************************************************************
 * deblock-a.S: arm deblocking
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: Mans Rullgard <mans@mansr.com>
 *          Martin Storsjo <martin@martin.st>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "asm.S"
 @ Common prologue of the non-intra luma deblocking functions.
 @ Loads the word at [sp] (the first stack argument, used here as a pointer;
 @ presumably the tc0 array -- confirm against the C prototype), reads the
 @ four bytes it points to and broadcasts them into both 32-bit lanes of d24.
 @ The four bytes are then ANDed together; when the result has its sign bit
 @ set, i.e. all four values are negative, the enclosing function returns
 @ immediately.
 @ Clobbers ip and the condition flags. Must run before anything is pushed,
 @ because it both reads [sp] and returns with a bare bx.
 .macro h264_loop_filter_start
    ldr             ip,  [sp]
    ldr             ip,  [ip]
    vdup.32         d24, ip
    @ Fold the four bytes into the top byte: b3 & b1, then & (b2 & b0).
    and             ip,  ip,  ip, lsl #16
    ands            ip,  ip,  ip, lsl #8
    @ Test N only. ANDS leaves V untouched and the caller's flags are
    @ undefined at function entry, so a signed "lt" test (N != V) could
    @ misfire if V happened to be set.
    bxmi            lr
 .endm
 @ Saves d8-d15 in a 16-byte aligned stack area.
 @ ip is set to (sp & 15) + 32; sp is lowered by that amount (making it
 @ 16-byte aligned) and then by another 32, for 64 aligned bytes in total.
 @ ip must stay intact until align_pop_regs, which uses it to undo the
 @ adjustment. On exit sp is 16-byte aligned.
 .macro align_push_regs
    and             ip,  sp,  #15
    add             ip,  ip,  #32
    sub             sp,  sp,  ip
    vst1.64         {d12-d15}, [sp,:128]
    sub             sp,  sp,  #32
    vst1.64         {d8-d11},  [sp,:128]
 .endm
 @ Counterpart of align_push_regs: reloads d8-d15 and restores sp.
 @ The first load steps sp up by 32 through writeback; the second adds ip,
 @ the value computed by align_push_regs, which lands sp back on its
 @ original value. Requires ip to be unchanged since align_push_regs.
 .macro align_pop_regs
    vld1.64         {d8-d11},  [sp,:128]!
    vld1.64         {d12-d15}, [sp,:128], ip
 .endm
 @ Non-intra luma edge filter for 16 pixels along the edge.
 @ In:  q10 = p2, q9 = p1, q8 = p0, q0 = q0, q1 = q1, q2 = q2 (one byte per
 @      pixel position), r2 = alpha, r3 = beta, d24 = the four clip bytes
 @      loaded by h264_loop_filter_start.
 @ Out: q4 = filtered p1, q8 = filtered p0, q0 = filtered q0,
 @      q5 = filtered q1 (these are the registers the callers store).
 @ Clobbers q2, q4-q7, q10-q12, q14, q15. q4-q7 are why the callers wrap
 @ this macro in align_push_regs / align_pop_regs.
 .macro h264_loop_filter_luma
    vdup.8          q11, r2         @ alpha
    @ The vmovl/vsli sequence interleaved below replicates each of the four
    @ clip bytes four times, so q12 ends up with one clip byte per pixel.
    vmovl.u8        q12, d24
    vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
    vmovl.u16       q12, d24
    vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
    vsli.16         q12, q12, #8
    vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
    vsli.32         q12, q12, #16
    vclt.u8         q6,  q6,  q11   @ < alpha
    vdup.8          q11, r3         @ beta
    @ q7 marks positions whose clip byte is negative; they are removed from
    @ the filter mask q6 by the vbic below.
    vclt.s8         q7,  q12, #0
    vclt.u8         q14, q14, q11   @ < beta
    vclt.u8         q15, q15, q11   @ < beta
    vbic            q6,  q6,  q7
    vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
    vand            q6,  q6,  q14
    vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
    vclt.u8         q4,  q4,  q11   @ < beta
    vand            q6,  q6,  q15
    vclt.u8         q5,  q5,  q11   @ < beta
    @ q4 / q5: positions where p1 / q1 are also modified.
    vand            q4,  q4,  q6
    vand            q5,  q5,  q6
    vand            q12, q12, q6
    vrhadd.u8       q14, q8,  q0
    @ q4/q5 are all-ones (-1) masks, so subtracting them raises the clip
    @ bound kept in q6 by one for each side that is also filtered.
    vsub.i8         q6,  q12, q4
    vqadd.u8        q7,  q9,  q12
    vhadd.u8        q10, q10, q14
    vsub.i8         q6,  q6,  q5
    vhadd.u8        q14, q2,  q14
    @ Candidate p1 / q1 values, clamped to within the clip bound of the
    @ original p1 / q1.
    vmin.u8         q7,  q7,  q10
    vqsub.u8        q11, q9,  q12
    vqadd.u8        q2,  q1,  q12
    vmax.u8         q7,  q7,  q11
    vqsub.u8        q11, q1,  q12
    vmin.u8         q14, q2,  q14
    @ Delta for p0/q0 in 16 bit: ((q0 - p0) * 4 + p1 - q1), rounded >> 3.
    vmovl.u8        q2,  d0
    vmax.u8         q14, q14, q11
    vmovl.u8        q10, d1
    vsubw.u8        q2,  q2,  d16
    vsubw.u8        q10, q10, d17
    vshl.i16        q2,  q2,  #2
    vshl.i16        q10, q10, #2
    vaddw.u8        q2,  q2,  d18
    vaddw.u8        q10, q10, d19
    vsubw.u8        q2,  q2,  d2
    vsubw.u8        q10, q10, d3
    vrshrn.i16      d4,  q2,  #3
    vrshrn.i16      d5,  q10, #3
    @ Select the new p1 / q1 where their masks are set, else keep originals.
    vbsl            q4,  q7,  q9
    vbsl            q5,  q14, q1
    @ Clamp the delta to [-bound, +bound] with the bound in q6.
    vneg.s8         q7,  q6
    vmovl.u8        q14, d16
    vmin.s8         q2,  q2,  q6
    vmovl.u8        q6,  d17
    vmax.s8         q2,  q2,  q7
    vmovl.u8        q11, d0
    vmovl.u8        q12, d1
    @ p0 + delta and q0 - delta, narrowed with unsigned saturation.
    vaddw.s8        q14, q14, d4
    vaddw.s8        q6,  q6,  d5
    vsubw.s8        q11, q11, d4
    vsubw.s8        q12, q12, d5
    vqmovun.s16     d16, q14
    vqmovun.s16     d17, q6
    vqmovun.s16     d0,  q11
    vqmovun.s16     d1,  q12
 .endm
 @ Non-intra luma filter across a horizontal edge, 16 pixels wide.
 @ r0 = first row below the edge (the q0 row), r1 = row stride, r2 = alpha,
 @ r3 = beta, [sp] = pointer read by h264_loop_filter_start. The exact C
 @ prototype is not part of this file -- confirm in the matching header.
 function deblock_v_luma_neon
    h264_loop_filter_start
    @ Rows q0, q1, q2 below the edge.
    vld1.64         {d0, d1},  [r0,:128], r1
    vld1.64         {d2, d3},  [r0,:128], r1
    vld1.64         {d4, d5},  [r0,:128], r1
    @ Step back six rows in total, to the p2 row.
    sub             r0,  r0,  r1, lsl #2
    sub             r0,  r0,  r1, lsl #1
    @ Rows p2, p1, p0 above the edge; r0 ends up back on the q0 row.
    vld1.64         {d20,d21}, [r0,:128], r1
    vld1.64         {d18,d19}, [r0,:128], r1
    vld1.64         {d16,d17}, [r0,:128], r1
    align_push_regs
    h264_loop_filter_luma
    @ Back to the p1 row, then write p1, p0, q0, q1.
    sub             r0,  r0,  r1, lsl #1
    vst1.64         {d8, d9},  [r0,:128], r1
    vst1.64         {d16,d17}, [r0,:128], r1
    vst1.64         {d0, d1},  [r0,:128], r1
    vst1.64         {d10,d11}, [r0,:128]
    align_pop_regs
    bx              lr
 endfunc
 @ Non-intra luma filter across a vertical edge, 16 rows tall.
 @ r0 = first pixel to the right of the edge in the first row, r1 = row
 @ stride, r2 = alpha, r3 = beta, [sp] = pointer read by
 @ h264_loop_filter_start. The exact C prototype is not part of this file --
 @ confirm in the matching header.
 function deblock_h_luma_neon
    h264_loop_filter_start
    @ Start four pixels left of the edge and load 8 bytes from each of the
    @ 16 rows: rows 0-7 into the low halves, rows 8-15 into the high halves.
    sub             r0,  r0,  #4
    vld1.64         {d6},  [r0], r1
    vld1.64         {d20}, [r0], r1
    vld1.64         {d18}, [r0], r1
    vld1.64         {d16}, [r0], r1
    vld1.64         {d0},  [r0], r1
    vld1.64         {d2},  [r0], r1
    vld1.64         {d4},  [r0], r1
    vld1.64         {d26}, [r0], r1
    vld1.64         {d7},  [r0], r1
    vld1.64         {d21}, [r0], r1
    vld1.64         {d19}, [r0], r1
    vld1.64         {d17}, [r0], r1
    vld1.64         {d1},  [r0], r1
    vld1.64         {d3},  [r0], r1
    vld1.64         {d5},  [r0], r1
    vld1.64         {d27}, [r0], r1
    @ Transpose so each q register holds one pixel column; q10/q9/q8 become
    @ p2/p1/p0 and q0/q1/q2 the columns right of the edge, as the filter
    @ macro expects. TRANSPOSE8x8 is defined outside this excerpt.
    TRANSPOSE8x8    q3, q10, q9, q8, q0, q1, q2, q13
    align_push_regs
    h264_loop_filter_luma
    @ Transpose the four modified columns (p1, p0, q0, q1) back to rows.
    TRANSPOSE4x4    q4, q8, q0, q5
    @ Rewind 16 rows and move to the p1 column (two pixels left of the edge).
    sub             r0,  r0,  r1, lsl #4
    add             r0,  r0,  #2
    @ Store four bytes per row for all 16 rows.
    vst1.32         {d8[0]},  [r0], r1
    vst1.32         {d16[0]}, [r0], r1
    vst1.32         {d0[0]},  [r0], r1
    vst1.32         {d10[0]}, [r0], r1
    vst1.32         {d8[1]},  [r0], r1
    vst1.32         {d16[1]}, [r0], r1
    vst1.32         {d0[1]},  [r0], r1
    vst1.32         {d10[1]}, [r0], r1
    vst1.32         {d9[0]},  [r0], r1
    vst1.32         {d17[0]}, [r0], r1
    vst1.32         {d1[0]},  [r0], r1
    vst1.32         {d11[0]}, [r0], r1
    vst1.32         {d9[1]},  [r0], r1
    vst1.32         {d17[1]}, [r0], r1
    vst1.32         {d1[1]},  [r0], r1
    vst1.32         {d11[1]}, [r0], r1
    align_pop_regs
    bx              lr
 endfunc
 @ Intra (strong) luma edge filter for 16 pixels along the edge.
 @ In:  q11 = p3, q10 = p2, q9 = p1, q8 = p0, q0 = q0, q1 = q1, q2 = q2,
 @      q3 = q3 (one byte per pixel position), r2 = alpha, r3 = beta.
 @ Out: q10, q9, q8 = filtered p2, p1, p0; q0, q1, q2 = filtered q0, q1, q2.
 @ Requirements on the expansion site:
 @   - a local label 9 must follow the stores; the macro branches there
 @     without touching any pixel when no position passes if_1;
 @   - sp must be 16-byte aligned (32 bytes are spilled with a :128 hint);
 @   - lr must already be saved, since it is used as a scratch register.
 @ Clobbers r2, lr, the flags, q4-q7 and q12-q15.
 .macro h264_loop_filter_luma_intra
    vdup.8          q14, r2         @ alpha
    vabd.u8         q4,  q8,  q0    @ abs(p0 - q0)
    vabd.u8         q5,  q9,  q8    @ abs(p1 - p0)
    vabd.u8         q6,  q1,  q0    @ abs(q1 - q0)
    vdup.8          q15, r3         @ beta
    vmov.u8         q13, #2
    vclt.u8         q7,  q4,  q14   @ < alpha
    vshr.u8         q14, q14, #2    @ alpha >> 2
    vclt.u8         q5,  q5,  q15   @ < beta
    vadd.u8         q14, q14, q13   @ (alpha >> 2) + 2
    vand            q7,  q7,  q5
    vclt.u8         q6,  q6,  q15   @ < beta
    vclt.u8         q13, q4,  q14   @ < (alpha >> 2) + 2 if_2
    vand            q12, q7,  q6    @ if_1
    @ Compress the if_1 mask to 64 bits and skip everything if it is zero.
    vshrn.u16       d28, q12,  #4
    vmov            r2,  lr,  d28
    orrs            r2,  r2,  lr
    beq             9f
    @ Spill if_1 / if_2; they are reloaded once for each side of the edge.
    sub             sp,  sp,  #32
    vst1.8          {q12-q13}, [sp,:128]
    vshll.u8        q4,  d18, #1    @ 2*p1
    vshll.u8        q5,  d19, #1
    vaddw.u8        q4,  q4,  d16   @ 2*p1 + p0
    vaddw.u8        q5,  q5,  d17
    vaddw.u8        q4,  q4,  d2    @ 2*p1 + p0 + q1
    vaddw.u8        q5,  q5,  d3
    vrshrn.u16      d24, q4,  #2
    vrshrn.u16      d25, q5,  #2
    vaddl.u8        q6,  d20, d16   @ p2 + p0
    vaddl.u8        q7,  d21, d17
    vaddw.u8        q6,  q6,  d0    @ p2 + p0 + q0
    vaddw.u8        q7,  q7,  d1
    vadd.u16        q4,  q4,  q6    @ p2 + 2*p1 + 2*p0 + q0 + q1
    vadd.u16        q5,  q5,  q7
    vaddw.u8        q4,  q4,  d0    @ p2 + 2*p1 + 2*p0 + 2*q0 + q1
    vaddw.u8        q5,  q5,  d1
    vrshrn.u16      d26, q4,  #3    @ p0'_2
    vrshrn.u16      d27, q5,  #3
    vaddw.u8        q6,  q6,  d18   @ p2 + p1 + p0 + q0
    vaddw.u8        q7,  q7,  d19
    vrshrn.u16      d28, q6,  #2    @ p1'_2
    vrshrn.u16      d29, q7,  #2
    vaddl.u8        q4,  d22, d20   @ p3 + p2
    vaddl.u8        q5,  d23, d21
    vshl.u16        q4,  q4,  #1    @ 2*p3 + 2*p2
    vshl.u16        q5,  q5,  #1
    vadd.u16        q4,  q4,  q6    @ 2*p3 + 3*p2 + p1 + p0 + q0
    vadd.u16        q5,  q5,  q7
    vrshrn.u16      d30, q4,  #3    @ p2'_2
    vrshrn.u16      d31, q5,  #3
    vdup.8          q4,  r3         @ beta
    vabd.u8         q5,  q10, q8    @ abs(p2 - p0)
    vld1.8          {q6-q7}, [sp,:128]   @ if_1, if_2
    vclt.u8         q5,  q5,  q4    @ < beta if_3
    vand            q7,  q7,  q5    @ if_2 && if_3
    vmvn            q4,  q7
    vand            q7,  q7,  q6    @ if_1 && if_2 && if_3
    vand            q6,  q4,  q6    @ if_1 && !(if_2 && if_3)
    @ Commit the new p2 where the strong condition holds.
    vbit            q10, q15, q7
    @ Keep the original p0 in q15: the q side below still needs it after
    @ q8 has been overwritten.
    vmov            q15, q8
    @ Weak p0 result where only if_1 holds. This used to be issued a second
    @ time a few instructions later with unchanged operands; VBIT with the
    @ same operands is idempotent, so the repeat was dropped.
    vbit            q8,  q12, q6
    vshll.u8        q4,  d2,  #1    @ 2*q1
    vshll.u8        q5,  d3,  #1
    vaddw.u8        q4,  q4,  d0    @ 2*q1 + q0
    vaddw.u8        q5,  q5,  d1
    vbit            q8,  q13, q7
    vaddw.u8        q4,  q4,  d18   @ 2*q1 + q0 + p1
    vaddw.u8        q5,  q5,  d19
    @ q9 (p1) may only be overwritten now that its last read is done.
    vbit            q9,  q14, q7
    vrshrn.u16      d24, q4,  #2
    vrshrn.u16      d25, q5,  #2
    vaddl.u8        q6,  d4,  d0    @ q2 + q0
    vaddl.u8        q7,  d5,  d1
    vaddw.u8        q6,  q6,  d30   @ q2 + q0 + p0
    vaddw.u8        q7,  q7,  d31
    vadd.u16        q4,  q4,  q6    @ q2 + 2*q1 + 2*q0 + p0 + p1
    vadd.u16        q5,  q5,  q7
    vaddw.u8        q4,  q4,  d30   @ q2 + 2*q1 + 2*q0 + 2*p0 + p1
    vaddw.u8        q5,  q5,  d31
    vrshrn.u16      d26, q4,  #3    @ q0'_2
    vrshrn.u16      d27, q5,  #3
    vaddw.u8        q6,  q6,  d2    @ q2 + q1 + q0 + p0
    vaddw.u8        q7,  q7,  d3
    vrshrn.u16      d28, q6,  #2    @ q1'_2
    vrshrn.u16      d29, q7,  #2
    vaddl.u8        q4,  d6,  d4    @ q3 + q2
    vaddl.u8        q5,  d7,  d5
    vshl.u16        q4,  q4,  #1    @ 2*q3 + 2*q2
    vshl.u16        q5,  q5,  #1
    vadd.u16        q4,  q4,  q6    @ 2*q3 + 3*q2 + q1 + q0 + p0
    vadd.u16        q5,  q5,  q7
    vrshrn.u16      d30, q4,  #3    @ q2'_2
    vrshrn.u16      d31, q5,  #3
    vdup.8          q4,  r3         @ beta
    vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
    @ Reload the masks and release the 32-byte spill area (writeback).
    vld1.8          {q6-q7}, [sp,:128]!   @ if_1, if_2
    vclt.u8         q5,  q5,  q4    @ < beta if_4
    vand            q7,  q7,  q5    @ if_2 && if_4
    vmvn            q4,  q7
    vand            q7,  q6,  q7    @ if_1 && if_2 && if_4
    vand            q6,  q6,  q4    @ if_1 && !(if_2 && if_4)
    vbit            q0,  q12, q6
    vbit            q1,  q14, q7
    vbit            q0,  q13, q7
    vbit            q2,  q15, q7
 .endm
 @ Intra luma deblocking of a horizontal edge, 16 pixels wide.
 @ Per the C prototype in arm/deblock.h the arguments are pix, stride, alpha,
 @ beta, i.e. r0 = pix, r1 = stride; r2/r3 are read by the filter macro, whose
 @ own comments label them alpha and beta.
 function deblock_v_luma_intra_neon
    push            {lr}
    @ four rows starting at pix -> q0..q3 (the filter macro's q0..q3 pixels)
    vld1.64         {d0, d1},  [r0,:128], r1
    vld1.64         {d2, d3},  [r0,:128], r1
    vld1.64         {d4, d5},  [r0,:128], r1
    vld1.64         {d6, d7},  [r0,:128], r1
    @ r0 has advanced 4 rows; step back 8 rows to pix - 4*stride
    sub             r0,  r0,  r1, lsl #3
    @ four rows above the edge -> q11, q10, q9, q8 in address order, so q8 is
    @ the row nearest the edge (the macro's p0) and q9 the next one (p1)
    vld1.64         {d22,d23}, [r0,:128], r1
    vld1.64         {d20,d21}, [r0,:128], r1
    vld1.64         {d18,d19}, [r0,:128], r1
    @ last load has no writeback: r0 stays at pix - stride
    vld1.64         {d16,d17}, [r0,:128]
    @ align_push_regs/align_pop_regs are defined elsewhere; presumably they
    @ save/restore the callee-saved NEON registers the macro uses - confirm
    align_push_regs
    h264_loop_filter_luma_intra
    @ back up two more rows to pix - 3*stride (assumes the macro leaves r0
    @ untouched), then write back three rows above and three rows below the edge
    sub             r0,  r0,  r1, lsl #1
    vst1.64         {d20,d21}, [r0,:128], r1
    vst1.64         {d18,d19}, [r0,:128], r1
    vst1.64         {d16,d17}, [r0,:128], r1
    vst1.64         {d0, d1},  [r0,:128], r1
    vst1.64         {d2, d3},  [r0,:128], r1
    vst1.64         {d4, d5},  [r0,:128]
    @ nothing in this function branches to 9; presumably it is the early-exit
    @ target of the filter macro, which skips the stores above - confirm
 9:
    align_pop_regs
    pop             {pc}
 endfunc
 @ Intra luma deblocking of a vertical edge, 16 rows tall.
 @ Per the C prototype in arm/deblock.h: r0 = pix, r1 = stride; r2/r3 are read
 @ by the filter macro (alpha and beta according to its comments).
 function deblock_h_luma_intra_neon
    push            {lr}
    @ start 4 bytes before pix so that each 8-byte row straddles the edge
    sub             r0,  r0,  #4
    @ rows 0-7 -> low halves of q11, q10, q9, q8, q0, q1, q2, q3
    vld1.64         {d22}, [r0], r1
    vld1.64         {d20}, [r0], r1
    vld1.64         {d18}, [r0], r1
    vld1.64         {d16}, [r0], r1
    vld1.64         {d0},  [r0], r1
    vld1.64         {d2},  [r0], r1
    vld1.64         {d4},  [r0], r1
    vld1.64         {d6},  [r0], r1
    @ rows 8-15 -> high halves of the same registers
    vld1.64         {d23}, [r0], r1
    vld1.64         {d21}, [r0], r1
    vld1.64         {d19}, [r0], r1
    vld1.64         {d17}, [r0], r1
    vld1.64         {d1},  [r0], r1
    vld1.64         {d3},  [r0], r1
    vld1.64         {d5},  [r0], r1
    vld1.64         {d7},  [r0], r1
    @ TRANSPOSE8x8 is defined elsewhere; presumably it leaves one pixel column
    @ (16 rows) per q register, in the layout the filter macro expects - confirm
    TRANSPOSE8x8    q11, q10, q9, q8, q0, q1, q2, q3
    align_push_regs
    h264_loop_filter_luma_intra
    @ back to row-major order for the stores
    TRANSPOSE8x8    q11, q10, q9, q8, q0, q1, q2, q3
    @ the 16 post-incremented loads moved r0 down 16 rows; rewind to row 0
    sub             r0,  r0,  r1, lsl #4
    @ write all 8 bytes of every row back, in the same order they were loaded
    vst1.64         {d22}, [r0], r1
    vst1.64         {d20}, [r0], r1
    vst1.64         {d18}, [r0], r1
    vst1.64         {d16}, [r0], r1
    vst1.64         {d0},  [r0], r1
    vst1.64         {d2},  [r0], r1
    vst1.64         {d4},  [r0], r1
    vst1.64         {d6},  [r0], r1
    vst1.64         {d23}, [r0], r1
    vst1.64         {d21}, [r0], r1
    vst1.64         {d19}, [r0], r1
    vst1.64         {d17}, [r0], r1
    vst1.64         {d1},  [r0], r1
    vst1.64         {d3},  [r0], r1
    vst1.64         {d5},  [r0], r1
    vst1.64         {d7},  [r0], r1
    @ nothing in this function branches to 9; presumably it is the early-exit
    @ target of the filter macro, which skips the transpose and stores - confirm
 9:
    align_pop_regs
    pop             {pc}
 endfunc
 // Chroma deblocking filter with per-group tc clipping, 16 lanes wide.
 // Inputs:  q9 = p1, q8 = p0, q0 = q0, q1 = q1 (see the abs() comments below),
 //          r2 = alpha, r3 = beta, low four bytes of d24 = tc values.
 // Outputs: q8 = filtered p0, q0 = filtered q0.
 // Clobbers q2, q3 and q10-q15.
 // For every lane: delta = clip( (4*(q0-p0) + p1 - q1 + 4) >> 3, -tc, tc ),
 // applied only where |p0-q0| < alpha, |p1-p0| < beta, |q1-q0| < beta and
 // tc >= 0; then p0 += delta, q0 -= delta with unsigned saturation.
 .macro h264_loop_filter_chroma
    vdup.8          q11, r2         // alpha
    // the vmovl/vsli pairs below replicate each of the 4 tc bytes in d24
    // four times, giving one tc value per lane in q12
    vmovl.u8        q12, d24
    vabd.u8         q13, q8,  q0    // abs(p0 - q0)
    vabd.u8         q14, q9,  q8    // abs(p1 - p0)
    vsubl.u8        q2,  d0,  d16   // q0 - p0 (16-bit)
    vsubl.u8        q3,  d1,  d17
    vsli.16         q12, q12, #8
    vshl.i16        q2,  q2,  #2    // 4*(q0 - p0)
    vshl.i16        q3,  q3,  #2
    vabd.u8         q15, q1,  q0    // abs(q1 - q0)
    vmovl.u8        q12, d24
    vaddw.u8        q2,  q2,  d18   // + p1
    vaddw.u8        q3,  q3,  d19
    vclt.u8         q13, q13, q11   // < alpha
    vsubw.u8        q2,  q2,  d2    // - q1
    vsubw.u8        q3,  q3,  d3
    vsli.16         q12, q12, #8
    vdup.8          q11, r3         // beta
    vclt.s8         q10, q12, #0    // lanes with tc < 0
    vrshrn.i16      d4,  q2,  #3    // rounded >> 3, narrowed to 8 bit
    vrshrn.i16      d5,  q3,  #3
    vclt.u8         q14, q14, q11   // < beta
    vbic            q13, q13, q10   // never filter lanes with tc < 0
    vclt.u8         q15, q15, q11   // < beta
    vand            q13, q13, q14
    vneg.s8         q10, q12        // -tc
    vand            q13, q13, q15   // final per-lane filter mask
    vmin.s8         q2,  q2,  q12   // delta = min(delta, tc)
    vmovl.u8        q14, d16
    // Clamp against -tc BEFORE masking. With the mask applied first, a lane
    // with tc < 0 (mask cleared, delta forced to 0) was raised to -tc > 0 by
    // the vmax and its pixels were modified even though it must be skipped.
    vmax.s8         q2,  q2,  q10   // delta = max(delta, -tc)
    vmovl.u8        q15, d17
    vand            q2,  q2,  q13   // delta = 0 where the lane is not filtered
    vmovl.u8        q11, d0
    vmovl.u8        q12, d1
    vaddw.s8        q14, q14, d4    // p0 + delta
    vaddw.s8        q15, q15, d5
    vsubw.s8        q11, q11, d4    // q0 - delta
    vsubw.s8        q12, q12, d5
    vqmovun.s16     d16, q14        // saturate back to unsigned 8 bit
    vqmovun.s16     d17, q15
    vqmovun.s16     d0,  q11
    vqmovun.s16     d1,  q12
 .endm
 @ Chroma deblocking of a horizontal edge, 16 bytes wide.
 @ Per the C prototype in arm/deblock.h: r0 = pix, r1 = stride, r2 = alpha,
 @ r3 = beta, tc0 pointer on the stack.
 function deblock_v_chroma_neon
    @ defined elsewhere; presumably validates alpha/beta/tc0 and leaves the
    @ tc0 bytes in d24, which the filter macro reads - confirm
    h264_loop_filter_start
    @ start two rows above the edge
    sub             r0,  r0,  r1, lsl #1
    @ rows p1, p0, q0, q1 -> q9, q8, q0, q1
    vld1.8          {d18,d19}, [r0,:128], r1
    vld1.8          {d16,d17}, [r0,:128], r1
    vld1.8          {d0, d1},  [r0,:128], r1
    vld1.8          {d2, d3},  [r0,:128]
    h264_loop_filter_chroma
    @ r0 is on the q1 row; go back two rows to p0 and store p0 and q0
    sub             r0,  r0,  r1, lsl #1
    vst1.8          {d16,d17}, [r0,:128], r1
    vst1.8          {d0, d1},  [r0,:128], r1
    bx              lr
 endfunc
 @ Chroma deblocking of a vertical edge, 8 rows tall.
 @ Per the C prototype in arm/deblock.h: r0 = pix, r1 = stride, r2 = alpha,
 @ r3 = beta, tc0 pointer on the stack.
 function deblock_h_chroma_neon
    @ defined elsewhere; presumably validates alpha/beta/tc0 and leaves the
    @ tc0 bytes in d24, which the filter macro reads - confirm
    h264_loop_filter_start
    @ start 4 bytes before pix so each 8-byte row straddles the edge
    sub             r0,  r0,  #4
    @ secondary entry point: deblock_h_chroma_422_neon branches here with r0,
    @ r1 and d24 already set up
 deblock_h_chroma:
    @ rows 0-3 -> low halves of q9, q8, q0, q1
    vld1.8          {d18}, [r0], r1
    vld1.8          {d16}, [r0], r1
    vld1.8          {d0},  [r0], r1
    vld1.8          {d2},  [r0], r1
    @ rows 4-7 -> high halves
    vld1.8          {d19}, [r0], r1
    vld1.8          {d17}, [r0], r1
    vld1.8          {d1},  [r0], r1
    vld1.8          {d3},  [r0], r1
    @ transpose in 16-bit units (macro defined elsewhere); presumably each
    @ unit is one interleaved chroma sample pair, giving p1/p0/q0/q1 in
    @ q9/q8/q0/q1 - confirm
    TRANSPOSE4x4_16 q9, q8, q0, q1
    h264_loop_filter_chroma
    @ only p0 (q8) and q0 (q0) changed; interleave their 16-bit units so each
    @ 32-bit lane holds the p0,q0 pair of one row
    vtrn.16         q8,  q0
    @ rewind the 8 rows and skip the two untouched p1 bytes
    sub             r0,  r0,  r1, lsl #3
    add             r0,  r0,  #2
    @ store the middle 4 bytes of each row
    vst1.32         {d16[0]}, [r0], r1
    vst1.32         {d0[0]},  [r0], r1
    vst1.32         {d16[1]}, [r0], r1
    vst1.32         {d0[1]},  [r0], r1
    vst1.32         {d17[0]}, [r0], r1
    vst1.32         {d1[0]},  [r0], r1
    vst1.32         {d17[1]}, [r0], r1
    vst1.32         {d1[1]},  [r0], r1
    bx              lr
 endfunc
 @ 4:2:2 variant of the vertical-edge chroma filter: 16 rows, handled as two
 @ passes of the 8-row deblock_h_chroma body with a doubled stride (even rows
 @ first, then odd rows).
 function deblock_h_chroma_422_neon
    @ defined elsewhere; presumably validates alpha/beta/tc0 and loads the
    @ tc0 bytes into d24 - confirm
    h264_loop_filter_start
    push            {lr}
    sub             r0,  r0,  #4    @ 4 bytes before pix, as deblock_h_chroma expects
    add             r1,  r1,  r1    @ stride *= 2: visit every other row
    bl              deblock_h_chroma
    @ the filter macro overwrote q12 (which contains d24), so fetch the tc0
    @ bytes again: after the push, the first stack argument is at [sp, #4]
    ldr             ip,  [sp, #4]
    ldr             ip,  [ip]
    vdup.32         d24, ip
    @ deblock_h_chroma returned with r0 = start + 2 + 8*(2*stride); move to
    @ start + stride, i.e. the first odd row
    sub             r0,  r0,  r1, lsl #3
    add             r0,  r0,  r1, lsr #1
    sub             r0,  r0,  #2
    pop             {lr}
    @ tail call: the second pass returns straight to our caller
    b               deblock_h_chroma
 endfunc
 @ Chroma deblocking filter with per-group tc clipping, 8 lanes wide
 @ (d-register version of h264_loop_filter_chroma).
 @ Inputs:  d18 = p1, d16 = p0, d0 = q0, d2 = q1 (see the abs() comments),
 @          r2 = alpha, r3 = beta, low four bytes of d24 = tc values.
 @ Outputs: d16 = filtered p0, d0 = filtered q0.
 @ Clobbers q2, d20, q11-q15.
 @ For every lane: delta = clip( (4*(q0-p0) + p1 - q1 + 4) >> 3, -tc, tc ),
 @ applied only where |p0-q0| < alpha, |p1-p0| < beta, |q1-q0| < beta and
 @ tc >= 0; then p0 += delta, q0 -= delta with unsigned saturation.
 .macro h264_loop_filter_chroma8
    vdup.8          d22, r2         @ alpha
    @ vmovl + vsli replicate each of the 4 tc bytes twice, giving one tc
    @ value per lane in d24
    vmovl.u8        q12, d24
    vabd.u8         d26, d16, d0    @ abs(p0 - q0)
    vabd.u8         d28, d18, d16   @ abs(p1 - p0)
    vsubl.u8        q2,  d0,  d16   @ q0 - p0 (16-bit)
    vsli.16         d24, d24, #8
    vshl.i16        q2,  q2,  #2    @ 4*(q0 - p0)
    vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
    vaddw.u8        q2,  q2,  d18   @ + p1
    vclt.u8         d26, d26, d22   @ < alpha
    vsubw.u8        q2,  q2,  d2    @ - q1
    vdup.8          d22, r3         @ beta
    vclt.s8         d20, d24, #0    @ lanes with tc < 0
    vrshrn.i16      d4,  q2,  #3    @ rounded >> 3, narrowed to 8 bit
    vclt.u8         d28, d28, d22   @ < beta
    vbic            d26, d26, d20   @ never filter lanes with tc < 0
    vclt.u8         d30, d30, d22   @ < beta
    vand            d26, d26, d28
    vneg.s8         d20, d24        @ -tc
    vand            d26, d26, d30   @ final per-lane filter mask
    vmin.s8         d4,  d4,  d24   @ delta = min(delta, tc)
    vmovl.u8        q14, d16
    @ Clamp against -tc BEFORE masking. With the mask applied first, a lane
    @ with tc < 0 (mask cleared, delta forced to 0) was raised to -tc > 0 by
    @ the vmax and its pixels were modified even though it must be skipped.
    vmax.s8         d4,  d4,  d20   @ delta = max(delta, -tc)
    vand            d4,  d4,  d26   @ delta = 0 where the lane is not filtered
    vmovl.u8        q11, d0
    vaddw.s8        q14, q14, d4    @ p0 + delta
    vsubw.s8        q11, q11, d4    @ q0 - delta
    vqmovun.s16     d16, q14        @ saturate back to unsigned 8 bit
    vqmovun.s16     d0,  q11
 .endm
 @ Vertical-edge chroma deblocking for the MBAFF case: only 4 rows.
 @ Per the C prototype in arm/deblock.h: r0 = pix, r1 = stride, r2 = alpha,
 @ r3 = beta, tc0 pointer on the stack.
 function deblock_h_chroma_mbaff_neon
    @ defined elsewhere; presumably validates alpha/beta/tc0 and loads the
    @ tc0 bytes into d24 - confirm
    h264_loop_filter_start
    @ start 4 bytes before pix so each 8-byte row straddles the edge
    sub             r0,  r0,  #4
    vld1.8          {d18}, [r0], r1
    vld1.8          {d16}, [r0], r1
    vld1.8          {d0},  [r0], r1
    vld1.8          {d2},  [r0], r1
    @ transpose in 16-bit units (macro defined elsewhere) so that d18/d16/d0/d2
    @ presumably hold p1/p0/q0/q1 for the four rows - confirm
    TRANSPOSE4x4_16 d18, d16, d0, d2
    h264_loop_filter_chroma8
    @ interleave the filtered p0 and q0 units: one 32-bit lane per row
    vtrn.16         d16, d0
    @ rewind the 4 rows and skip the two untouched p1 bytes
    sub             r0,  r0,  r1, lsl #2
    add             r0,  r0,  #2
    vst1.32         {d16[0]}, [r0], r1
    vst1.32         {d0[0]},  [r0], r1
    vst1.32         {d16[1]}, [r0], r1
    vst1.32         {d0[1]},  [r0]
    bx              lr
 endfunc
 @ Intra chroma deblocking filter (no tc clipping).
 @ Inputs:  q9 = p1, q8 = p0, q0 = q0, q1 = q1, r2 = alpha, r3 = beta.
 @ Outputs: q8 = filtered p0, q0 = filtered q0.
 @ Where |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta:
 @   p0' = (2*p1 + p0 + q1 + 2) >> 2,  q0' = (2*q1 + q0 + p1 + 2) >> 2.
 @ \width selects how many lanes are computed: with the default 16 both
 @ d-halves are processed; any other value skips the high halves (d17/d1 are
 @ then not meaningful, and the width=8 caller in this file stores only d16/d0).
 @ Clobbers q1, q2, q11, q13-q15, plus q3, q10, q12 when width is 16.
 .macro h264_loop_filter_chroma_intra, width=16
    vdup.8          q11, r2         @ alpha
    vabd.u8         q13, q8,  q0    @ abs(p0 - q0)
    vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
    vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
    vclt.u8         q13, q13, q11   @ < alpha
    vdup.8          q11, r3         @ beta
    vclt.u8         q14, q14, q11   @ < beta
    vclt.u8         q15, q15, q11   @ < beta
    vand            q13, q13, q14
    vand            q13, q13, q15   @ q13 = per-lane filter mask
    vshll.u8        q14, d18, #1    @ 2*p1
    vshll.u8        q2,  d2,  #1    @ 2*q1
 .ifc \width, 16
    vshll.u8        q15, d19, #1
    vshll.u8        q3,  d3,  #1
    vaddl.u8        q12, d17, d3    @ p0 + q1 (high half)
    vaddl.u8        q10, d1,  d19   @ q0 + p1 (high half)
 .endif
    vaddl.u8        q11, d16, d2    @ p0 + q1
    vaddl.u8        q1,  d18, d0    @ or vaddw q2, to not clobber q1
    vadd.u16        q14, q14, q11   @ 2*p1 + p0 + q1
    vadd.u16        q2,  q2,  q1    @ 2*q1 + q0 + p1
 .ifc \width, 16
    vadd.u16        q15, q15, q12
    vadd.u16        q3,  q3,  q10
 .endif
    vqrshrn.u16     d28, q14, #2    @ p0' (rounded >> 2)
    vqrshrn.u16     d4,  q2, #2     @ q0'
 .ifc \width, 16
    vqrshrn.u16     d29, q15, #2
    vqrshrn.u16     d5,  q3, #2
 .endif
    @ replace p0/q0 only in the lanes selected by the mask
    vbit            q8,  q14, q13
    vbit            q0,  q2,  q13
 .endm
 @ Intra chroma deblocking of a horizontal edge, 16 bytes wide.
 @ Per the C prototype in arm/deblock.h: r0 = pix, r1 = stride, r2 = alpha,
 @ r3 = beta.
 function deblock_v_chroma_intra_neon
    @ start two rows above the edge
    sub             r0,  r0,  r1, lsl #1
    @ vld2 de-interleaves each 16-byte row: even bytes to the first register,
    @ odd bytes to the second.  Rows p1, p0, q0, q1 -> q9, q8, q0, q1
    vld2.8          {d18,d19}, [r0,:128], r1
    vld2.8          {d16,d17}, [r0,:128], r1
    vld2.8          {d0, d1},  [r0,:128], r1
    vld2.8          {d2, d3},  [r0,:128]
    h264_loop_filter_chroma_intra
    @ r0 is on the q1 row; go back to p0 and store p0 and q0, re-interleaved
    sub             r0,  r0,  r1, lsl #1
    vst2.8          {d16,d17}, [r0,:128], r1
    vst2.8          {d0, d1},  [r0,:128], r1
    bx              lr
 endfunc
 @ Intra chroma deblocking of a vertical edge, 8 rows tall.
 @ Per the C prototype in arm/deblock.h: r0 = pix, r1 = stride, r2 = alpha,
 @ r3 = beta.  On return r0 = pix - 2 + 8*stride, which
 @ deblock_h_chroma_422_intra_neon relies on for its second pass.
 function deblock_h_chroma_intra_neon
    @ start 4 bytes before pix so each 8-byte row straddles the edge
    sub             r0,  r0,  #4
    @ rows 0-3 -> low halves of q9, q8, q0, q1
    vld1.8          {d18}, [r0], r1
    vld1.8          {d16}, [r0], r1
    vld1.8          {d0},  [r0], r1
    vld1.8          {d2},  [r0], r1
    @ rows 4-7 -> high halves
    vld1.8          {d19}, [r0], r1
    vld1.8          {d17}, [r0], r1
    vld1.8          {d1},  [r0], r1
    vld1.8          {d3},  [r0], r1
    @ transpose in 16-bit units (macro defined elsewhere) so that q9/q8/q0/q1
    @ presumably hold p1/p0/q0/q1 for all rows - confirm
    TRANSPOSE4x4_16 q9, q8, q0, q1
    h264_loop_filter_chroma_intra
    @ interleave the filtered p0 and q0 units: one 32-bit lane per row
    vtrn.16         q8,  q0
    @ rewind the 8 rows and skip the two untouched p1 bytes
    sub             r0,  r0,  r1, lsl #3
    add             r0,  r0,  #2
    @ store the middle 4 bytes of each row
    vst1.32         {d16[0]}, [r0], r1
    vst1.32         {d0[0]},  [r0], r1
    vst1.32         {d16[1]}, [r0], r1
    vst1.32         {d0[1]},  [r0], r1
    vst1.32         {d17[0]}, [r0], r1
    vst1.32         {d1[0]},  [r0], r1
    vst1.32         {d17[1]}, [r0], r1
    vst1.32         {d1[1]},  [r0], r1
    bx              lr
 endfunc
 @ 4:2:2 intra variant of the vertical-edge chroma filter: 16 rows, done as
 @ two consecutive calls of the 8-row deblock_h_chroma_intra_neon.
 function deblock_h_chroma_422_intra_neon
    push            {lr}
    bl              X(deblock_h_chroma_intra_neon)
    @ the callee leaves r0 at pix - 2 + 8*stride; +2 makes it the pix pointer
    @ of row 8 (r1-r3 are not modified by the callee)
    add             r0, r0,  #2
    pop             {lr}
    @ tail call: the second pass returns straight to our caller
    b               X(deblock_h_chroma_intra_neon)
 endfunc
 @ Intra vertical-edge chroma deblocking for the MBAFF case: only 4 rows.
 @ Per the C prototype in arm/deblock.h: r0 = pix, r1 = stride, r2 = alpha,
 @ r3 = beta.
 function deblock_h_chroma_intra_mbaff_neon
    @ start 4 bytes before pix so each 8-byte row straddles the edge
    sub             r0,  r0,  #4
    vld1.8          {d18}, [r0], r1
    vld1.8          {d16}, [r0], r1
    vld1.8          {d0},  [r0], r1
    vld1.8          {d2},  [r0], r1
    @ transpose in 16-bit units (macro defined elsewhere) so that d18/d16/d0/d2
    @ presumably hold p1/p0/q0/q1 for the four rows - confirm
    TRANSPOSE4x4_16 d18, d16, d0, d2
    @ width=8: only the low d-halves are computed; only d16/d0 are stored below
    h264_loop_filter_chroma_intra width=8
    @ interleave the filtered p0 and q0 units: one 32-bit lane per row
    vtrn.16         d16, d0
    @ rewind the 4 rows and skip the two untouched p1 bytes
    sub             r0,  r0,  r1, lsl #2
    add             r0,  r0,  #2
    vst1.32         {d16[0]}, [r0], r1
    vst1.32         {d0[0]},  [r0], r1
    vst1.32         {d16[1]}, [r0], r1
    vst1.32         {d0[1]},  [r0]
    bx              lr
 endfunc
 @ Computes the deblocking boundary strengths bs[2][8][4].
 @ Per the C prototype in arm/deblock.h: r0 = nnz, r1 = ref, r2 = mv, r3 = bs,
 @ [sp] = mvy_limit, [sp, #4] = bframe.
 @ Each result byte is 2 where a neighbouring nnz is nonzero, else 1 where
 @ the refs differ or an mv difference reaches the limit, else 0.
 function deblock_strength_neon
    ldr             ip,  [sp]       @ ip = mvy_limit
    vmov.i8         q8,  #0         @ q8: accumulator later stored to bs[0]
    lsl             ip,  ip,  #8
    add             r3,  r3,  #32   @ r3 = bs + 32 bytes = &bs[1]
    @ ip = ((mvy_limit-1) << 8) + 3: as bytes of a 16-bit lane, low = 3 and
    @ high = mvy_limit-1 (thresholds for the two mv components; presumably
    @ x then y - confirm)
    sub             ip,  ip,  #(1<<8)-3
    vmov.i8         q9,  #0         @ q9: accumulator later stored to bs[1]
    vdup.16         q10, ip
    ldr             ip,  [sp, #4]   @ ip = bframe, used as the loop control
    @ one iteration per reference list; consumes 40 bytes of ref and 160 bytes
    @ of mv per iteration
 lists:
    @ load bytes ref
    vld1.8          {d31}, [r1]!
    add             r2,  r2,  #16   @ skip the first 16 bytes of this list's mv
    vld1.8          {q1},  [r1]!
    vmov.i8         q0,  #0
    vld1.8          {q2},  [r1]!
    @ build the neighbouring-ref vectors with byte/word shuffles
    vext.8          q3,  q0,  q1,  #15
    vext.8          q0,  q0,  q2,  #15
    vuzp.32         q1,  q2
    vuzp.32         q3,  q0
    vext.8          q1,  q15, q2,  #12
    @ XOR is nonzero where neighbouring refs differ; OR into the accumulators
    veor            q0,  q0,  q2
    veor            q1,  q1,  q2
    vorr            q8,  q8,  q0
    vorr            q9,  q9,  q1
    vld1.16         {q11}, [r2,:128]!   @ mv + 0x10
    vld1.16         {q3},  [r2,:128]!   @ mv + 0x20
    vld1.16         {q12}, [r2,:128]!   @ mv + 0x30
    vld1.16         {q2},  [r2,:128]!   @ mv + 0x40
    vld1.16         {q13}, [r2,:128]!   @ mv + 0x50
    @ vext #12 shifts in the last mv of the preceding 16 bytes, pairing each
    @ mv with the one stored 4 bytes before it; vabd gives |difference|
    vext.8          q3,  q3,  q12, #12
    vext.8          q2,  q2,  q13, #12
    vabd.s16        q0,  q12, q3
    vld1.16         {q3},  [r2,:128]!   @ mv + 0x60
    vabd.s16        q1,  q13, q2
    vld1.16         {q14}, [r2,:128]!   @ mv + 0x70
    vqmovn.u16      d0,  q0
    vld1.16         {q2},  [r2,:128]!   @ mv + 0x80
    vld1.16         {q15}, [r2,:128]!   @ mv + 0x90
    vqmovn.u16      d1,  q1
    vext.8          q3,  q3,  q14, #12
    vext.8          q2,  q2,  q15, #12
    vabd.s16        q3,  q14, q3
    vabd.s16        q2,  q15, q2
    vqmovn.u16      d2,  q3
    vqmovn.u16      d3,  q2
    @ saturating subtract of the thresholds: a byte stays nonzero only if the
    @ difference was >= 4 (low byte) or >= mvy_limit (high byte)
    vqsub.u8        q0,  q0,  q10
    vqsub.u8        q1,  q1,  q10
    @ narrow each component pair to one byte: nonzero if either exceeded
    vqmovn.u16      d0,  q0
    vqmovn.u16      d1,  q1
    vabd.s16        q1,  q12, q13
    vorr            q8,  q8,  q0
    @ same test for mvs one 16-byte row apart (q11..q15 are consecutive
    @ 32-byte-spaced rows); result goes to the other accumulator
    vabd.s16        q0,  q11, q12
    vabd.s16        q2,  q13, q14
    vabd.s16        q3,  q14, q15
    vqmovn.u16      d0,  q0
    vqmovn.u16      d1,  q1
    vqmovn.u16      d2,  q2
    vqmovn.u16      d3,  q3
    vqsub.u8        q0,  q0,  q10
    vqsub.u8        q1,  q1,  q10
    vqmovn.u16      d0,  q0
    vqmovn.u16      d1,  q1
    @ ip = bframe - 1: zero (loop once more, for the second list) only when
    @ bframe was 1; on that second pass ip becomes -1 and the loop ends
    subs            ip,  ip,  #1
    vorr            q9,  q9,  q0
    beq             lists
    mov             ip,  #-32       @ post-index that steps r3 from bs[1] back to bs[0]
    @ load bytes nnz
    vld1.8          {d31}, [r0]!
    vld1.8          {q1},  [r0]!
    vmov.i8         q0,  #0
    vld1.8          {q2},  [r0]
    @ same neighbour shuffles as for ref above
    vext.8          q3,  q0,  q1,  #15
    vext.8          q0,  q0,  q2,  #15
    vuzp.32         q1,  q2
    vuzp.32         q3,  q0
    vext.8          q1,  q15, q2,  #12
    @ OR: nonzero if either neighbouring block has coefficients
    vorr            q0,  q0,  q2
    vorr            q1,  q1,  q2
    vmov.u8         q10, #1
    vmin.u8         q0,  q0,  q10
    vmin.u8         q1,  q1,  q10
    vmin.u8         q8,  q8,  q10       @ mv ? 1 : 0
    vmin.u8         q9,  q9,  q10
    vadd.u8         q0,  q0,  q0        @ nnz ? 2 : 0
    vadd.u8         q1,  q1,  q1
    @ final strength = max(mv/ref result, nnz result)
    vmax.u8         q8,  q8,  q0
    vmax.u8         q9,  q9,  q1
    @ the zip/trn sequence reorders q8 before it is stored; presumably a
    @ transpose into the bs[0] layout - confirm
    vzip.16         d16, d17
    vst1.8          {q9}, [r3,:128], ip @ bs[1]
    vtrn.8          d16, d17
    vtrn.32         d16, d17
    vst1.8          {q8}, [r3,:128]     @ bs[0]
    bx              lr
 endfunc
--- a/common/arm/deblock.h
+++ b/common/arm/deblock.h
@@ -0,0 +1,58 @@
 /*****************************************************************************
 * deblock.h: arm deblocking
 *****************************************************************************
 * Copyright (C) 2017-2025 x264 project
 *
 * Authors: Anton Mitrofanov <BugMaster@narod.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_ARM_DEBLOCK_H
 #define X264_ARM_DEBLOCK_H
 /* C prototypes for the NEON deblocking routines.  Every name is first
  * redirected through x264_template() and then declared. */
 /* Filters that take a tc0 clipping table. */
 #define x264_deblock_v_luma_neon x264_template(deblock_v_luma_neon)
 void x264_deblock_v_luma_neon  ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 #define x264_deblock_h_luma_neon x264_template(deblock_h_luma_neon)
 void x264_deblock_h_luma_neon  ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 #define x264_deblock_v_chroma_neon x264_template(deblock_v_chroma_neon)
 void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 #define x264_deblock_h_chroma_neon x264_template(deblock_h_chroma_neon)
 void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 /* Boundary-strength computation; writes bs from nnz, ref and mv. */
 #define x264_deblock_strength_neon x264_template(deblock_strength_neon)
 void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                 int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                 int mvy_limit, int bframe );
 #define x264_deblock_h_chroma_422_neon x264_template(deblock_h_chroma_422_neon)
 void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 #define x264_deblock_h_chroma_mbaff_neon x264_template(deblock_h_chroma_mbaff_neon)
 void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 /* Intra filters: same leading arguments, but no tc0 table. */
 #define x264_deblock_h_chroma_intra_mbaff_neon x264_template(deblock_h_chroma_intra_mbaff_neon)
 void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 #define x264_deblock_h_chroma_intra_neon x264_template(deblock_h_chroma_intra_neon)
 void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 #define x264_deblock_h_chroma_422_intra_neon x264_template(deblock_h_chroma_422_intra_neon)
 void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 #define x264_deblock_v_chroma_intra_neon x264_template(deblock_v_chroma_intra_neon)
 void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 #define x264_deblock_h_luma_intra_neon x264_template(deblock_h_luma_intra_neon)
 void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 #define x264_deblock_v_luma_intra_neon x264_template(deblock_v_luma_intra_neon)
 void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 #endif
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -0,0 +1,366 @@
 /*****************************************************************************
 * mc-c.c: arm motion compensation
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "common/common.h"
 #include "mc.h"
 /* Prototypes of routines implemented outside this file (no bodies here).
  * Each name is redirected through x264_template() before being declared. */
 /* Prefetch helpers; installed for any ARMv6 CPU by x264_mc_init_arm. */
 #define x264_prefetch_ref_arm x264_template(prefetch_ref_arm)
 void x264_prefetch_ref_arm( uint8_t *, intptr_t, int );
 #define x264_prefetch_fenc_arm x264_template(prefetch_fenc_arm)
 void x264_prefetch_fenc_arm( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 /* Aligned memory copy / clear. */
 #define x264_memcpy_aligned_neon x264_template(memcpy_aligned_neon)
 void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
 #define x264_memzero_aligned_neon x264_template(memzero_aligned_neon)
 void x264_memzero_aligned_neon( void *dst, size_t n );
 /* Block averaging, one function per block size; installed into pf->avg[]. */
 #define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
 void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
 void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
 void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
 void x264_pixel_avg_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
 void x264_pixel_avg_8x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
 void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
 void x264_pixel_avg_4x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
 void x264_pixel_avg_4x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
 void x264_pixel_avg_4x2_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 /* Two-source averaging by width; collected in pixel_avg_wtab_neon below,
  * where mc_luma_neon/get_ref_neon call them as
  * ( dst, dst stride, src1, src stride, src2, height ). */
 #define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
 void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 #define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
 void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 #define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon)
 void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 #define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
 void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 /* Plane copy kernels.  The *_core_* variants are not installed directly by
  * x264_mc_init_arm; presumably the PLANE_COPY* / PLANE_INTERLEAVE wrappers
  * instantiated further down call them - confirm against the macro definitions. */
 #define x264_plane_copy_core_neon x264_template(plane_copy_core_neon)
 void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
                                pixel *src, intptr_t i_src, int w, int h );
 #define x264_plane_copy_deinterleave_neon x264_template(plane_copy_deinterleave_neon)
 void x264_plane_copy_deinterleave_neon(  pixel *dstu, intptr_t i_dstu,
                                         pixel *dstv, intptr_t i_dstv,
                                         pixel *src,  intptr_t i_src, int w, int h );
 #define x264_plane_copy_deinterleave_rgb_neon x264_template(plane_copy_deinterleave_rgb_neon)
 void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
                                            pixel *dstb, intptr_t i_dstb,
                                            pixel *dstc, intptr_t i_dstc,
                                            pixel *src,  intptr_t i_src, int pw, int w, int h );
 #define x264_plane_copy_interleave_core_neon x264_template(plane_copy_interleave_core_neon)
 void x264_plane_copy_interleave_core_neon( pixel *dst,  intptr_t i_dst,
                                           pixel *srcu, intptr_t i_srcu,
                                           pixel *srcv, intptr_t i_srcv, int w, int h );
 #define x264_plane_copy_swap_core_neon x264_template(plane_copy_swap_core_neon)
 void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
                                     pixel *src, intptr_t i_src, int w, int h );
 /* Chroma interleave / de-interleave helpers. */
 #define x264_store_interleave_chroma_neon x264_template(store_interleave_chroma_neon)
 void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
 #define x264_load_deinterleave_chroma_fdec_neon x264_template(load_deinterleave_chroma_fdec_neon)
 void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
 #define x264_load_deinterleave_chroma_fenc_neon x264_template(load_deinterleave_chroma_fenc_neon)
 void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
 /* Name redirections for all sixteen weighted-prediction kernels
  * (widths 4/8/16/20 x variants plain/_nodenom/_offsetadd/_offsetsub). */
 #define x264_mc_weight_w16_neon x264_template(mc_weight_w16_neon)
 #define x264_mc_weight_w16_nodenom_neon x264_template(mc_weight_w16_nodenom_neon)
 #define x264_mc_weight_w16_offsetadd_neon x264_template(mc_weight_w16_offsetadd_neon)
 #define x264_mc_weight_w16_offsetsub_neon x264_template(mc_weight_w16_offsetsub_neon)
 #define x264_mc_weight_w20_neon x264_template(mc_weight_w20_neon)
 #define x264_mc_weight_w20_nodenom_neon x264_template(mc_weight_w20_nodenom_neon)
 #define x264_mc_weight_w20_offsetadd_neon x264_template(mc_weight_w20_offsetadd_neon)
 #define x264_mc_weight_w20_offsetsub_neon x264_template(mc_weight_w20_offsetsub_neon)
 #define x264_mc_weight_w4_neon x264_template(mc_weight_w4_neon)
 #define x264_mc_weight_w4_nodenom_neon x264_template(mc_weight_w4_nodenom_neon)
 #define x264_mc_weight_w4_offsetadd_neon x264_template(mc_weight_w4_offsetadd_neon)
 #define x264_mc_weight_w4_offsetsub_neon x264_template(mc_weight_w4_offsetsub_neon)
 #define x264_mc_weight_w8_neon x264_template(mc_weight_w8_neon)
 #define x264_mc_weight_w8_nodenom_neon x264_template(mc_weight_w8_nodenom_neon)
 #define x264_mc_weight_w8_offsetadd_neon x264_template(mc_weight_w8_offsetadd_neon)
 #define x264_mc_weight_w8_offsetsub_neon x264_template(mc_weight_w8_offsetsub_neon)
 #if !HIGH_BIT_DEPTH
 /* MC_WEIGHT(func) declares the four width-specific kernels of one variant
  * and builds the static table mc<func>_wtab_neon[6] from them.  The table is
  * indexed with i_width>>2 by mc_luma_neon/get_ref_neon: slots 0 and 1 both
  * hold the w4 kernel, slot 2 w8, slots 3 and 4 w16, slot 5 w20. */
 #define MC_WEIGHT(func)\
 void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
 void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
 void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
 void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
 \
 static weight_fn_t mc##func##_wtab_neon[6] =\
 {\
    x264_mc_weight_w4##func##_neon,\
    x264_mc_weight_w4##func##_neon,\
    x264_mc_weight_w8##func##_neon,\
    x264_mc_weight_w16##func##_neon,\
    x264_mc_weight_w16##func##_neon,\
    x264_mc_weight_w20##func##_neon,\
 };
 /* Instantiations: mc_wtab_neon, mc_nodenom_wtab_neon, mc_offsetadd_wtab_neon
  * and mc_offsetsub_wtab_neon. */
 MC_WEIGHT()
 MC_WEIGHT(_nodenom)
 MC_WEIGHT(_offsetadd)
 MC_WEIGHT(_offsetsub)
 #endif
 /* Plain block copies by width; called below as
  * ( dst, dst stride, src, src stride, height ). */
 #define x264_mc_copy_w4_neon x264_template(mc_copy_w4_neon)
 void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_mc_copy_w8_neon x264_template(mc_copy_w8_neon)
 void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_mc_copy_w16_neon x264_template(mc_copy_w16_neon)
 void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_mc_copy_w16_aligned_neon x264_template(mc_copy_w16_aligned_neon)
 void x264_mc_copy_w16_aligned_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 /* Chroma motion compensation and lowres frame generation. */
 #define x264_mc_chroma_neon x264_template(mc_chroma_neon)
 void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
 #define x264_frame_init_lowres_core_neon x264_template(frame_init_lowres_core_neon)
 void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
 /* The three per-row passes driven by hpel_filter_neon below. */
 #define x264_hpel_filter_v_neon x264_template(hpel_filter_v_neon)
 void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int );
 #define x264_hpel_filter_c_neon x264_template(hpel_filter_c_neon)
 void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
 #define x264_hpel_filter_h_neon x264_template(hpel_filter_h_neon)
 void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
 /* Integral image initialisation. */
 #define x264_integral_init4h_neon x264_template(integral_init4h_neon)
 void x264_integral_init4h_neon( uint16_t *, uint8_t *, intptr_t );
 #define x264_integral_init4v_neon x264_template(integral_init4v_neon)
 void x264_integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
 #define x264_integral_init8h_neon x264_template(integral_init8h_neon)
 void x264_integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
 #define x264_integral_init8v_neon x264_template(integral_init8v_neon)
 void x264_integral_init8v_neon( uint16_t *, intptr_t );
 /* Macroblock-tree helpers. */
 #define x264_mbtree_propagate_cost_neon x264_template(mbtree_propagate_cost_neon)
 void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
 #define x264_mbtree_fix8_pack_neon x264_template(mbtree_fix8_pack_neon)
 void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
 #define x264_mbtree_fix8_unpack_neon x264_template(mbtree_fix8_unpack_neon)
 void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
 #if !HIGH_BIT_DEPTH
 /* Select the NEON weighting table for w from its scale, denominator and
  * offset, and cache the offset magnitude when only an offset is applied.
  * h is not used. */
 static void weight_cache_neon( x264_t *h, x264_weight_t *w )
 {
    const int unit_scale = ( w->i_scale == 1<<w->i_denom );
    if( !unit_scale )
    {
        /* Real scaling: a zero denominator gets the "nodenom" table. */
        w->weightfn = w->i_denom ? mc_wtab_neon : mc_nodenom_wtab_neon;
        return;
    }
    /* Scale is exactly one, so only the offset matters: store its magnitude
     * and pick the add or sub table from its sign. */
    if( w->i_offset >= 0 )
    {
        w->cachea[0] = w->i_offset;
        w->weightfn = mc_offsetadd_wtab_neon;
    }
    else
    {
        w->cachea[0] = -w->i_offset;
        w->weightfn = mc_offsetsub_wtab_neon;
    }
 }
 /* Two-source averaging kernels, looked up with i_width>>2. */
 static void (* const pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
 {
    [0] = NULL,
    [1] = x264_pixel_avg2_w4_neon,
    [2] = x264_pixel_avg2_w8_neon,
    [3] = x264_pixel_avg2_w16_neon,   /* w12: w16 is no slower, so there is no separate function */
    [4] = x264_pixel_avg2_w16_neon,
    [5] = x264_pixel_avg2_w20_neon,
 };
 /* Plain copy kernels, looked up with i_width>>2; slots 0 and 3 are empty. */
 static void (* const mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) =
 {
    [0] = NULL,
    [1] = x264_mc_copy_w4_neon,
    [2] = x264_mc_copy_w8_neon,
    [3] = NULL,
    [4] = x264_mc_copy_w16_neon,
 };
 /* Luma motion compensation into dst for the quarter-pel vector (mvx,mvy).
  * src[] holds the reference planes selected through x264_hpel_ref0/ref1.
  * Positions that need quarter-pel interpolation average two planes; the
  * weighting function is applied whenever weight->weightfn is set. */
 static void mc_luma_neon( uint8_t *dst,    intptr_t i_dst_stride,
                          uint8_t *src[4], intptr_t i_src_stride,
                          int mvx, int mvy,
                          int i_width, int i_height, const x264_weight_t *weight )
 {
    const int frac_x = mvx&3;
    const int frac_y = mvy&3;
    const int qpel_idx = (frac_y<<2) + frac_x;
    const int tab_idx = i_width>>2;
    const intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
    uint8_t *ref_a = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( frac_y == 3 )              // explicit if() to force conditional add
        ref_a += i_src_stride;
    if( !(qpel_idx & 5) )
    {
        /* No quarter-pel interpolation needed: weight or copy one plane. */
        if( weight->weightfn )
            weight->weightfn[tab_idx]( dst, i_dst_stride, ref_a, i_src_stride, weight, i_height );
        else
            mc_copy_wtab_neon[tab_idx]( dst, i_dst_stride, ref_a, i_src_stride, i_height );
        return;
    }
    /* Quarter-pel: average the two nearest planes, then weight in place. */
    uint8_t *ref_b = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
    pixel_avg_wtab_neon[tab_idx]( dst, i_dst_stride, ref_a, i_src_stride, ref_b, i_height );
    if( weight->weightfn )
        weight->weightfn[tab_idx]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
 }
 /* Like mc_luma_neon, but avoids the copy when nothing has to be computed:
  * returns a pointer to the prediction and its stride via *i_dst_stride.
  * When neither interpolation nor weighting is needed the result points
  * straight into the reference plane and *i_dst_stride becomes i_src_stride;
  * otherwise the block is written to dst and dst is returned. */
 static uint8_t *get_ref_neon( uint8_t *dst,   intptr_t *i_dst_stride,
                              uint8_t *src[4], intptr_t i_src_stride,
                              int mvx, int mvy,
                              int i_width, int i_height, const x264_weight_t *weight )
 {
    const int frac_x = mvx&3;
    const int frac_y = mvy&3;
    const int qpel_idx = (frac_y<<2) + frac_x;
    const int tab_idx = i_width>>2;
    const intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
    uint8_t *ref_a = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( frac_y == 3 )              // explicit if() to force conditional add
        ref_a += i_src_stride;
    if( !(qpel_idx & 5) )
    {
        if( !weight->weightfn )
        {
            /* Nothing to compute: hand back the reference itself. */
            *i_dst_stride = i_src_stride;
            return ref_a;
        }
        weight->weightfn[tab_idx]( dst, *i_dst_stride, ref_a, i_src_stride, weight, i_height );
        return dst;
    }
    /* Quarter-pel: average the two nearest planes, then weight in place. */
    uint8_t *ref_b = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
    pixel_avg_wtab_neon[tab_idx]( dst, *i_dst_stride, ref_a, i_src_stride, ref_b, i_height );
    if( weight->weightfn )
        weight->weightfn[tab_idx]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
    return dst;
 }
 /* Run the three half-pel filter passes (vertical, centre, horizontal) over
  * `height` rows.  All plane pointers are first moved back to the 16-byte
  * boundary at or below src and the row width is extended by the same amount.
  * buf+8 is handed to the vertical pass and then to the centre pass;
  * presumably it carries the 16-bit intermediate between them. */
 static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
                              intptr_t stride, int width, int height, int16_t *buf )
 {
    const intptr_t realign = (intptr_t)src & 15;
    int16_t *const mid = buf+8;
    src   -= realign;
    dsth  -= realign;
    dstv  -= realign;
    dstc  -= realign;
    width += realign;
    for( ; height--; src += stride, dsth += stride, dstv += stride, dstc += stride )
    {
        x264_hpel_filter_v_neon( dstv, src, mid, stride, width );
        x264_hpel_filter_c_neon( dstc, mid, width );
        x264_hpel_filter_h_neon( dsth, src, width );
    }
 }
 /* Instantiate shared wrapper templates for NEON.  The macro definitions are
  * not visible in this file; presumably they generate the static functions
  * plane_copy_neon, plane_copy_swap_neon, plane_copy_interleave_neon and
  * mbtree_propagate_list_neon that x264_mc_init_arm installs - confirm. */
 PLANE_COPY(16, neon)
 PLANE_COPY_SWAP(16, neon)
 PLANE_INTERLEAVE(neon)
 PROPAGATE_LIST(neon)
 #endif // !HIGH_BIT_DEPTH
 /* Install into pf the ARM motion-compensation routines permitted by the CPU
  * flags in cpu.  Entries that are not assigned here keep their old value. */
 void x264_mc_init_arm( uint32_t cpu, x264_mc_functions_t *pf )
 {
    const int have_armv6 = !!(cpu&X264_CPU_ARMV6);
    const int have_neon  = !!(cpu&X264_CPU_NEON);
    /* Nothing here applies to CPUs without ARMv6. */
    if( !have_armv6 )
        return;
 #if !HIGH_BIT_DEPTH
    /* ARMv6 without NEON only gets the prefetch helpers. */
    pf->prefetch_ref      = x264_prefetch_ref_arm;
    pf->prefetch_fenc_420 = x264_prefetch_fenc_arm;
    pf->prefetch_fenc_422 = x264_prefetch_fenc_arm; /* FIXME */
 #endif // !HIGH_BIT_DEPTH
    if( !have_neon )
        return;
 #if !HIGH_BIT_DEPTH
    /* Motion compensation proper. */
    pf->mc_luma   = mc_luma_neon;
    pf->mc_chroma = x264_mc_chroma_neon;
    pf->get_ref   = get_ref_neon;
    pf->hpel_filter = hpel_filter_neon;
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
    /* Weighted prediction. */
    pf->weight_cache = weight_cache_neon;
    pf->weight       = mc_wtab_neon;
    pf->offsetadd    = mc_offsetadd_wtab_neon;
    pf->offsetsub    = mc_offsetsub_wtab_neon;
    /* Block averaging. */
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
    /* Block copies. */
    pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_neon;
    pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_neon;
    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_neon;
    pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
    /* Whole-plane copies and chroma (de)interleaving. */
    pf->plane_copy                  = plane_copy_neon;
    pf->plane_copy_swap             = plane_copy_swap_neon;
    pf->plane_copy_interleave       = plane_copy_interleave_neon;
    pf->plane_copy_deinterleave     = x264_plane_copy_deinterleave_neon;
    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
    pf->store_interleave_chroma       = x264_store_interleave_chroma_neon;
    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
    /* Integral images. */
    pf->integral_init4h = x264_integral_init4h_neon;
    pf->integral_init4v = x264_integral_init4v_neon;
    pf->integral_init8h = x264_integral_init8h_neon;
    pf->integral_init8v = x264_integral_init8v_neon;
    /* Macroblock tree. */
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
    pf->mbtree_propagate_list = mbtree_propagate_list_neon;
    pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_neon;
    pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_neon;
 #endif // !HIGH_BIT_DEPTH
    /* Aligned memory helpers are installed for every bit depth. */
    pf->memzero_aligned = x264_memzero_aligned_neon;
 // Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
 #ifndef SYS_MACOSX
    pf->memcpy_aligned  = x264_memcpy_aligned_neon;
 #endif
 }
--- a/common/arm/mc.h
+++ b/common/arm/mc.h
@@ -0,0 +1,32 @@
 /*****************************************************************************
 * mc.h: arm motion compensation
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_ARM_MC_H
 #define X264_ARM_MC_H
 /* The public name is routed through x264_template() (defined elsewhere in the
  * project) before the prototype, so the declaration below uses whatever
  * symbol name that macro produces. */
 #define x264_mc_init_arm x264_template(mc_init_arm)
 /* ARM initialiser for the motion-compensation function table.
  *   cpu - CPU capability flag mask
  *   pf  - function table to update (presumably filled with the ARM/NEON
  *         implementations the flags allow - confirm against mc-c.c) */
 void x264_mc_init_arm( uint32_t cpu, x264_mc_functions_t *pf );
 #endif
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
--- a/common/arm/pixel.h
+++ b/common/arm/pixel.h
@@ -0,0 +1,160 @@
 /*****************************************************************************
 * pixel.h: arm pixel metrics
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_ARM_PIXEL_H
 #define X264_ARM_PIXEL_H
 /* Symbol alias table: every x264_pixel_* name declared in this header is
  * first wrapped in x264_template() (defined elsewhere; presumably it applies
  * a build-specific prefix - confirm), so the prototypes that follow and their
  * callers resolve to the same final symbol. Grouped as avg2/avg, sad,
  * sad_aligned (plus _dual variants), sad_x3, sad_x4, satd and ssd. */
 #define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon)
 #define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
 #define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
 #define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
 #define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
 #define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
 #define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
 #define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
 #define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
 #define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
 #define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
 #define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
 #define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
 #define x264_pixel_sad_16x16_neon x264_template(pixel_sad_16x16_neon)
 #define x264_pixel_sad_16x8_neon x264_template(pixel_sad_16x8_neon)
 #define x264_pixel_sad_4x4_armv6 x264_template(pixel_sad_4x4_armv6)
 #define x264_pixel_sad_4x4_neon x264_template(pixel_sad_4x4_neon)
 #define x264_pixel_sad_4x8_armv6 x264_template(pixel_sad_4x8_armv6)
 #define x264_pixel_sad_4x8_neon x264_template(pixel_sad_4x8_neon)
 #define x264_pixel_sad_8x16_neon x264_template(pixel_sad_8x16_neon)
 #define x264_pixel_sad_8x4_neon x264_template(pixel_sad_8x4_neon)
 #define x264_pixel_sad_8x8_neon x264_template(pixel_sad_8x8_neon)
 #define x264_pixel_sad_aligned_16x16_neon x264_template(pixel_sad_aligned_16x16_neon)
 #define x264_pixel_sad_aligned_16x16_neon_dual x264_template(pixel_sad_aligned_16x16_neon_dual)
 #define x264_pixel_sad_aligned_16x8_neon x264_template(pixel_sad_aligned_16x8_neon)
 #define x264_pixel_sad_aligned_16x8_neon_dual x264_template(pixel_sad_aligned_16x8_neon_dual)
 #define x264_pixel_sad_aligned_4x4_neon x264_template(pixel_sad_aligned_4x4_neon)
 #define x264_pixel_sad_aligned_4x8_neon x264_template(pixel_sad_aligned_4x8_neon)
 #define x264_pixel_sad_aligned_8x16_neon x264_template(pixel_sad_aligned_8x16_neon)
 #define x264_pixel_sad_aligned_8x16_neon_dual x264_template(pixel_sad_aligned_8x16_neon_dual)
 #define x264_pixel_sad_aligned_8x4_neon x264_template(pixel_sad_aligned_8x4_neon)
 #define x264_pixel_sad_aligned_8x4_neon_dual x264_template(pixel_sad_aligned_8x4_neon_dual)
 #define x264_pixel_sad_aligned_8x8_neon x264_template(pixel_sad_aligned_8x8_neon)
 #define x264_pixel_sad_aligned_8x8_neon_dual x264_template(pixel_sad_aligned_8x8_neon_dual)
 #define x264_pixel_sad_x3_16x16_neon x264_template(pixel_sad_x3_16x16_neon)
 #define x264_pixel_sad_x3_16x8_neon x264_template(pixel_sad_x3_16x8_neon)
 #define x264_pixel_sad_x3_4x4_neon x264_template(pixel_sad_x3_4x4_neon)
 #define x264_pixel_sad_x3_4x8_neon x264_template(pixel_sad_x3_4x8_neon)
 #define x264_pixel_sad_x3_8x16_neon x264_template(pixel_sad_x3_8x16_neon)
 #define x264_pixel_sad_x3_8x4_neon x264_template(pixel_sad_x3_8x4_neon)
 #define x264_pixel_sad_x3_8x8_neon x264_template(pixel_sad_x3_8x8_neon)
 #define x264_pixel_sad_x4_16x16_neon x264_template(pixel_sad_x4_16x16_neon)
 #define x264_pixel_sad_x4_16x8_neon x264_template(pixel_sad_x4_16x8_neon)
 #define x264_pixel_sad_x4_4x4_neon x264_template(pixel_sad_x4_4x4_neon)
 #define x264_pixel_sad_x4_4x8_neon x264_template(pixel_sad_x4_4x8_neon)
 #define x264_pixel_sad_x4_8x16_neon x264_template(pixel_sad_x4_8x16_neon)
 #define x264_pixel_sad_x4_8x4_neon x264_template(pixel_sad_x4_8x4_neon)
 #define x264_pixel_sad_x4_8x8_neon x264_template(pixel_sad_x4_8x8_neon)
 #define x264_pixel_satd_16x16_neon x264_template(pixel_satd_16x16_neon)
 #define x264_pixel_satd_16x8_neon x264_template(pixel_satd_16x8_neon)
 #define x264_pixel_satd_4x4_neon x264_template(pixel_satd_4x4_neon)
 #define x264_pixel_satd_4x8_neon x264_template(pixel_satd_4x8_neon)
 #define x264_pixel_satd_8x16_neon x264_template(pixel_satd_8x16_neon)
 #define x264_pixel_satd_8x4_neon x264_template(pixel_satd_8x4_neon)
 #define x264_pixel_satd_8x8_neon x264_template(pixel_satd_8x8_neon)
 #define x264_pixel_ssd_16x16_neon x264_template(pixel_ssd_16x16_neon)
 #define x264_pixel_ssd_16x8_neon x264_template(pixel_ssd_16x8_neon)
 #define x264_pixel_ssd_4x4_neon x264_template(pixel_ssd_4x4_neon)
 #define x264_pixel_ssd_4x8_neon x264_template(pixel_ssd_4x8_neon)
 #define x264_pixel_ssd_8x16_neon x264_template(pixel_ssd_8x16_neon)
 #define x264_pixel_ssd_8x4_neon x264_template(pixel_ssd_8x4_neon)
 #define x264_pixel_ssd_8x8_neon x264_template(pixel_ssd_8x8_neon)
 /* Declare one prototype per block size (16x16, 16x8, 8x16, 8x8, 8x4, 4x8,
  * 4x4) for the family x264_pixel_<name>_<W>x<H>_<suffix>.
  *   ret    - return type shared by every size
  *   name   - metric name (e.g. sad, satd, ssd)
  *   suffix - implementation suffix (e.g. neon)
  *   args   - parenthesised parameter list shared by every size
  * The last line intentionally has NO trailing backslash: a continuation
  * there would splice whatever line follows (here the next #define) into
  * the macro body. */
 #define DECL_PIXELS( ret, name, suffix, args ) \
    ret x264_pixel_##name##_16x16_##suffix args;\
    ret x264_pixel_##name##_16x8_##suffix args;\
    ret x264_pixel_##name##_8x16_##suffix args;\
    ret x264_pixel_##name##_8x8_##suffix args;\
    ret x264_pixel_##name##_8x4_##suffix args;\
    ret x264_pixel_##name##_4x8_##suffix args;\
    ret x264_pixel_##name##_4x4_##suffix args;
 /* Single-candidate metric returning int, taking two (uint8_t *, intptr_t)
  * pairs (presumably pixel pointer and stride - confirm). The integer
  * parameters are intptr_t so these prototypes agree with the armv6
  * declarations of the same function family in this header. */
 #define DECL_X1( name, suffix ) \
    DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) )
 /* Multi-candidate variants: <name>_x3 takes four uint8_t pointers and
  * <name>_x4 takes five, each followed by one intptr_t and an int * output
  * argument; both return void. */
 #define DECL_X4( name, suffix ) \
    DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
    DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
 /* ARMv6 SAD routines exist only for the 4x4 and 4x8 sizes, so they are
  * declared by hand rather than through DECL_X1. */
 int x264_pixel_sad_4x4_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t );
 int x264_pixel_sad_4x8_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t );
 /* NEON families: each line expands to seven prototypes (one per block size)
  * via DECL_PIXELS; DECL_X4 expands to both the _x3 and _x4 variants. */
 DECL_X1( sad, neon )
 DECL_X1( sad_aligned, neon )
 DECL_X1( sad_aligned, neon_dual )
 DECL_X4( sad, neon )
 DECL_X1( satd, neon )
 DECL_X1( ssd, neon )
 /* The remaining NEON entry points do not fit the per-size DECL_* pattern, so
  * each gets its own x264_template() alias followed by an explicit prototype.
  * Parameter meanings are not visible in this header; the uint8_t * / intptr_t
  * pairs are presumably pixel pointer and stride - confirm in pixel-a.S. */
 #define x264_pixel_ssd_nv12_core_neon x264_template(pixel_ssd_nv12_core_neon)
 void x264_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * );
 #define x264_pixel_vsad_neon x264_template(pixel_vsad_neon)
 int x264_pixel_vsad_neon( uint8_t *, intptr_t, int );
 /* sa8d family; the combined sa8d_satd variant returns a 64-bit value. */
 #define x264_pixel_sa8d_8x8_neon x264_template(pixel_sa8d_8x8_neon)
 int x264_pixel_sa8d_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t );
 #define x264_pixel_sa8d_16x16_neon x264_template(pixel_sa8d_16x16_neon)
 int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
 #define x264_pixel_sa8d_satd_16x16_neon x264_template(pixel_sa8d_satd_16x16_neon)
 uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
 /* var family: single-buffer input, 64-bit return value. */
 #define x264_pixel_var_8x8_neon x264_template(pixel_var_8x8_neon)
 uint64_t x264_pixel_var_8x8_neon  ( uint8_t *, intptr_t );
 #define x264_pixel_var_8x16_neon x264_template(pixel_var_8x16_neon)
 uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
 #define x264_pixel_var_16x16_neon x264_template(pixel_var_16x16_neon)
 uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
 /* var2 family: two buffers without an explicit stride argument, plus an
  * int * argument. */
 #define x264_pixel_var2_8x8_neon x264_template(pixel_var2_8x8_neon)
 int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * );
 #define x264_pixel_var2_8x16_neon x264_template(pixel_var2_8x16_neon)
 int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );
 /* hadamard_ac family: single-buffer input, 64-bit return value. */
 #define x264_pixel_hadamard_ac_8x8_neon x264_template(pixel_hadamard_ac_8x8_neon)
 uint64_t x264_pixel_hadamard_ac_8x8_neon  ( uint8_t *, intptr_t );
 #define x264_pixel_hadamard_ac_8x16_neon x264_template(pixel_hadamard_ac_8x16_neon)
 uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
 #define x264_pixel_hadamard_ac_16x8_neon x264_template(pixel_hadamard_ac_16x8_neon)
 uint64_t x264_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t );
 #define x264_pixel_hadamard_ac_16x16_neon x264_template(pixel_hadamard_ac_16x16_neon)
 uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t );
 /* SSIM helpers: the core routine writes into a caller-provided int[2][4]
  * array; the end4 routine takes two int[5][4] arrays and a width and
  * returns a float. */
 #define x264_pixel_ssim_4x4x2_core_neon x264_template(pixel_ssim_4x4x2_core_neon)
 void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
                                      const uint8_t *, intptr_t,
                                      int sums[2][4] );
 #define x264_pixel_ssim_end4_neon x264_template(pixel_ssim_end4_neon)
 float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
 #define x264_pixel_asd8_neon x264_template(pixel_asd8_neon)
 int x264_pixel_asd8_neon( uint8_t *, intptr_t,  uint8_t *, intptr_t, int );
 #endif
--- a/common/arm/predict-a.S
+++ b/common/arm/predict-a.S
@@ -0,0 +1,808 @@
 /*****************************************************************************
 * predict.S: arm intra prediction
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Mans Rullgard <mans@mansr.com>
 *          Martin Storsjo <martin@martin.st>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "asm.S"
 // Table of eight 16-bit weights 1..8. The plane (_p_) predictors in this file
 // load it and multiply edge-pixel differences by it.
 const p16weight, align=4
 .short 1,2,3,4,5,6,7,8
 endconst
 .text
 // ldcol.8: gather a column of bytes into the lanes of D register \rd.
 //   rd - destination D register (only the selected lanes are written)
 //   rs - address register; post-incremented by \rt after every byte
 //   rt - register holding the distance between consecutive bytes
 //   n  - 8 loads all eight lanes; any other value loads half of them
 //   hi - used when n != 8: 0 loads lanes 0-3, 1 loads lanes 4-7
 .macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
 .if \n == 8 || \hi == 0
    vld1.8          {\rd[0]}, [\rs], \rt
    vld1.8          {\rd[1]}, [\rs], \rt
    vld1.8          {\rd[2]}, [\rs], \rt
    vld1.8          {\rd[3]}, [\rs], \rt
 .endif
 .if \n == 8 || \hi == 1
    vld1.8          {\rd[4]}, [\rs], \rt
    vld1.8          {\rd[5]}, [\rs], \rt
    vld1.8          {\rd[6]}, [\rs], \rt
    vld1.8          {\rd[7]}, [\rs], \rt
 .endif
 .endm
 // ldcol.16: gather a column of 16 bytes spaced \rt apart.
 //   rd1 - D register receiving bytes 0-7 (read through \rs)
 //   rd2 - D register receiving bytes 8-15 (read through \ru)
 //   rs  - address of byte 0; advanced by 8*\rt on exit
 //   rt  - register holding the distance between consecutive bytes
 //   ru  - scratch register; set to \rs + 8*\rt, advanced by 8*\rt on exit
 // Loads through the two pointers are interleaved.
 .macro ldcol.16  rd1,  rd2,  rs,  rt,  ru
    add             \ru, \rs, \rt, lsl #3
    vld1.8          {\rd1[0]}, [\rs], \rt
    vld1.8          {\rd2[0]}, [\ru], \rt
    vld1.8          {\rd1[1]}, [\rs], \rt
    vld1.8          {\rd2[1]}, [\ru], \rt
    vld1.8          {\rd1[2]}, [\rs], \rt
    vld1.8          {\rd2[2]}, [\ru], \rt
    vld1.8          {\rd1[3]}, [\rs], \rt
    vld1.8          {\rd2[3]}, [\ru], \rt
    vld1.8          {\rd1[4]}, [\rs], \rt
    vld1.8          {\rd2[4]}, [\ru], \rt
    vld1.8          {\rd1[5]}, [\rs], \rt
    vld1.8          {\rd2[5]}, [\ru], \rt
    vld1.8          {\rd1[6]}, [\rs], \rt
    vld1.8          {\rd2[6]}, [\ru], \rt
    vld1.8          {\rd1[7]}, [\rs], \rt
    vld1.8          {\rd2[7]}, [\ru], \rt
 .endm
 // add16x8: sum the sixteen unsigned bytes held in D registers \rl and \rh.
 //   dq     - Q register receiving the widened (16-bit) pairwise sums
 //   dl, dh - D registers that are folded together; the call sites in this
 //            file pass the low and high halves of \dq
 // After the two pairwise adds every 16-bit lane of \dl holds the total.
 .macro add16x8  dq,  dl,  dh,  rl,  rh
    vaddl.u8        \dq, \rl, \rh
    vadd.u16        \dl, \dl, \dh
    vpadd.u16       \dl, \dl, \dl
    vpadd.u16       \dl, \dl, \dl
 .endm
 // Horizontal 4x4 fill. For each of the four rows (FDEC_STRIDE bytes apart,
 // starting at r0) the byte at column -1 is replicated into all four bytes of
 // a word and stored at the start of that row.
 //   r0 - address of row 0, column 0; clobbers r1-r3 and ip
 // Written by hand because gcc doesn't believe in using the free shift in add
 function predict_4x4_h_armv6
    ldrb    r1, [r0, #0*FDEC_STRIDE-1]
    ldrb    r2, [r0, #1*FDEC_STRIDE-1]
    ldrb    r3, [r0, #2*FDEC_STRIDE-1]
    ldrb    ip, [r0, #3*FDEC_STRIDE-1]
    // x + (x << 8) duplicates the byte into the low halfword
    add     r1, r1, r1, lsl #8
    add     r2, r2, r2, lsl #8
    add     r3, r3, r3, lsl #8
    add     ip, ip, ip, lsl #8
    // y + (y << 16) duplicates the halfword into the full word, then store
    add     r1, r1, r1, lsl #16
    str     r1, [r0, #0*FDEC_STRIDE]
    add     r2, r2, r2, lsl #16
    str     r2, [r0, #1*FDEC_STRIDE]
    add     r3, r3, r3, lsl #16
    str     r3, [r0, #2*FDEC_STRIDE]
    add     ip, ip, ip, lsl #16
    str     ip, [r0, #3*FDEC_STRIDE]
    bx      lr
 endfunc
 // Vertical 4x4 fill: the 32-bit word located one row (FDEC_STRIDE bytes)
 // before r0 is copied to the start of each of the four rows at r0.
 //   r0 - address of row 0, column 0; clobbers r1
 function predict_4x4_v_armv6
    ldr     r1, [r0, #-FDEC_STRIDE]
    str     r1, [r0, #0*FDEC_STRIDE]
    str     r1, [r0, #1*FDEC_STRIDE]
    str     r1, [r0, #2*FDEC_STRIDE]
    str     r1, [r0, #3*FDEC_STRIDE]
    bx      lr
 endfunc
 // DC 4x4 fill: averages the four bytes of the row before r0 and the four
 // bytes at column -1 of rows 0-3, as (sum + 4) >> 3, and writes that value to
 // every byte of the 4x4 block.
 //   r0 - address of row 0, column 0; clobbers r1-r3 and ip
 function predict_4x4_dc_armv6
    mov     ip, #0
    ldr     r1, [r0, #-FDEC_STRIDE]
    ldrb    r2, [r0, #0*FDEC_STRIDE-1]
    ldrb    r3, [r0, #1*FDEC_STRIDE-1]
    // usad8 against zero sums the four bytes of the word from the row above
    usad8   r1, r1, ip
    // +4 is the rounding term for the final >> 3
    add     r2, r2, #4
    ldrb    ip, [r0, #2*FDEC_STRIDE-1]
    add     r2, r2, r3
    ldrb    r3, [r0, #3*FDEC_STRIDE-1]
    add     r2, r2, ip
    add     r2, r2, r3
    add     r1, r1, r2
    lsr     r1, r1, #3
    // replicate the result byte across the word
    add     r1, r1, r1, lsl #8
    add     r1, r1, r1, lsl #16
    str     r1, [r0, #0*FDEC_STRIDE]
    str     r1, [r0, #1*FDEC_STRIDE]
    str     r1, [r0, #2*FDEC_STRIDE]
    str     r1, [r0, #3*FDEC_STRIDE]
    bx      lr
 endfunc
 // DC-top 4x4 fill: rounded average, (sum + 2) >> 2, of the four bytes in the
 // row before r0, written to every byte of the 4x4 block.
 //   r0 - address of row 0, column 0 (advanced by four rows on exit)
 //   clobbers r1, r12, d1
 function predict_4x4_dc_top_neon
    mov         r12, #FDEC_STRIDE
    sub         r1, r0, #FDEC_STRIDE
    // load the 4-byte word above, duplicated into both halves of d1
    vld1.32     d1[], [r1,:32]
    // two pairwise adds reduce the four bytes to their sum in 16-bit lanes
    vpaddl.u8   d1, d1
    vpadd.u16   d1, d1, d1
    vrshr.u16   d1, d1, #2
    vdup.8      d1, d1[0]
    vst1.32     d1[0], [r0,:32], r12
    vst1.32     d1[0], [r0,:32], r12
    vst1.32     d1[0], [r0,:32], r12
    vst1.32     d1[0], [r0,:32], r12
    bx          lr
 endfunc
 // Per-byte 3-tap low-pass filter on two sets of packed words at once:
 // return a1 = (a1+2*b1+c1+2)>>2  a2 = (a2+2*b2+c2+2)>>2
 // Implemented as x = (a+c)>>1, y = (x+b)>>1, result = y + ((x^b) & 1); the
 // eor/and pair recovers the rounding bit that the halving add discards.
 //   a1, a2 - inputs, overwritten with the results
 //   b1, b2 - centre taps, preserved
 //   c1, c2 - inputs, clobbered (used as temporaries)
 //   pb_1   - register holding the per-byte mask 1; the call site in this
 //            file passes 0x01010101
 .macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
    uhadd8  \a1, \a1, \c1
    uhadd8  \a2, \a2, \c2
    uhadd8  \c1, \a1, \b1
    uhadd8  \c2, \a2, \b2
    eor     \a1, \a1, \b1
    eor     \a2, \a2, \b2
    and     \a1, \a1, \pb_1
    and     \a2, \a2, \pb_1
    uadd8   \a1, \a1, \c1
    uadd8   \a2, \a2, \c2
 .endm
 // Diagonal-down-right 4x4 fill. The bytes read are the four bytes of the row
 // before r0, the byte before those, and the bytes at column -1 of rows 0-3.
 // They are packed into six overlapping 4-byte windows (each one shifted a
 // byte further along the edge), low-pass filtered with PRED4x4_LOWPASS, and
 // the rows are assembled so each row is the previous one shifted by one byte
 // with a new filtered value entering at byte 0.
 //   r0 - address of row 0, column 0; clobbers r1-r3 and ip, saves r4-r6
 // The byte packing below relies on little-endian word layout.
 function predict_4x4_ddr_armv6
    ldr     r1, [r0, # -FDEC_STRIDE]
    ldrb    r2, [r0, # -FDEC_STRIDE-1]
    ldrb    r3, [r0, #0*FDEC_STRIDE-1]
    push    {r4-r6,lr}
    // each window = new edge byte in byte 0 + previous window shifted up 8 bits
    add     r2, r2, r1, lsl #8
    ldrb    r4, [r0, #1*FDEC_STRIDE-1]
    add     r3, r3, r2, lsl #8
    ldrb    r5, [r0, #2*FDEC_STRIDE-1]
    ldrb    r6, [r0, #3*FDEC_STRIDE-1]
    add     r4, r4, r3, lsl #8
    add     r5, r5, r4, lsl #8
    add     r6, r6, r5, lsl #8
    ldr     ip, =0x01010101
    // r1 = filtered row 0; r4 = filtered values that enter rows 1-3
    PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip
    str     r1, [r0, #0*FDEC_STRIDE]
    lsl     r2, r1, #8
    lsl     r3, r1, #16
    // drop the top byte of r4 so the lsr below exposes only its bytes 0-2
    lsl     r4, r4, #8
    lsl     r5, r1, #24
    add     r2, r2, r4, lsr #24
    str     r2, [r0, #1*FDEC_STRIDE]
    add     r3, r3, r4, lsr #16
    str     r3, [r0, #2*FDEC_STRIDE]
    add     r5, r5, r4, lsr #8
    str     r5, [r0, #3*FDEC_STRIDE]
    pop     {r4-r6,pc}
 endfunc
 // Diagonal-down-left 4x4 fill. Loads eight bytes from the row before r0,
 // applies the 3-tap filter (a + 2*b + c + 2) >> 2 along them (the last byte
 // is repeated as right-hand padding), then writes four rows where row k is
 // the filtered sequence starting at byte k.
 //   r0 - address of row 0, column 0 (advanced past the block on exit)
 //   clobbers ip, d0-d3
 function predict_4x4_ddl_neon
    sub         r0, #FDEC_STRIDE
    mov         ip, #FDEC_STRIDE
    // the post-increment leaves r0 back at row 0 for the stores
    vld1.64     {d0}, [r0], ip
    vdup.8      d3, d0[7]
    vext.8      d1, d0, d0, #1
    vext.8      d2, d0, d3, #2
    // truncating half-add of the outer taps, then rounding half-add with the
    // centre tap
    vhadd.u8    d0, d0, d2
    vrhadd.u8   d0, d0, d1
    vst1.32     {d0[0]}, [r0,:32], ip
    vext.8      d1, d0, d0, #1
    vext.8      d2, d0, d0, #2
    vst1.32     {d1[0]}, [r0,:32], ip
    vext.8      d3, d0, d0, #3
    vst1.32     {d2[0]}, [r0,:32], ip
    vst1.32     {d3[0]}, [r0,:32], ip
    bx          lr
 endfunc
 // DC 8x8 fill: sums sixteen bytes of the buffer at r1 (offsets 7-14 and
 // 16-23), computes (sum + 8) >> 4 and writes that byte to all 64 positions of
 // the 8x8 block at r0.
 //   r0 - destination, rows FDEC_STRIDE apart (advanced by 8 rows on exit)
 //   r1 - edge buffer; presumably offsets 7-14 hold the left column and
 //        16-23 the top row - confirm against the C caller
 // The lsl #8 trick below assumes little-endian word layout.
 function predict_8x8_dc_neon
    mov     ip, #0
    ldrd    r2, r3, [r1, #8]
    push    {r4-r5,lr}
    ldrd    r4, r5, [r1, #16]
    // discard the top byte of the word at offset 12 (i.e. byte 15) ...
    lsl     r3, r3, #8
    // ... and pick up byte 7 separately, so offsets 7-14 are the ones summed
    ldrb    lr, [r1, #7]
    usad8   r2, r2, ip
    usad8   r3, r3, ip
    usada8  r2, r4, ip, r2
    // +8 is the rounding term for the final >> 4
    add     lr, lr, #8
    usada8  r3, r5, ip, r3
    add     r2, r2, lr
    mov     ip, #FDEC_STRIDE
    add     r2, r2, r3
    lsr     r2, r2, #4
    vdup.8  d0, r2
 .rept 8
    vst1.64 {d0}, [r0,:64], ip
 .endr
    pop     {r4-r5,pc}
 endfunc
 // Horizontal 8x8 fill: loads the eight bytes at offsets 7-14 of the buffer at
 // r1 and fills row k of the block at r0 with byte (14 - k), i.e. the bytes
 // are consumed from the highest offset down.
 //   r0 - destination, rows FDEC_STRIDE apart (advanced by 8 rows on exit)
 //   r1 - edge buffer (advanced by 7); clobbers ip, d0-d7, d16
 // The vdup for the next row is interleaved with the store of the previous.
 function predict_8x8_h_neon
    add         r1, r1, #7
    mov         ip, #FDEC_STRIDE
    vld1.64     {d16}, [r1]
    vdup.8      d0, d16[7]
    vdup.8      d1, d16[6]
    vst1.64     {d0}, [r0,:64], ip
    vdup.8      d2, d16[5]
    vst1.64     {d1}, [r0,:64], ip
    vdup.8      d3, d16[4]
    vst1.64     {d2}, [r0,:64], ip
    vdup.8      d4, d16[3]
    vst1.64     {d3}, [r0,:64], ip
    vdup.8      d5, d16[2]
    vst1.64     {d4}, [r0,:64], ip
    vdup.8      d6, d16[1]
    vst1.64     {d5}, [r0,:64], ip
    vdup.8      d7, d16[0]
    vst1.64     {d6}, [r0,:64], ip
    vst1.64     {d7}, [r0,:64], ip
    bx          lr
 endfunc
 // Vertical 8x8 fill: the eight bytes at offset 16 of the buffer at r1 are
 // copied into each of the eight rows of the block at r0.
 //   r0 - destination, rows FDEC_STRIDE apart (advanced by 8 rows on exit)
 //   r1 - edge buffer (advanced by 16); clobbers ip (r12), d0
 function predict_8x8_v_neon
    add         r1, r1, #16
    mov         ip, #FDEC_STRIDE
    vld1.8      {d0}, [r1,:64]
 .rept 4
    vst1.8      {d0}, [r0,:64], ip
    vst1.8      {d0}, [r0,:64], ip
 .endr
    bx          lr
 endfunc
 // Diagonal-down-left 8x8 fill. Loads the 16 bytes at offset 16 of the buffer
 // at r1, applies the 3-tap filter (a + 2*b + c + 2) >> 2 along them, and
 // stores eight rows where row k holds filtered bytes k+1 .. k+8.
 //   r0 - destination, rows FDEC_STRIDE apart (advanced by 8 rows on exit)
 //   r1 - edge buffer (advanced by 16); clobbers r12, q0-q3, q8
 function predict_8x8_ddl_neon
    add         r1, #16
    vld1.8      {d0, d1}, [r1,:128]
    vmov.i8     q3, #0
    // byte 0 of d2 becomes the last source byte: right-hand padding for q2
    vrev64.8    d2, d1
    // q8 = source moved up one lane (lane 0 zero; filtered lane 0 is never
    // stored), q2 = source moved down one lane
    vext.8      q8, q3, q0, #15
    vext.8      q2, q0, q1, #1
    vhadd.u8    q8, q2
    mov         r12, #FDEC_STRIDE
    vrhadd.u8   q0, q8
    // the extraction for the next row is interleaved with each store
    vext.8      d2, d0, d1, #1
    vext.8      d3, d0, d1, #2
    vst1.8      d2, [r0,:64], r12
    vext.8      d2, d0, d1, #3
    vst1.8      d3, [r0,:64], r12
    vext.8      d3, d0, d1, #4
    vst1.8      d2, [r0,:64], r12
    vext.8      d2, d0, d1, #5
    vst1.8      d3, [r0,:64], r12
    vext.8      d3, d0, d1, #6
    vst1.8      d2, [r0,:64], r12
    vext.8      d2, d0, d1, #7
    vst1.8      d3, [r0,:64], r12
    vst1.8      d2, [r0,:64], r12
    vst1.8      d1, [r0,:64], r12
    bx          lr
 endfunc
 // Diagonal-down-right 8x8 fill. Loads 32 bytes from the buffer at r1 and
 // applies the 3-tap filter (a + 2*b + c + 2) >> 2 to produce filtered bytes
 // for offsets 8-23 (d0:d1). Rows are written bottom-up: row 7 gets filtered
 // bytes 8-15 and each row above starts one byte later.
 //   r0 - destination, rows FDEC_STRIDE apart (ends one row above the block)
 //   r1 - edge buffer, read with a 128-bit alignment hint
 //   clobbers r12, q0-q3
 function predict_8x8_ddr_neon
    vld1.8      {d0-d3}, [r1,:128]
    // neighbours at -1 (bytes 7-22) and +1 (bytes 9-24) of bytes 8-23
    vext.8      q2, q0, q1, #7
    vext.8      q3, q0, q1, #9
    vhadd.u8    q2, q2, q3
    vrhadd.u8   d0, d1, d4
    vrhadd.u8   d1, d2, d5
    // start at the last row and step backwards
    add         r0, #7*FDEC_STRIDE
    mov         r12, #-1*FDEC_STRIDE
    vext.8      d2, d0, d1, #1
    vst1.8      {d0}, [r0,:64], r12
    vext.8      d4, d0, d1, #2
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d5, d0, d1, #3
    vst1.8      {d4}, [r0,:64], r12
    vext.8      d4, d0, d1, #4
    vst1.8      {d5}, [r0,:64], r12
    vext.8      d5, d0, d1, #5
    vst1.8      {d4}, [r0,:64], r12
    vext.8      d4, d0, d1, #6
    vst1.8      {d5}, [r0,:64], r12
    vext.8      d5, d0, d1, #7
    vst1.8      {d4}, [r0,:64], r12
    vst1.8      {d5}, [r0,:64], r12
    bx          lr
 endfunc
 // Vertical-left 8x8 fill. Loads the 16 bytes at offset 16 of the buffer at
 // r1 and builds two sequences: q3 = rounded 2-tap average of each byte with
 // its successor, q0 = 3-tap filter (a + 2*b + c + 2) >> 2. Even rows take
 // 8-byte windows from q3 and odd rows from q0; the window start advances by
 // one byte every two rows (odd rows start one byte later than the even row
 // above them).
 //   r0 - destination, rows FDEC_STRIDE apart (advanced by 8 rows on exit)
 //   r1 - edge buffer (advanced by 16); clobbers r12, q0-q3
 function predict_8x8_vl_neon
    add         r1, #16
    mov         r12, #FDEC_STRIDE
    vld1.8      {d0, d1}, [r1,:128]
    // q1 and q2 are not initialised here: the stale bytes shifted in only
    // reach lane 0 of q0 and lane 15 of q0/q3, which no row window below reads
    vext.8      q1, q1, q0, #15
    vext.8      q2, q0, q2, #1
    vrhadd.u8   q3, q0, q2
    vhadd.u8    q1, q1, q2
    vrhadd.u8   q0, q0, q1
    vext.8      d2, d0, d1, #1
    vst1.8      {d6}, [r0,:64], r12
    vext.8      d3, d6, d7, #1
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #2
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d3, d6, d7, #2
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #3
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d3, d6, d7, #3
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #4
    vst1.8      {d3}, [r0,:64], r12
    vst1.8      {d2}, [r0,:64], r12
    bx          lr
 endfunc
 // Vertical-right 8x8 fill. Loads the 16 bytes at offset 8 of the buffer at
 // r1 and builds a rounded 2-tap average (q2) and a 3-tap filtered sequence
 // (q0) against the preceding bytes. Row 0 is the upper half of the 2-tap
 // result and row 1 the upper half of the 3-tap result; every following pair
 // of rows is the previous pair shifted by one byte, with the new leading
 // bytes taken from the de-interleaved lower half of the 3-tap result.
 //   r0 - destination, rows FDEC_STRIDE apart (advanced by 8 rows on exit)
 //   r1 - edge buffer (advanced by 8); clobbers r12, q0-q3
 function predict_8x8_vr_neon
    add         r1, #8
    mov         r12, #FDEC_STRIDE
    vld1.8      {d4,d5}, [r1,:64]
    // rotations by 14 and 15 give the neighbours two and one byte earlier
    vext.8      q1, q2, q2, #14
    vext.8      q0, q2, q2, #15
    vhadd.u8    q3, q2, q1
    vrhadd.u8   q2, q2, q0
    vrhadd.u8   q0, q0, q3
    vmov        d2, d0
    vst1.8      {d5}, [r0,:64], r12
    // split the low 3-tap half: d2 = even-indexed bytes, d0 = odd-indexed
    vuzp.8      d2, d0
    vst1.8      {d1}, [r0,:64], r12
    vext.8      d6, d0, d5, #7
    vext.8      d3, d2, d1, #7
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d6, d0, d5, #6
    vext.8      d3, d2, d1, #6
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d6, d0, d5, #5
    vext.8      d3, d2, d1, #5
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d3}, [r0,:64], r12
    bx          lr
 endfunc
 // Horizontal-down 8x8 fill. Loads the 16 bytes at offset 7 of the buffer at
 // r1, builds a rounded 2-tap average (q8) and a 3-tap filtered sequence (q0)
 // with the following bytes, interleaves their low halves with vzip, and
 // writes eight rows that are 8-byte windows over d16:d0:d1, moving two bytes
 // towards the start for each successive row (row 7 is d16 itself).
 //   r0 - destination, rows FDEC_STRIDE apart (advanced by 8 rows on exit)
 //   r1 - edge buffer (advanced by 7, unaligned load)
 //   clobbers r12, q0-q3, q8
 function predict_8x8_hd_neon
    mov         r12, #FDEC_STRIDE
    add         r1, #7
    vld1.8      {d2,d3}, [r1]
    // rotations by 1 and 2 give the neighbours one and two bytes later
    vext.8      q3, q1, q1, #1
    vext.8      q2, q1, q1, #2
    vrhadd.u8   q8, q1, q3
    vhadd.u8    q1, q2
    vrhadd.u8   q0, q1, q3
    vzip.8      d16, d0
    vext.8      d2, d0, d1, #6
    vext.8      d3, d0, d1, #4
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #2
    vst1.8      {d3}, [r0,:64], r12
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d16, d0, #6
    vst1.8      {d0}, [r0,:64], r12
    vext.8      d3, d16, d0, #4
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d16, d0, #2
    vst1.8      {d3}, [r0,:64], r12
    vst1.8      {d2}, [r0,:64], r12
    vst1.8      {d16}, [r0,:64], r12
    bx          lr
 endfunc
 // Horizontal-up 8x8 fill. Loads the eight bytes at offset 7 of the buffer at
 // r1, reverses them, and builds interleaved 2-tap / 3-tap filtered values
 // (the byte originally at offset 7 is repeated as padding). Rows are 8-byte
 // windows over that interleaved sequence, advancing two bytes per row; the
 // tail is padded by repeating the last 16-bit pair.
 //   r0 - destination, rows FDEC_STRIDE apart (left at row 7 on exit)
 //   r1 - edge buffer (advanced by 7, unaligned load)
 //   clobbers r12, q0-q3, q8
 function predict_8x8_hu_neon
    mov         r12, #FDEC_STRIDE
    add         r1, #7
    vld1.8      {d7}, [r1]
    vdup.8      d6, d7[0]
    vrev64.8    d7, d7
    // neighbours two and one positions further along the reversed sequence
    vext.8      d4, d7, d6, #2
    vext.8      d2, d7, d6, #1
    vhadd.u8    d16, d7, d4
    vrhadd.u8   d0, d2, d7
    vrhadd.u8   d1, d16, d2
    // interleave 2-tap (d0) and 3-tap (d1) results into q0
    vzip.8      d0, d1
    vdup.16     q1, d1[3]
    vext.8      q2, q0, q1, #2
    vext.8      q3, q0, q1, #4
    vext.8      q8, q0, q1, #6
    vst1.8      {d0}, [r0,:64], r12
    vst1.8      {d4}, [r0,:64], r12
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d16}, [r0,:64], r12
    vst1.8      {d1}, [r0,:64], r12
    vst1.8      {d5}, [r0,:64], r12
    vst1.8      {d7}, [r0,:64], r12
    vst1.8      {d17}, [r0,:64]
    bx          lr
 endfunc
 // Chroma 8x8 DC-top fill: computes the rounded average, (sum + 2) >> 2, of
 // bytes 0-3 and of bytes 4-7 of the row before r0. Every row of the block
 // gets the first average in columns 0-3 and the second in columns 4-7.
 // Tail-branches to pred8x8_dc_end (inside predict_8x8c_dc_neon) for the
 // stores, which expects r1 = row stride, d0 = rows 0-3, d1 = rows 4-7.
 //   r0 - address of row 0, column 0; clobbers r1, r2, d0, d1
 function predict_8x8c_dc_top_neon
    sub         r2,  r0,  #FDEC_STRIDE
    mov         r1,  #FDEC_STRIDE
    vld1.8      {d0}, [r2,:64]
    vpaddl.u8   d0,  d0
    vpadd.u16   d0,  d0,  d0
    // only the bytes narrowed from d0 are used; the d1 half of q0 is ignored
    vrshrn.u16  d0,  q0,  #2
    vdup.8      d1,  d0[1]
    vdup.8      d0,  d0[0]
    // after the transpose both d0 and d1 hold [avg0 x4, avg1 x4]
    vtrn.32     d0,  d1
    b           pred8x8_dc_end
 endfunc
 // Chroma 8x8 DC-left fill: computes the rounded average, (sum + 2) >> 2, of
 // the column -1 bytes of rows 0-3 and of rows 4-7. Rows 0-3 are filled with
 // the first average and rows 4-7 with the second.
 // Tail-branches to pred8x8_dc_end (inside predict_8x8c_dc_neon) for the
 // stores, which expects r1 = row stride, d0 = rows 0-3, d1 = rows 4-7.
 //   r0 - address of row 0, column 0; clobbers r1, r2, d0, d1
 function predict_8x8c_dc_left_neon
    mov         r1,  #FDEC_STRIDE
    sub         r2,  r0,  #1
    ldcol.8     d0,  r2,  r1
    vpaddl.u8   d0,  d0
    vpadd.u16   d0,  d0,  d0
    // only the bytes narrowed from d0 are used; the d1 half of q0 is ignored
    vrshrn.u16  d0,  q0,  #2
    vdup.8      d1,  d0[1]
    vdup.8      d0,  d0[0]
    b           pred8x8_dc_end
 endfunc
 // Chroma 8x8 DC fill using both the row before r0 ("top") and column -1
 // ("left"). Each 4x4 quadrant gets its own value:
 //   top-left     = (top[0-3] + left[0-3] + 4) >> 3
 //   top-right    = (top[4-7] + 2) >> 2
 //   bottom-left  = (left[4-7] + 2) >> 2
 //   bottom-right = (top[4-7] + left[4-7] + 4) >> 3
 // (bracketed ranges denote the sum of those four bytes).
 // The label pred8x8_dc_end is also the shared store tail for the dc_top and
 // dc_left variants: it writes d0 to rows 0-3 and d1 to rows 4-7 with
 // r1 = row stride.
 //   r0 - address of row 0, column 0; clobbers r1, r2, q0-q2
 function predict_8x8c_dc_neon
    sub         r2,  r0,  #FDEC_STRIDE
    mov         r1,  #FDEC_STRIDE
    vld1.8      {d0}, [r2,:64]
    sub         r2,  r0,  #1
    ldcol.8     d1,  r2,  r1
    // d0 = [top 0-3, left 0-3], d1 = [top 4-7, left 4-7]
    vtrn.32     d0,  d1
    vpaddl.u8   q0,  q0
    // d0 = four 4-byte sums, d1 = the two combined (top + left) sums
    vpadd.u16   d0,  d0,  d1
    vpadd.u16   d1,  d0,  d0
    vrshrn.u16  d2,  q0,  #3
    vrshrn.u16  d3,  q0,  #2
    vdup.8      d0,  d2[4]
    vdup.8      d1,  d3[3]
    vdup.8      d4,  d3[2]
    vdup.8      d5,  d2[5]
    // d0 = [top-left, top-right], d1 = [bottom-left, bottom-right]
    vtrn.32     q0,  q2
 pred8x8_dc_end:
    add         r2,  r0,  r1,  lsl #2
 .rept 4
    vst1.8      {d0}, [r0,:64], r1
    vst1.8      {d1}, [r2,:64], r1
 .endr
    bx          lr
 endfunc
 // Chroma 8x8 horizontal fill: each of the eight rows is filled with the byte
 // found at column -1 of that row. Two rows are handled per .rept iteration
 // (two replicating loads, then two stores).
 //   r0 - address of row 0, column 0 (advanced by 8 rows on exit)
 //   clobbers r1, ip, d0, d2
 function predict_8x8c_h_neon
    sub         r1, r0, #1
    mov         ip, #FDEC_STRIDE
 .rept 4
    vld1.8      {d0[]}, [r1], ip
    vld1.8      {d2[]}, [r1], ip
    vst1.64     {d0}, [r0,:64], ip
    vst1.64     {d2}, [r0,:64], ip
 .endr
    bx          lr
 endfunc
 // Chroma 8x8 vertical fill: the eight bytes of the row before r0 are copied
 // into each of the eight rows of the block.
 //   r0 - address of row 0, column 0 (advanced by 8 rows on exit)
 //   clobbers r12 (ip), d0
 function predict_8x8c_v_neon
    sub         r0, r0, #FDEC_STRIDE
    mov         r12, #FDEC_STRIDE
    // the post-increment brings r0 back to row 0 for the stores
    vld1.64     {d0}, [r0,:64], r12
 .rept 2
    vst1.64     {d0}, [r0,:64], r12
    vst1.64     {d0}, [r0,:64], r12
    vst1.64     {d0}, [r0,:64], r12
    vst1.64     {d0}, [r0,:64], r12
 .endr
    bx          lr
 endfunc
 // Chroma 8x8 plane fill. With "top" = the row before r0 and "left" =
 // column -1 (index -1 is the corner byte):
 //   H = sum_{i=1..4} i * (top[3+i]  - top[3-i])
 //   V = sum_{i=1..4} i * (left[3+i] - left[3-i])
 //   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5
 //   i00 = 16*(top[7] + left[7] + 1) - 3*(b + c)
 // and pixel (x, y) = saturate_u8((i00 + b*x + c*y) >> 5).
 //   r0 - address of row 0, column 0 (advanced by 8 rows on exit)
 //   clobbers r1-r3, q0-q3, q8
 function predict_8x8c_p_neon
    sub         r3,  r0,  #FDEC_STRIDE
    mov         r1,  #FDEC_STRIDE
    add         r2,  r3,  #4
    sub         r3,  r3,  #1
    // d0 low word = top[-1..2], d2 low word = top[4..7]
    vld1.32     {d0[0]}, [r3]
    vld1.32     {d2[0]}, [r2,:32], r1
    // d0 high word = left[-1..2]; skip row 3; d3 low word = left[4..7]
    ldcol.8     d0,  r3,  r1,  4,  hi=1
    add         r3,  r3,  r1
    ldcol.8     d3,  r3,  r1,  4
    // q8 lanes 0-3 = top[4..7] + left[4..7]; lane 3 is needed for i00
    vaddl.u8    q8,  d2,  d3
    vrev32.8    d0,  d0
    vtrn.32     d2,  d3
    // d4 = top differences, d5 = left differences
    vsubl.u8    q2,  d2,  d0
    movrel      r3,  p16weight
    vld1.16     {q0}, [r3,:128]
    vmul.s16    d4,  d4,  d0
    vmul.s16    d5,  d5,  d0
    vpadd.i16   d4,  d4,  d5
    // d4 = [H, V] as 32-bit lanes
    vpaddl.s16  d4,  d4
    // x + (x << 4) = 17*x
    vshl.i32    d5,  d4,  #4
    vadd.s32    d4,  d4,  d5
    vrshrn.s32  d4,  q2,  #5
    mov         r3,  #0
    // d4[0] = b, d5[0] = c
    vtrn.16     d4,  d5
    vadd.i16    d2,  d4,  d5
    // 4*(b+c) - (b+c) = 3*(b+c), with the lane reversal of d16 interleaved
    vshl.i16    d3,  d2,  #2
    vrev64.16   d16, d16
    vsub.i16    d3,  d3,  d2
    // lane 0 of d0 is the weight 1, giving top[7] + left[7] + 1 in d16[0]
    vadd.i16    d16, d16, d0
    vshl.i16    d2,  d16, #4
    // d2[0] = i00
    vsub.i16    d2,  d2,  d3
    // turn the weights 1..8 into column indices 0..7, then scale by b
    vext.16     q0,  q0,  q0,  #7
    vmov.16     d0[0], r3
    vmul.i16    q0,  q0,  d4[0]
    vdup.16     q1,  d2[0]
    vdup.16     q3,  d5[0]
    vadd.i16    q1,  q1,  q0
    mov         r3,  #8
 1:
    // emit one row, then add c to step to the next row
    vqshrun.s16 d0,  q1,  #5
    vadd.i16    q1,  q1,  q3
    vst1.8      {d0}, [r0,:64], r1
    subs        r3,  r3,  #1
    bne         1b
    bx          lr
 endfunc
 // Chroma 8x16 DC-top fill: computes the rounded average, (sum + 2) >> 2, of
 // bytes 0-3 and of bytes 4-7 of the row before r0. All sixteen rows get the
 // first average in columns 0-3 and the second in columns 4-7.
 //   r0 - address of row 0, column 0; clobbers r1, r2, d0, d1
 function predict_8x16c_dc_top_neon
    sub         r2,  r0,  #FDEC_STRIDE
    mov         r1,  #FDEC_STRIDE
    vld1.8      {d0}, [r2,:64]
    vpaddl.u8   d0,  d0
    vpadd.u16   d0,  d0,  d0
    // only the bytes narrowed from d0 are used; the d1 half of q0 is ignored
    vrshrn.u16  d0,  q0,  #2
    vdup.8      d1,  d0[1]
    vdup.8      d0,  d0[0]
    // after the transpose both d0 and d1 hold [avg0 x4, avg1 x4]
    vtrn.32     d0,  d1
    // r0 walks rows 0-3 while r2 walks rows 4-7
    add         r2,  r0,  r1,  lsl #2
 .rept 4
    vst1.8      {d0}, [r0,:64], r1
    vst1.8      {d1}, [r2,:64], r1
 .endr
    // skip four rows each: r0 now walks rows 8-11 and r2 rows 12-15
    add         r2,  r2,  r1,  lsl #2
    add         r0,  r0,  r1,  lsl #2
 .rept 4
    vst1.8      {d0}, [r0,:64], r1
    vst1.8      {d1}, [r2,:64], r1
 .endr
    bx          lr
 endfunc
 // Chroma 8x16 horizontal fill: each of the sixteen rows is filled with the
 // byte found at column -1 of that row. Two rows are handled per .rept
 // iteration (two replicating loads, then two stores).
 //   r0 - address of row 0, column 0 (advanced by 16 rows on exit)
 //   clobbers r1, ip, d0, d2
 function predict_8x16c_h_neon
    sub         r1, r0, #1
    mov         ip, #FDEC_STRIDE
 .rept 8
    vld1.8      {d0[]}, [r1], ip
    vld1.8      {d2[]}, [r1], ip
    vst1.64     {d0}, [r0,:64], ip
    vst1.64     {d2}, [r0,:64], ip
 .endr
    bx          lr
 endfunc
 // Chroma 8x16 plane fill. With "top" = the row before r0 and "left" =
 // column -1 (index -1 is the corner byte):
 //   H = sum_{i=1..4} i * (top[3+i]  - top[3-i])
 //   V = sum_{i=1..8} i * (left[7+i] - left[7-i])
 //   b = (17*H + 16) >> 5,  c = (5*V + 32) >> 6
 //   i00 = 16*(top[7] + left[15] + 1) - 3*b - 7*c
 // and pixel (x, y) = saturate_u8((i00 + b*x + c*y) >> 5).
 //   r0 - address of row 0, column 0 (advanced by 16 rows on exit)
 //   clobbers r1-r3, q0-q3, q8
 function predict_8x16c_p_neon
    sub         r3,  r0,  #FDEC_STRIDE
    mov         r1,  #FDEC_STRIDE
    add         r2,  r3,  #4
    sub         r3,  r3,  #1
    // d0 low word = top[-1..2], d2 low word = top[4..7]
    vld1.32     {d0[0]}, [r3]
    vld1.32     {d2[0]}, [r2,:32], r1
    // d1 = left[-1..6]; skip row 7; d3 = left[8..15]
    ldcol.8     d1,  r3,  r1
    add         r3,  r3,  r1
    ldcol.8     d3,  r3,  r1
    vrev64.32   d16, d3
    // q8 lanes 0-3 = top[4..7] + left[12..15]; lane 3 is needed for i00
    vaddl.u8    q8,  d2,  d16
    vrev32.8    d0,  d0
    vsubl.u8    q2,  d2,  d0
    vrev64.8    d1,  d1
    vsubl.u8    q3,  d3,  d1
    movrel      r3,  p16weight
    vld1.16     {q0}, [r3,:128]
    vmul.s16    d4,  d4,  d0
    vmul.s16    q3,  q3,  q0
    vpadd.i16   d4,  d4,  d5
    vpadd.i16   d6,  d6,  d7
    vpaddl.s16  d4,  d4        @ d4[0] = H
    vpaddl.s16  d6,  d6
    vpadd.s32   d6,  d6        @ d6[0] = V
    vshl.i32    d5,  d4,  #4
    vadd.s32    d4,  d4,  d5   @ d4[0] = 17*H
    vshl.i32    d7,  d6,  #2
    vrshrn.s32  d4,  q2,  #5   @ d4[0] = b
    vadd.s32    d6,  d6,  d7   @ d6[0] = 5*V
    vrshrn.s32  d6,  q3,  #6   @ d6[0] = c
    mov         r3,  #0
    vshl.i16    d3,  d4,  #2
    vsub.i16    d3,  d3,  d4   @ d3[0] = 3 * b
    vshl.i16    d2,  d6,  #3
    vadd.i16    d3,  d3,  d2   @ d3[0] = 3 * b + 8 * c
    vsub.i16    d3,  d3,  d6   @ d3[0] = 3 * b + 7 * c
    vrev64.16   d16, d16
    vadd.i16    d16, d16, d0   @ d16[0] = src[]+src[] + 1
    vshl.i16    d2,  d16, #4   @ d2[0] = a + 16
    vsub.i16    d2,  d2,  d3   @ i00
    // turn the weights 1..8 into column indices 0..7, then scale by b
    vext.16     q0,  q0,  q0,  #7
    vmov.16     d0[0], r3
    vmul.i16    q0,  q0,  d4[0]
    vdup.16     q1,  d2[0]
    vdup.16     q3,  d6[0]
    vadd.i16    q1,  q1,  q0
    mov         r3,  #16
 1:
    // emit one row, then add c to step to the next row
    vqshrun.s16 d0,  q1,  #5
    vadd.i16    q1,  q1,  q3
    vst1.8      {d0}, [r0,:64], r1
    subs        r3,  r3,  #1
    bne         1b
    bx          lr
 endfunc
 // 16x16 DC-top fill: rounded average, (sum + 8) >> 4, of the sixteen bytes in
 // the row before r0, written to every byte of the block.
 // Tail-branches to pred16x16_dc_end (inside predict_16x16_dc_neon), which
 // stores q0 to sixteen rows using r1 as the row stride.
 //   r0 - address of row 0, column 0; clobbers r1, r2, q0
 function predict_16x16_dc_top_neon
    sub         r2,  r0,  #FDEC_STRIDE
    mov         r1,  #FDEC_STRIDE
    vld1.8      {q0}, [r2,:128]
    add16x8     q0,  d0,  d1,  d0,  d1
    vrshrn.u16  d0,  q0,  #4
    vdup.8      q0,  d0[0]
    b           pred16x16_dc_end
 endfunc
 // 16x16 DC-left fill: rounded average, (sum + 8) >> 4, of the sixteen bytes
 // at column -1 of rows 0-15, written to every byte of the block.
 // Tail-branches to pred16x16_dc_end (inside predict_16x16_dc_neon), which
 // stores q0 to sixteen rows using r1 as the row stride.
 //   r0 - address of row 0, column 0; clobbers r1, r2, q0
 function predict_16x16_dc_left_neon
    mov         r1,  #FDEC_STRIDE
    sub         r2,  r0,  #1
    // two 8-row gathers: rows 0-7 into d0, rows 8-15 into d1
    ldcol.8     d0,  r2,  r1
    ldcol.8     d1,  r2,  r1
    add16x8     q0,  d0,  d1,  d0,  d1
    vrshrn.u16  d0,  q0,  #4
    vdup.8      q0,  d0[0]
    b           pred16x16_dc_end
 endfunc
 // 16x16 DC fill: rounded average, (sum + 16) >> 5, of the sixteen bytes in
 // the row before r0 and the sixteen bytes at column -1 of rows 0-15, written
 // to every byte of the block. The top row is summed with NEON while the left
 // column is summed in core registers (accumulator ip), with the two
 // instruction streams interleaved.
 // The label pred16x16_dc_end is also the shared store tail for the dc_top
 // and dc_left variants: it stores q0 to sixteen rows with r1 = row stride.
 //   r0 - address of row 0, column 0; clobbers r1-r3, ip, q0
 function predict_16x16_dc_neon
    sub         r3, r0, #FDEC_STRIDE
    sub         r0, r0, #1
    vld1.64     {d0-d1}, [r3,:128]
    ldrb        ip, [r0], #FDEC_STRIDE
    vaddl.u8    q0, d0, d1
    ldrb        r1, [r0], #FDEC_STRIDE
    vadd.u16    d0, d0, d1
    vpadd.u16   d0, d0, d0
    vpadd.u16   d0, d0, d0
    // 2 loads above + 4*3 here + 2 below = 16 left-column bytes
 .rept 4
    ldrb        r2, [r0], #FDEC_STRIDE
    add         ip, ip, r1
    ldrb        r3, [r0], #FDEC_STRIDE
    add         ip, ip, r2
    ldrb        r1, [r0], #FDEC_STRIDE
    add         ip, ip, r3
 .endr
    ldrb        r2, [r0], #FDEC_STRIDE
    add         ip, ip, r1
    ldrb        r3, [r0], #FDEC_STRIDE
    add         ip, ip, r2
    // rewind r0 by the 16 rows walked; the +1 below undoes the initial -1
    sub         r0, r0, #FDEC_STRIDE*16
    add         ip, ip, r3
    vdup.16     d1, ip
    vadd.u16    d0, d0, d1
    mov         r1, #FDEC_STRIDE
    add         r0, r0, #1
    vrshr.u16   d0, d0, #5
    vdup.8      q0, d0[0]
 pred16x16_dc_end:
 .rept 16
    vst1.64     {d0-d1}, [r0,:128], r1
 .endr
    bx          lr
 endfunc
 // 16x16 horizontal fill: each of the sixteen rows is filled with the byte
 // found at column -1 of that row. Two rows are handled per .rept iteration;
 // each replicating load fills one D register, which is then copied to its
 // partner to form the 16-byte row.
 //   r0 - address of row 0, column 0 (advanced by 16 rows on exit)
 //   clobbers r1, ip, q0, q1
 function predict_16x16_h_neon
    sub         r1, r0, #1
    mov         ip, #FDEC_STRIDE
 .rept 8
    vld1.8      {d0[]}, [r1], ip
    vmov        d1, d0
    vld1.8      {d2[]}, [r1], ip
    vmov        d3, d2
    vst1.64     {d0-d1}, [r0,:128], ip
    vst1.64     {d2-d3}, [r0,:128], ip
 .endr
    bx          lr
 endfunc
 // 16x16 vertical fill: the sixteen bytes of the row before r0 are copied into
 // each of the sixteen rows of the block.
 //   r0 - address of row 0, column 0 (advanced by 16 rows on exit)
 //   clobbers r12 (ip), q0
 function predict_16x16_v_neon
    sub         r0, r0, #FDEC_STRIDE
    mov         r12, #FDEC_STRIDE
    // the post-increment brings r0 back to row 0 for the stores
    vld1.64     {d0-d1}, [r0,:128], r12
 .rept 4
    vst1.64     {d0-d1}, [r0,:128], r12
    vst1.64     {d0-d1}, [r0,:128], r12
    vst1.64     {d0-d1}, [r0,:128], r12
    vst1.64     {d0-d1}, [r0,:128], r12
 .endr
    bx          lr
 endfunc
 // 16x16 plane fill. With "top" = the row before r0 and "left" = column -1
 // (index -1 is the corner byte):
 //   H = sum_{i=1..8} i * (top[7+i]  - top[7-i])
 //   V = sum_{i=1..8} i * (left[7+i] - left[7-i])
 //   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
 //   i00 = 16*(top[15] + left[15] + 1) - 7*(b + c)
 // and pixel (x, y) = saturate_u8((i00 + b*x + c*y) >> 5).
 //   r0 - address of row 0, column 0 (advanced by 16 rows on exit)
 //   clobbers r1-r3, q0-q3, q8
 function predict_16x16_p_neon
    sub         r3,  r0,  #FDEC_STRIDE
    mov         r1,  #FDEC_STRIDE
    add         r2,  r3,  #8
    sub         r3,  r3,  #1
    // d0 = top[-1..6], d2 = top[8..15]
    vld1.8      {d0}, [r3]
    vld1.8      {d2}, [r2,:64], r1
    // d1 = left[-1..6]; skip row 7; d3 = left[8..15]
    ldcol.8     d1,  r3,  r1
    add         r3,  r3,  r1
    ldcol.8     d3,  r3,  r1
    vrev64.8    q0,  q0
    // q8 lane 7 = top[15] + left[15], needed for i00
    vaddl.u8    q8,  d2,  d3
    vsubl.u8    q2,  d2,  d0
    vsubl.u8    q3,  d3,  d1
    movrel      r3,  p16weight
    vld1.8      {q0}, [r3,:128]
    vmul.s16    q2,  q2,  q0
    vmul.s16    q3,  q3,  q0
    vadd.i16    d4,  d4,  d5
    vadd.i16    d5,  d6,  d7
    vpadd.i16   d4,  d4,  d5
    // d4 = [H, V, H, V]
    vpadd.i16   d4,  d4,  d4
    // (x << 2) + x = 5*x, widened to 32 bits before the rounding shift
    vshll.s16   q3,  d4,  #2
    vaddw.s16   q2,  q3,  d4
    vrshrn.s32  d4,  q2,  #6
    mov         r3,  #0
    // d4[0] = b, d5[0] = c
    vtrn.16     d4,  d5
    vadd.i16    d2,  d4,  d5
    // 8*(b+c) - (b+c) = 7*(b+c), with the lane reversal of d17 interleaved
    vshl.i16    d3,  d2,  #3
    vrev64.16   d16, d17
    vsub.i16    d3,  d3,  d2
    // lane 0 of d0 is the weight 1, giving top[15] + left[15] + 1 in d16[0]
    vadd.i16    d16, d16, d0
    vshl.i16    d2,  d16, #4
    // d2[0] = i00
    vsub.i16    d2,  d2,  d3
    vshl.i16    d3,  d4,  #4
    // turn the weights 1..8 into column indices 0..7
    vext.16     q0,  q0,  q0,  #7
    // d6[0] = c - 16*b
    vsub.i16    d6,  d5,  d3
    vmov.16     d0[0], r3
    vmul.i16    q0,  q0,  d4[0]
    vdup.16     q1,  d2[0]
    vdup.16     q2,  d4[0]
    vdup.16     q3,  d6[0]
    // q2 = 8*b (step from columns 0-7 to 8-15)
    vshl.i16    q2,  q2,  #3
    vadd.i16    q1,  q1,  q0
    // q3 = c - 8*b (step back to column 0 of the next row)
    vadd.i16    q3,  q3,  q2
    mov         r3,  #16
 1:
    vqshrun.s16 d0,  q1,  #5
    vadd.i16    q1,  q1,  q2
    vqshrun.s16 d1,  q1,  #5
    vadd.i16    q1,  q1,  q3
    vst1.8      {q0}, [r0,:128], r1
    subs        r3,  r3,  #1
    bne         1b
    bx          lr
 endfunc
--- a/common/arm/predict-c.c
+++ b/common/arm/predict-c.c
@@ -0,0 +1,108 @@
 /*****************************************************************************
 * predict.c: arm intra prediction
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "common/common.h"
 #include "predict.h"
 #include "pixel.h"
 /* Install the ARM assembly 4x4 intra predictors that `cpu` supports.
  * Nothing is installed without ARMv6, or when HIGH_BIT_DEPTH is set;
  * entries of pf[] that have no assembly version are left untouched. */
 void x264_predict_4x4_init_arm( uint32_t cpu, x264_predict_t pf[12] )
 {
    if( cpu & X264_CPU_ARMV6 )
    {
 #if !HIGH_BIT_DEPTH
        /* Predictors that only need ARMv6. */
        pf[I_PRED_4x4_V]   = x264_predict_4x4_v_armv6;
        pf[I_PRED_4x4_H]   = x264_predict_4x4_h_armv6;
        pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_armv6;
        pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;
        /* Predictors that additionally need NEON. */
        if( cpu & X264_CPU_NEON )
        {
            pf[I_PRED_4x4_DDL]    = x264_predict_4x4_ddl_neon;
            pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon;
        }
 #endif // !HIGH_BIT_DEPTH
    }
 }
 /* Install the NEON 8x8 chroma intra predictors.
  * pf[] is left untouched without NEON or when HIGH_BIT_DEPTH is set. */
 void x264_predict_8x8c_init_arm( uint32_t cpu, x264_predict_t pf[7] )
 {
    if( cpu & X264_CPU_NEON )
    {
 #if !HIGH_BIT_DEPTH
        /* Directional and plane modes. */
        pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_neon;
        pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_neon;
        pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_neon;
        /* DC variants. */
        pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
        pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
        pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
 #endif // !HIGH_BIT_DEPTH
    }
 }
 /* Install the NEON 8x16 chroma intra predictors.
  * pf[] is left untouched without NEON or when HIGH_BIT_DEPTH is set. */
 void x264_predict_8x16c_init_arm( uint32_t cpu, x264_predict_t pf[7] )
 {
    if( cpu & X264_CPU_NEON )
    {
 #if !HIGH_BIT_DEPTH
        /* The other functions weren't faster than C (gcc 4.7.3) on Cortex A8 and A9. */
        pf[I_PRED_CHROMA_P]       = x264_predict_8x16c_p_neon;
        pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_neon;
        pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x16c_dc_top_neon;
 #endif // !HIGH_BIT_DEPTH
    }
 }
 /* Install the NEON 8x8 luma intra predictors.
  * pf[] is left untouched without NEON or when HIGH_BIT_DEPTH is set.
  * predict_filter is accepted but never read or written here. */
 void x264_predict_8x8_init_arm( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
 {
    if( cpu & X264_CPU_NEON )
    {
 #if !HIGH_BIT_DEPTH
        /* Vertical, horizontal and DC. */
        pf[I_PRED_8x8_V]   = x264_predict_8x8_v_neon;
        pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
        pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
        /* Diagonal modes. */
        pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon;
        pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
        pf[I_PRED_8x8_VR]  = x264_predict_8x8_vr_neon;
        pf[I_PRED_8x8_HD]  = x264_predict_8x8_hd_neon;
        pf[I_PRED_8x8_VL]  = x264_predict_8x8_vl_neon;
        pf[I_PRED_8x8_HU]  = x264_predict_8x8_hu_neon;
 #endif // !HIGH_BIT_DEPTH
    }
 }
 /* Install the NEON 16x16 luma intra predictors.
  * pf[] is left untouched without NEON or when HIGH_BIT_DEPTH is set. */
 void x264_predict_16x16_init_arm( uint32_t cpu, x264_predict_t pf[7] )
 {
    if( cpu & X264_CPU_NEON )
    {
 #if !HIGH_BIT_DEPTH
        /* Directional and plane modes. */
        pf[I_PRED_16x16_V ]     = x264_predict_16x16_v_neon;
        pf[I_PRED_16x16_H ]     = x264_predict_16x16_h_neon;
        pf[I_PRED_16x16_P ]     = x264_predict_16x16_p_neon;
        /* DC variants. */
        pf[I_PRED_16x16_DC ]    = x264_predict_16x16_dc_neon;
        pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_neon;
        pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_neon;
 #endif // !HIGH_BIT_DEPTH
    }
 }
--- a/common/arm/predict.h
+++ b/common/arm/predict.h
@@ -0,0 +1,105 @@
 /*****************************************************************************
 * predict.h: arm intra prediction
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_ARM_PREDICT_H
 #define X264_ARM_PREDICT_H
 /* Every symbol is routed through x264_template() before it is declared.
  * All predictors operate in place on the block addressed by src. */
 /* 4x4 predictors (ARMv6 and NEON variants). */
 #define x264_predict_4x4_dc_armv6 x264_template(predict_4x4_dc_armv6)
 void x264_predict_4x4_dc_armv6( uint8_t *src );
 #define x264_predict_4x4_dc_top_neon x264_template(predict_4x4_dc_top_neon)
 void x264_predict_4x4_dc_top_neon( uint8_t *src );
 #define x264_predict_4x4_v_armv6 x264_template(predict_4x4_v_armv6)
 void x264_predict_4x4_v_armv6( uint8_t *src );
 #define x264_predict_4x4_h_armv6 x264_template(predict_4x4_h_armv6)
 void x264_predict_4x4_h_armv6( uint8_t *src );
 #define x264_predict_4x4_ddr_armv6 x264_template(predict_4x4_ddr_armv6)
 void x264_predict_4x4_ddr_armv6( uint8_t *src );
 #define x264_predict_4x4_ddl_neon x264_template(predict_4x4_ddl_neon)
 void x264_predict_4x4_ddl_neon( uint8_t *src );
 /* 8x8 chroma predictors. */
 #define x264_predict_8x8c_dc_neon x264_template(predict_8x8c_dc_neon)
 void x264_predict_8x8c_dc_neon( uint8_t *src );
 #define x264_predict_8x8c_dc_top_neon x264_template(predict_8x8c_dc_top_neon)
 void x264_predict_8x8c_dc_top_neon( uint8_t *src );
 #define x264_predict_8x8c_dc_left_neon x264_template(predict_8x8c_dc_left_neon)
 void x264_predict_8x8c_dc_left_neon( uint8_t *src );
 #define x264_predict_8x8c_h_neon x264_template(predict_8x8c_h_neon)
 void x264_predict_8x8c_h_neon( uint8_t *src );
 #define x264_predict_8x8c_v_neon x264_template(predict_8x8c_v_neon)
 void x264_predict_8x8c_v_neon( uint8_t *src );
 #define x264_predict_8x8c_p_neon x264_template(predict_8x8c_p_neon)
 void x264_predict_8x8c_p_neon( uint8_t *src );
 /* 8x16 chroma predictors. */
 #define x264_predict_8x16c_h_neon x264_template(predict_8x16c_h_neon)
 void x264_predict_8x16c_h_neon( uint8_t *src );
 #define x264_predict_8x16c_dc_top_neon x264_template(predict_8x16c_dc_top_neon)
 void x264_predict_8x16c_dc_top_neon( uint8_t *src );
 #define x264_predict_8x16c_p_neon x264_template(predict_8x16c_p_neon)
 void x264_predict_8x16c_p_neon( uint8_t *src );
 /* 8x8 luma predictors; these take an additional 36-byte edge buffer. */
 #define x264_predict_8x8_dc_neon x264_template(predict_8x8_dc_neon)
 void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
 #define x264_predict_8x8_ddl_neon x264_template(predict_8x8_ddl_neon)
 void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
 #define x264_predict_8x8_ddr_neon x264_template(predict_8x8_ddr_neon)
 void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
 #define x264_predict_8x8_vl_neon x264_template(predict_8x8_vl_neon)
 void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
 #define x264_predict_8x8_vr_neon x264_template(predict_8x8_vr_neon)
 void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
 #define x264_predict_8x8_v_neon x264_template(predict_8x8_v_neon)
 void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
 #define x264_predict_8x8_h_neon x264_template(predict_8x8_h_neon)
 void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
 #define x264_predict_8x8_hd_neon x264_template(predict_8x8_hd_neon)
 void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
 #define x264_predict_8x8_hu_neon x264_template(predict_8x8_hu_neon)
 void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
 /* 16x16 luma predictors. */
 #define x264_predict_16x16_dc_neon x264_template(predict_16x16_dc_neon)
 void x264_predict_16x16_dc_neon( uint8_t *src );
 #define x264_predict_16x16_dc_top_neon x264_template(predict_16x16_dc_top_neon)
 void x264_predict_16x16_dc_top_neon( uint8_t *src );
 #define x264_predict_16x16_dc_left_neon x264_template(predict_16x16_dc_left_neon)
 void x264_predict_16x16_dc_left_neon( uint8_t *src );
 #define x264_predict_16x16_h_neon x264_template(predict_16x16_h_neon)
 void x264_predict_16x16_h_neon( uint8_t *src );
 #define x264_predict_16x16_v_neon x264_template(predict_16x16_v_neon)
 void x264_predict_16x16_v_neon( uint8_t *src );
 #define x264_predict_16x16_p_neon x264_template(predict_16x16_p_neon)
 void x264_predict_16x16_p_neon( uint8_t *src );
 /* Initialisers: fill the pf[] function tables with the routines above
  * according to the X264_CPU_* bits in cpu (see predict-c.c). */
 #define x264_predict_4x4_init_arm x264_template(predict_4x4_init_arm)
 void x264_predict_4x4_init_arm( uint32_t cpu, x264_predict_t pf[12] );
 #define x264_predict_8x8_init_arm x264_template(predict_8x8_init_arm)
 void x264_predict_8x8_init_arm( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
 #define x264_predict_8x8c_init_arm x264_template(predict_8x8c_init_arm)
 void x264_predict_8x8c_init_arm( uint32_t cpu, x264_predict_t pf[7] );
 #define x264_predict_8x16c_init_arm x264_template(predict_8x16c_init_arm)
 void x264_predict_8x16c_init_arm( uint32_t cpu, x264_predict_t pf[7] );
 #define x264_predict_16x16_init_arm x264_template(predict_16x16_init_arm)
 void x264_predict_16x16_init_arm( uint32_t cpu, x264_predict_t pf[7] );
 #endif
--- a/common/arm/quant-a.S
+++ b/common/arm/quant-a.S
@@ -0,0 +1,574 @@
 /****************************************************************************
 * quant.S: arm quantization and level-run
 *****************************************************************************
 * Copyright (C) 2009-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "asm.S"
 // One distinct bit per byte lane, ascending.  coeff_last64_neon ANDs this
 // with per-byte all-ones masks and pairwise-adds the result into a bitmask.
 const pmovmskb_byte, align=4
 .byte 1,2,4,8,16,32,64,128
 .byte 1,2,4,8,16,32,64,128
 endconst
 // Two bits per byte lane, repeating every four lanes.  Used by the
 // decimate_score15/16 routines to build a 2-bits-per-coefficient mask.
 const mask_2bit, align=4
 .byte 3,12,48,192,3,12,48,192
 .byte 3,12,48,192,3,12,48,192
 endconst
 // One distinct bit per byte lane, descending.  Used by decimate_score64_neon
 // to build a 1-bit-per-coefficient mask.
 const mask_1bit, align=4
 .byte 128,64,32,16,8,4,2,1
 .byte 128,64,32,16,8,4,2,1
 endconst
 .text
 // Quantizes 16 coefficients and stores them at [r0], advancing r0.
 // On entry: q8/q9   = absolute values of the coefficients,
 //           q14/q15 = the original signed coefficients (callers set both up).
 // Per lane: result = sign(coef) * (((abs + bias) * mf) >> 16).
 // mask receives the OR of both result vectors, so it is nonzero iff any
 // quantized coefficient is nonzero.  With load_mf=yes, mf0-mf3 are first
 // (re)loaded from [r1], advancing r1.
 .macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
    vadd.u16    q8,  q8,  \bias0
    vadd.u16    q9,  q9,  \bias1
 .ifc \load_mf, yes
    vld1.64     {\mf0-\mf3}, [r1,:128]!
 .endif
    vmull.u16   q10, d16, \mf0              // 16x16 -> 32-bit products
    vmull.u16   q11, d17, \mf1
    vmull.u16   q12, d18, \mf2
    vmull.u16   q13, d19, \mf3
    vshr.s16    q14, q14, #15               // sign masks: all ones for negative inputs, else zero
    vshr.s16    q15, q15, #15
    vshrn.u32   d16, q10, #16               // keep the high 16 bits of each product
    vshrn.u32   d17, q11, #16
    vshrn.u32   d18, q12, #16
    vshrn.u32   d19, q13, #16
    veor        q8,  q8,  q14               // (x ^ s) - s negates x where s is all ones
    veor        q9,  q9,  q15
    vsub.s16    q8,  q8,  q14
    vsub.s16    q9,  q9,  q15
    vorr        \mask, q8,  q9
    vst1.64     {d16-d19}, [r0,:128]!
 .endm
 // Function epilogue: returns r0 = 1 if any bit of the 64-bit register d is
 // set, otherwise r0 = 0.  Clobbers r2 and r3.
 .macro QUANT_END d
    vmov        r2,  r3,  \d
    orrs        r0,  r2,  r3
    movne       r0,  #1
    bx          lr
 .endm
 // quant_2x2_dc( int16_t dct[4], int mf, int bias )
 // Quantizes the four coefficients in place with a single mf/bias pair.
 // Returns 1 if any quantized coefficient is nonzero, else 0.
 function quant_2x2_dc_neon
    vld1.64     {d0}, [r0,:64]
    vabs.s16    d3,  d0
    vdup.16     d2,  r2                     // d2 = bias in every lane
    vdup.16     d1,  r1                     // d1 = mf in every lane
    vadd.u16    d3,  d3,  d2
    vmull.u16   q3,  d3,  d1
    vshr.s16    d0,  d0,  #15               // sign mask of the original coefficients
    vshrn.u32   d3,  q3,  #16               // ((abs + bias) * mf) >> 16
    veor        d3,  d3,  d0                // reapply the sign: (x ^ s) - s
    vsub.s16    d3,  d3,  d0
    vst1.64     {d3}, [r0,:64]
    QUANT_END   d3
 endfunc
 // quant_4x4_dc( int16_t dct[16], int mf, int bias )
 // Quantizes 16 coefficients in place with a single mf/bias pair.
 // Returns 1 if any quantized coefficient is nonzero, else 0.
 function quant_4x4_dc_neon
    vld1.64     {d28-d31}, [r0,:128]        // q14/q15 = signed coefficients
    vabs.s16    q8,  q14
    vabs.s16    q9,  q15
    vdup.16     q0,  r2                     // bias in every lane
    vdup.16     q2,  r1                     // mf in every lane
    QUANT_TWO   q0,  q0,  d4,  d5,  d4,  d5,  q0
    vorr        d0,  d0,  d1                // fold the nonzero mask down to 64 bits
    QUANT_END   d0
 endfunc
 // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
 // Quantizes 16 coefficients in place with per-coefficient mf and bias.
 // Returns 1 if any quantized coefficient is nonzero, else 0.
 function quant_4x4_neon
    vld1.64     {d28-d31}, [r0,:128]        // q14/q15 = signed coefficients
    vabs.s16    q8,  q14
    vabs.s16    q9,  q15
    vld1.64     {d0-d3}, [r2,:128]          // q0/q1 = bias
    vld1.64     {d4-d7}, [r1,:128]          // d4-d7 = mf
    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7, q0
    vorr        d0,  d0,  d1                // fold the nonzero mask down to 64 bits
    QUANT_END   d0
 endfunc
 // quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
 // Quantizes four consecutive 4x4 blocks in place with the same mf/bias.
 // Returns a 4-bit mask: bit n is set iff block n has a nonzero quantized
 // coefficient.  d8-d15 are saved and restored because q4-q7 hold the
 // per-block masks.
 function quant_4x4x4_neon
    vpush       {d8-d15}
    vld1.64     {d28-d31}, [r0,:128]
    vabs.s16    q8,  q14
    vabs.s16    q9,  q15
    vld1.64     {d0-d3},   [r2,:128]        // q0/q1 = bias, reused for all four blocks
    vld1.64     {d4-d7},   [r1,:128]        // d4-d7 = mf, reused for all four blocks
    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q4
    vld1.64     {d28-d31}, [r0,:128]        // r0 was advanced by QUANT_TWO: next block
    vabs.s16    q8,  q14
    vabs.s16    q9,  q15
    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q5
    vld1.64     {d28-d31}, [r0,:128]
    vabs.s16    q8,  q14
    vabs.s16    q9,  q15
    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q6
    vld1.64     {d28-d31}, [r0,:128]
    vabs.s16    q8,  q14
    vabs.s16    q9,  q15
    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q7
    vorr        d8,  d8,  d9                // fold each 128-bit mask down to 64 bits
    vorr        d10, d10, d11
    vorr        d12, d12, d13
    vorr        d14, d14, d15
    vmov        r0,  r1,  d8
    vmov        r2,  r3, d10
    orrs        r0,  r1
    movne       r0,  #1                     // block 0 -> bit 0
    orrs        r2,  r3
    orrne       r0,  #2                     // block 1 -> bit 1
    vmov        r1,  r2, d12
    vmov        r3,  ip, d14
    orrs        r1,  r2
    orrne       r0,  #4                     // block 2 -> bit 2
    orrs        r3,  ip
    orrne       r0,  #8                     // block 3 -> bit 3
    vpop        {d8-d15}
    bx          lr
 endfunc
 // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
 // Quantizes 64 coefficients in place, 16 at a time, with per-coefficient
 // mf and bias.  Returns 1 if any quantized coefficient is nonzero, else 0.
 function quant_8x8_neon
    vld1.64     {d28-d31}, [r0,:128]
    vabs.s16    q8,  q14
    vabs.s16    q9,  q15
    vld1.64     {d0-d3},   [r2,:128]!       // first 16 bias values
    vld1.64     {d4-d7},   [r1,:128]!       // first 16 mf values
    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q0
 .rept 3
    vld1.64     {d28-d31}, [r0,:128]
    vabs.s16    q8,  q14
    vabs.s16    q9,  q15
    // q0 accumulates the nonzero mask, so bias goes to q1/q2 here.  QUANT_TWO
    // adds the bias before its load_mf=yes reload overwrites d4-d7 with mf.
    vld1.64     {d2-d5},   [r2,:128]!
    QUANT_TWO   q1,  q2,  d4,  d5,  d6,  d7,  q1, yes
    vorr        q0,  q0,  q1
 .endr
    vorr        d0,  d0,  d1
    QUANT_END   d0
 endfunc
 // Common prologue of the dequant functions.
 // In:  r1 = dequant_mf table, r2 = i_qp.
 // Out: r3 = i_qp / 6 - offset, with the condition flags set from that
 //           subtraction (callers branch on lt to their right-shift path),
 //      r2 = i_qp % 6,
 //      r1 = address of dequant_mf[i_mf] (dc=no), or the first 32-bit entry
 //           of dequant_mf[i_mf] itself (any other dc value).
 // mf_size is log2 of the size in bytes of one dequant_mf[i_mf] row.
 // Clobbers ip.
 .macro DEQUANT_START mf_size offset dc=no
    mov         r3,  #0x2b
    mul         r3,  r3,  r2
    lsr         r3,  r3,  #8            // i_qbits = i_qp / 6
    add         ip,  r3,  r3,  lsl #1   // ip = 3 * (i_qp / 6)
    sub         r2,  r2,  ip,  lsl #1   // i_mf = i_qp % 6
 .ifc \dc,no
    add         r1,  r1,  r2, lsl #\mf_size  // dequant_mf[i_mf]
 .else
    ldr         r1, [r1,  r2, lsl #\mf_size] // dequant_mf[i_mf][0][0]
 .endif
    subs        r3,  r3,  #\offset      // 6 for 8x8
 .endm
 // dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
 // Generates dequant_<size>_neon.  With i_qbits = i_qp / 6 - bits:
 //   i_qbits >= 0: dct[i] = (dct[i] * mf[i]) << i_qbits   (16-bit arithmetic)
 //   i_qbits <  0: dct[i] = (dct[i] * mf[i] + (1 << (-i_qbits - 1))) >> -i_qbits
 //                 (32-bit intermediate, narrowed back to 16 bits)
 // Each pass handles 16 coefficients; the 8x8 variant runs four passes,
 // counted in r2.  The 32-bit mf entries are narrowed to 16 bits before use.
 .macro DEQUANT size bits
 function dequant_\size\()_neon
    DEQUANT_START \bits+2, \bits
 .ifc \size, 8x8
    mov         r2,  #4                 // pass counter; mov leaves the flags from DEQUANT_START intact
 .endif
    blt         dequant_\size\()_rshift
    vdup.16     q15, r3                 // per-lane left shift count
 dequant_\size\()_lshift_loop:
 .ifc \size, 8x8
    subs        r2,  r2,  #1
 .endif
    vld1.32     {d16-d17}, [r1,:128]!
    vld1.32     {d18-d19}, [r1,:128]!
    vmovn.s32   d4,  q8
    vld1.32     {d20-d21}, [r1,:128]!
    vmovn.s32   d5,  q9
    vld1.32     {d22-d23}, [r1,:128]!
    vmovn.s32   d6,  q10
    vld1.16     {d0-d3},   [r0,:128]
    vmovn.s32   d7,  q11
    vmul.s16    q0,  q0,  q2
    vmul.s16    q1,  q1,  q3
    vshl.s16    q0,  q0,  q15
    vshl.s16    q1,  q1,  q15
    vst1.16     {d0-d3},   [r0,:128]!
 .ifc \size, 8x8
    bgt         dequant_\size\()_lshift_loop
 .endif
    bx          lr
 dequant_\size\()_rshift:
    vdup.32     q15, r3                 // negative count: vshl by it shifts right
    rsb         r3,  r3,  #0
    mov         ip,  #1
    sub         r3,  r3,  #1
    lsl         ip,  ip,  r3            // ip = rounding term 1 << (-i_qbits - 1)
 .ifc \size, 8x8
 dequant_\size\()_rshift_loop:
    subs        r2,  r2,  #1
 .endif
    vdup.32     q10, ip                 // accumulators start at the rounding term
    vld1.32     {d16-d17}, [r1,:128]!
    vdup.32     q11, ip
    vld1.32     {d18-d19}, [r1,:128]!
    vmovn.s32   d4,  q8
    vld1.32     {d16-d17}, [r1,:128]!
    vmovn.s32   d5,  q9
    vld1.32     {d18-d19}, [r1,:128]!
    vmovn.s32   d6,  q8
    vld1.16     {d0-d3},   [r0,:128]
    vmovn.s32   d7,  q9
    vdup.32     q12, ip
    vdup.32     q13, ip
    vmlal.s16   q10, d0,  d4            // rounding + dct * mf, widened to 32 bits
    vmlal.s16   q11, d1,  d5
    vmlal.s16   q12, d2,  d6
    vmlal.s16   q13, d3,  d7
    vshl.s32    q10, q10, q15
    vshl.s32    q11, q11, q15
    vshl.s32    q12, q12, q15
    vshl.s32    q13, q13, q15
    vmovn.s32   d0,  q10
    vmovn.s32   d1,  q11
    vmovn.s32   d2,  q12
    vmovn.s32   d3,  q13
    vst1.16     {d0-d3},   [r0,:128]!
 .ifc \size, 8x8
    bgt         dequant_\size\()_rshift_loop
 .endif
    bx          lr
 endfunc
 .endm
 DEQUANT 4x4, 4
 DEQUANT 8x8, 6
 // dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
 // Scales all 16 coefficients at r0 in place by the single factor
 // mf = dequant_mf[i_qp % 6][0].  With i_qbits = i_qp / 6 - 6:
 //   i_qbits >= 0: dct[i] = dct[i] * (mf << i_qbits)      (16-bit arithmetic)
 //   i_qbits <  0: dct[i] = (dct[i] * mf + (1 << (-i_qbits - 1))) >> -i_qbits
 //                 (32-bit intermediate, narrowed back to 16 bits)
 function dequant_4x4_dc_neon
    DEQUANT_START 6, 6, yes             // r1 = mf, r3 = i_qbits, flags set from r3
    blt         dequant_4x4_dc_rshift
    lsl         r1,  r1,  r3            // fold the left shift into the factor
    vdup.16     q2,  r1
    vld1.16     {d0-d3},   [r0,:128]
    // No per-lane shift register is needed on this path: the shift is
    // already part of q2.
    vmul.s16    q0,  q0,  q2
    vmul.s16    q1,  q1,  q2
    vst1.16     {d0-d3},   [r0,:128]
    bx          lr
 dequant_4x4_dc_rshift:
    vdup.16     d4,  r1                 // mf in every lane
    vdup.32     q15, r3                 // negative count: vshl by it shifts right
    rsb         r3,  r3,  #0
    mov         ip,  #1
    sub         r3,  r3,  #1
    lsl         ip,  ip,  r3            // ip = rounding term 1 << (-i_qbits - 1)
    vdup.32     q10, ip                 // accumulators start at the rounding term
    vdup.32     q11, ip
    vld1.16     {d0-d3},   [r0,:128]
    vdup.32     q12, ip
    vdup.32     q13, ip
    vmlal.s16   q10, d0,  d4            // rounding + dct * mf, widened to 32 bits
    vmlal.s16   q11, d1,  d4
    vmlal.s16   q12, d2,  d4
    vmlal.s16   q13, d3,  d4
    vshl.s32    q10, q10, q15
    vshl.s32    q11, q11, q15
    vshl.s32    q12, q12, q15
    vshl.s32    q13, q13, q15
    vmovn.s32   d0,  q10
    vmovn.s32   d1,  q11
    vmovn.s32   d2,  q12
    vmovn.s32   d3,  q13
    vst1.16     {d0-d3},   [r0,:128]
    bx          lr
 endfunc
 // Generates decimate_score<size>_neon( int16_t *dct ) for size 15 and 16.
 // Returns 9 as soon as any coefficient has a magnitude above 1 (tested after
 // saturating to 8 bits), 0 when every coefficient is zero, and otherwise the
 // sum of decimate_table4[] entries indexed by the length of each zero run.
 // Coefficients are tracked in a 32-bit mask with two bits per coefficient.
 .macro decimate_score_1x size
 function decimate_score\size\()_neon
    vld1.16     {q0, q1}, [r0, :128]
    movrel      r3, mask_2bit
    vmov.s8     q3,  #0x01
    vqmovn.s16  d0,  q0                 // saturate the 16 coefficients to 8 bits
    vqmovn.s16  d1,  q1
    vqabs.s8    q2,  q0
    vld1.8      {q8}, [r3, :128]
    vceq.s8     q1,  q0,  #0            // all ones where the coefficient is zero
    vcgt.s8     q2,  q2,  q3            // all ones where the magnitude exceeds 1
    vand.u8     q1,  q1,  q8            // keep two bits per zero coefficient
    vshrn.u16   d4,  q2,  #4
    vpadd.u8    d2,  d2,  d3            // pairwise adds merge the lanes ...
    vpadd.u8    d4,  d4,  d4
    vpadd.u8    d2,  d2,  d2            // ... into a 32-bit zero mask in d2[0]
    vmov.32     r2,  d4[0]              // nonzero iff some magnitude exceeds 1
    vmov.32     r1,  d2[0]
    cmp         r2,  #0
    beq         0f
    mov         r0,  #9
    bx          lr
 0:
    mvns        r1,  r1                 // invert: bits now mark nonzero coefficients
    mov         r0,  #0
    bxeq        lr                      // no nonzero coefficient at all
 .ifc \size, 15
    lsr         r1,  r1,  #2            // drop one 2-bit slot for the 15-coefficient variant
 .endif
    rbit        r1,  r1
    movrelx     r3,  X264(decimate_table4), r2
 1:
    clz         r2,  r1                 // leading zero bits = 2 * zero-run length
    lsl         r1,  r1,  r2
    lsr         r12, r2,  #1            // zero-run length
    ldrb        r2,  [r3, r12]
    lsls        r1,  r1,  #2            // consume the nonzero coefficient itself
    add         r0,  r0,  r2
    bne         1b                      // continue while nonzero coefficients remain
    bx          lr
 endfunc
 .endm
 decimate_score_1x 15
 decimate_score_1x 16
 // decimate_score64( int16_t *dct )
 // Returns 9 as soon as any of the 64 coefficients has a magnitude above 1
 // (tested after saturating to 8 bits); otherwise the sum of
 // decimate_table8[] entries indexed by the length of each zero run.
 // Coefficients are tracked in a 64-bit mask with one bit per coefficient,
 // processed as two 32-bit words (r1 first, then r12).  lr is used as a
 // scratch counter, hence the push {lr} / pop {pc} pairs.
 function decimate_score64_neon
    push        {lr}
    vld1.16     {q8,  q9},  [r0, :128]!
    vld1.16     {q10, q11}, [r0, :128]!
    vld1.16     {q12, q13}, [r0, :128]!
    vld1.16     {q14, q15}, [r0, :128]
    movrel      r3, mask_1bit
    vmov.s8     q3,  #0x01
    vqmovn.s16  d17, q8                 // saturate to 8 bits; note the swapped d-register order
    vqmovn.s16  d16, q9
    vqmovn.s16  d19, q10
    vqmovn.s16  d18, q11
    vqmovn.s16  d21, q12
    vqmovn.s16  d20, q13
    vqmovn.s16  d23, q14
    vqmovn.s16  d22, q15
    vqabs.s8    q12, q8
    vqabs.s8    q13, q9
    vqabs.s8    q14, q10
    vqabs.s8    q15, q11
    vld1.8      {q2}, [r3, :128]
    vceq.s8     q8,  q8,  #0            // all ones where the coefficient is zero
    vceq.s8     q9,  q9,  #0
    vceq.s8     q10, q10, #0
    vceq.s8     q11, q11, #0
    vmax.s8     q12, q12, q13           // reduce the magnitudes to one vector of maxima
    vmax.s8     q14, q14, q15
    vand.u8     q8,  q8,  q2            // keep one bit per zero coefficient
    vand.u8     q9,  q9,  q2
    vand.u8     q10, q10, q2
    vand.u8     q11, q11, q2
    vmax.s8     q12, q12, q14
    vpadd.u8    d18, d18, d19           // pairwise adds merge the zero bits ...
    vpadd.u8    d19, d16, d17
    vcgt.s8     q12, q12, q3            // all ones where a magnitude exceeds 1
    vpadd.u8    d22, d22, d23
    vpadd.u8    d23, d20, d21
    vshrn.u16   d24, q12, #4
    vpadd.u8    d16, d22, d23
    vpadd.u8    d17, d18, d19
    vpadd.u8    d24, d24, d24
    vpadd.u8    d16, d16, d17           // ... into a 64-bit zero mask in d16
    vmov.32     r2,  d24[0]             // nonzero iff some magnitude exceeds 1
    vmov        r12, r1,  d16
    cmp         r2,  #0
    beq         0f
    mov         r0,  #9
    pop         {pc}
 0:
    mvns        r1,  r1                 // invert: bits now mark nonzero coefficients
    mvn         r12, r12
    mov         r0,  #0
    mov         lr,  #32                // bits of r1 not yet consumed
    movrelx     r3,  X264(decimate_table8), r2
    beq         2f                      // relies on the flags from the mvns above: r1 word is empty
 1:
    clz         r2,  r1                 // zero-run length
    lsl         r1,  r1,  r2
    sub         lr,  lr,  r2
    ldrb        r2,  [r3, r2]
    lsls        r1,  r1,  #1            // consume the nonzero coefficient itself
    sub         lr,  lr,  #1
    add         r0,  r0,  r2
    bne         1b
 2:
    cmp         r12, #0
    popeq       {pc}                    // nothing left in the second word
    clz         r2,  r12
    lsl         r1,  r12, r2
    add         r2,  r2,  lr            // run continues across the word boundary: add the leftover bits of r1
    ldrb        r2,  [r3, r2]
    lsls        r1,  r1,  #1
    add         r0,  r0,  r2
    popeq       {pc}
 3:
    clz         r2,  r1
    lsl         r1,  r1,  r2
    ldrb        r2,  [r3, r2]
    lsls        r1,  r1,  #1
    add         r0,  r0,  r2
    bne         3b
    pop         {pc}
 endfunc
 // int coeff_last( int16_t *l )
 // coeff_last4: returns the index (0-3) of the last nonzero coefficient,
 // or 0 when all four are zero.  The four coefficients are read as two
 // 32-bit words; the index arithmetic assumes the higher-indexed
 // coefficient of each pair sits in the upper halfword (little-endian).
 function coeff_last4_arm
    ldrd        r2,  r3,  [r0]          // r2 = coefficients 0-1, r3 = coefficients 2-3
    subs        r0,  r3,  #0            // r0 = r3, flags set from it
    movne       r0,  #2                 // something in 2-3: base index 2 ...
    movne       r2,  r3                 // ... and inspect that word instead
    lsrs        r2,  r2,  #16           // upper halfword nonzero?
    addne       r0,  r0,  #1
    bx          lr
 endfunc
 // coeff_last8( int16_t *l ): returns the index (0-7) of the last nonzero
 // coefficient, or 0 when all eight are zero.  Same halfword-layout
 // assumption as coeff_last4_arm (little-endian).
 function coeff_last8_arm
    ldrd        r2,  r3,  [r0, #8]      // coefficients 4-7
    orrs        ip,  r2,  r3
    movne       r0,  #4                 // something in 4-7: base index 4
    ldrdeq      r2,  r3,  [r0]          // otherwise fall back to coefficients 0-3 (r0 is still the pointer)
    moveq       r0,  #0
    tst         r3,  r3
    addne       r0,  #2                 // upper pair of the selected half is nonzero
    movne       r2,  r3
    lsrs        r2,  r2,  #16           // upper halfword nonzero?
    addne       r0,  r0,  #1
    bx          lr
 endfunc
 // Generates coeff_last<size>_neon( int16_t *l ) for size 15 and 16.
 // Returns the index of the last nonzero coefficient, or 0 if there is none.
 // The 16 loaded lanes are compressed into a 64-bit mask with four bits per
 // lane, and the index is derived from leading-zero counts of its two halves.
 .macro COEFF_LAST_1x size
 function coeff_last\size\()_neon
 .if \size == 15
    // Step back one coefficient so 16 lanes are loaded; the size-dependent
    // constants below shift the result back.  NOTE(review): presumably done
    // so the :128-aligned load can be used - confirm against callers.
    sub         r0,  r0,  #2
 .endif
    vld1.64     {d0-d3}, [r0,:128]
    vtst.16     q0,  q0                 // all ones in every nonzero lane
    vtst.16     q1,  q1
    vshrn.u16   d0,  q0,  #8            // 16 bits per lane -> 8 bits per lane
    vshrn.u16   d1,  q1,  #8
    vshrn.u16   d0,  q0,  #4            // 8 bits per lane -> 4 bits per lane, all in d0
    vclz.i32    d0,  d0                 // leading zeros of each 32-bit half
    mov         ip,  #7
    mov         r3,  #\size - 9
    vmov        r0,  r1,  d0            // r0 = clz of lanes 0-7, r1 = clz of lanes 8-15
    subs        r1,  ip,  r1,  lsr #2   // 7 - clz/4: last nonzero lane within 8-15, negative if none
    addge       r0,  r1,  #\size - 8
    subslt      r0,  r3,  r0,  lsr #2   // otherwise derive it from lanes 0-7
    movlt       r0,  #0                 // nothing found: return 0
    bx          lr
 endfunc
 .endm
 COEFF_LAST_1x 15
 COEFF_LAST_1x 16
 // coeff_last64( int16_t *l ): returns the index (0-63) of the last nonzero
 // coefficient, or 0 if there is none.  Builds a 64-bit mask with one bit per
 // coefficient and derives the index from leading-zero counts of its halves.
 function coeff_last64_neon
    vld1.64     {d16-d19}, [r0,:128]!
    vqmovn.u16  d16, q8                 // saturating narrow: nonzero lanes stay nonzero
    vqmovn.u16  d17, q9
    vld1.64     {d20-d23}, [r0,:128]!
    vqmovn.u16  d18, q10
    vqmovn.u16  d19, q11
    vld1.64     {d24-d27}, [r0,:128]!
    vqmovn.u16  d20, q12
    vqmovn.u16  d21, q13
    vld1.64     {d28-d31}, [r0,:128]!
    vqmovn.u16  d22, q14
    vqmovn.u16  d23, q15
    movrel      r1, pmovmskb_byte
    vld1.64     {d0-d1}, [r1,:128]
    vtst.8      q8,  q8                 // all ones in every nonzero byte
    vtst.8      q9,  q9
    vtst.8      q10, q10
    vtst.8      q11, q11
    vand        q8,  q8,  q0            // keep one distinct bit per byte lane
    vand        q9,  q9,  q0
    vand        q10, q10, q0
    vand        q11, q11, q0
    vpadd.u8    d0,  d16, d17           // three rounds of pairwise adds ...
    vpadd.u8    d1,  d18, d19
    vpadd.u8    d2,  d20, d21
    vpadd.u8    d3,  d22, d23
    vpadd.u8    d0,  d0,  d1
    vpadd.u8    d1,  d2,  d3
    vpadd.u8    d0,  d0,  d1            // ... leave the 64-bit mask in d0
    vclz.i32    d0,  d0                 // leading zeros of each 32-bit half
    mov         ip,  #31
    vmov        r0,  r1,  d0            // r0 = clz of coefficients 0-31, r1 = clz of 32-63
    subs        r1,  ip,  r1            // 31 - clz: last nonzero within 32-63, negative if none
    addge       r0,  r1,  #32
    subslt      r0,  ip,  r0            // otherwise derive it from coefficients 0-31
    movlt       r0,  #0                 // nothing found: return 0
    bx          lr
 endfunc
 // denoise_dct( dctcoef *r0, uint32_t *r1, udctcoef *r2, int r3 )
 // Processes r3 coefficients, 16 per iteration.  For each coefficient i:
 //   r1[i] += abs( r0[i] )                              (32-bit accumulators)
 //   r0[i]  = sign( r0[i] ) * max( abs( r0[i] ) - r2[i], 0 )
 // The loop always runs at least once and continues while the remaining
 // count is positive.
 function denoise_dct_neon
 1:  subs        r3,  r3,  #16
    vld1.16     {q0,  q1},  [r0]
    vld1.32     {q12, q13}, [r1]!       // the 16 accumulators span 64 bytes: load in two halves ...
    vld1.32     {q14, q15}, [r1]
    sub         r1,  #32                // ... and rewind r1 for the stores below
    vabs.s16    q8,  q0
    vabs.s16    q9,  q1
    vld1.16     {q2, q3}, [r2]!         // values to subtract from the magnitudes
    vclt.s16    q10, q0,  #0            // all ones where the coefficient is negative
    vclt.s16    q11, q1,  #0
    vaddw.u16   q12, q12, d16           // accumulate the magnitudes
    vaddw.u16   q13, q13, d17
    vqsub.u16   q0,  q8,  q2            // saturating subtract: never goes below zero
    vqsub.u16   q1,  q9,  q3
    vaddw.u16   q14, q14, d18
    vaddw.u16   q15, q15, d19
    vneg.s16    q8,  q0
    vneg.s16    q9,  q1
    vbsl        q10, q8,  q0            // negated result where the input was negative, else positive
    vbsl        q11, q9,  q1
    vst1.32     {q12, q13}, [r1]!
    vst1.32     {q14, q15}, [r1]!
    vst1.16     {q10, q11}, [r0]!
    bgt         1b
    bx          lr
 endfunc
--- a/common/arm/quant.h
+++ b/common/arm/quant.h
@@ -0,0 +1,71 @@
 /*****************************************************************************
 * quant.h: arm quantization and level-run
 *****************************************************************************
 * Copyright (C) 2005-2025 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_ARM_QUANT_H
 #define X264_ARM_QUANT_H
 /* Every symbol is routed through x264_template() before it is declared. */
 /* Quantization: dct[] is modified in place. */
 /* NOTE(review): no quant_2x2_dc_armv6 definition is visible in quant-a.S
  * as shown in this patch - confirm it is implemented elsewhere. */
 #define x264_quant_2x2_dc_armv6 x264_template(quant_2x2_dc_armv6)
 int x264_quant_2x2_dc_armv6( int16_t dct[4], int mf, int bias );
 #define x264_quant_2x2_dc_neon x264_template(quant_2x2_dc_neon)
 int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
 #define x264_quant_4x4_dc_neon x264_template(quant_4x4_dc_neon)
 int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
 #define x264_quant_4x4_neon x264_template(quant_4x4_neon)
 int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
 #define x264_quant_4x4x4_neon x264_template(quant_4x4x4_neon)
 int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
 #define x264_quant_8x8_neon x264_template(quant_8x8_neon)
 int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
 /* Dequantization: dct[] is modified in place. */
 #define x264_dequant_4x4_dc_neon x264_template(dequant_4x4_dc_neon)
 void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
 #define x264_dequant_4x4_neon x264_template(dequant_4x4_neon)
 void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
 #define x264_dequant_8x8_neon x264_template(dequant_8x8_neon)
 void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
 /* Decimate scores for 15, 16 and 64 coefficients. */
 #define x264_decimate_score15_neon x264_template(decimate_score15_neon)
 int x264_decimate_score15_neon( int16_t * );
 #define x264_decimate_score16_neon x264_template(decimate_score16_neon)
 int x264_decimate_score16_neon( int16_t * );
 #define x264_decimate_score64_neon x264_template(decimate_score64_neon)
 int x264_decimate_score64_neon( int16_t * );
 /* Index of the last nonzero coefficient for 4, 8, 15, 16 and 64 coefficients. */
 #define x264_coeff_last4_arm x264_template(coeff_last4_arm)
 int x264_coeff_last4_arm( int16_t * );
 #define x264_coeff_last8_arm x264_template(coeff_last8_arm)
 int x264_coeff_last8_arm( int16_t * );
 #define x264_coeff_last15_neon x264_template(coeff_last15_neon)
 int x264_coeff_last15_neon( int16_t * );
 #define x264_coeff_last16_neon x264_template(coeff_last16_neon)
 int x264_coeff_last16_neon( int16_t * );
 #define x264_coeff_last64_neon x264_template(coeff_last64_neon)
 int x264_coeff_last64_neon( int16_t * );
 /* DCT denoising. */
 #define x264_denoise_dct_neon x264_template(denoise_dct_neon)
 void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
 #endif
--- a/common/base.c
+++ b/common/base.c
--- a/common/base.h
+++ b/common/base.h
@@ -0,0 +1,339 @@
 /*****************************************************************************
 * base.h: misc common functions (bit depth independent)
 *****************************************************************************
 * Copyright (C) 2003-2025 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_BASE_H
 #define X264_BASE_H
 /****************************************************************************
 * Macros (can be used in osdep.h)
 ****************************************************************************/
 /* Minimum / maximum of 2, 3 or 4 values.
 * These are plain textual macros: an argument can be expanded (and therefore
 * evaluated) more than once, so do not pass expressions with side effects.
 * The 3- and 4-operand forms are built by nesting the 2-operand form. */
 #define X264_MIN(a,b) ( (a)<(b) ? (a) : (b) )
 #define X264_MAX(a,b) ( (a)>(b) ? (a) : (b) )
 #define X264_MIN3(a,b,c) X264_MIN((a),X264_MIN((b),(c)))
 #define X264_MAX3(a,b,c) X264_MAX((a),X264_MAX((b),(c)))
 #define X264_MIN4(a,b,c,d) X264_MIN((a),X264_MIN3((b),(c),(d)))
 #define X264_MAX4(a,b,c,d) X264_MAX((a),X264_MAX3((b),(c),(d)))
 /****************************************************************************
 * System includes
 ****************************************************************************/
 #include "osdep.h"
 #include <stdarg.h>
 #include <stddef.h>
 #include <stdlib.h>
 #include <string.h>
 #include <assert.h>
 #include <limits.h>
 /****************************************************************************
 * Macros
 ****************************************************************************/
 /* XCHG: swap the lvalues a and b (both of the given type) through a
 * block-local temporary named t; do not pass an operand that is itself called t. */
 #define XCHG(type,a,b) do { type t = (a); (a) = (b); (b) = t; } while( 0 )
 /* FIX8: convert a non-negative real value to fixed point with 8 fractional
 * bits, rounding to nearest. The argument is parenthesized so that compound
 * expressions such as FIX8(a+b) scale the whole expression. */
 #define FIX8(f) ((int)((f)*(1<<8)+.5))
 /* ARRAY_ELEMS: number of elements of a true array (not a pointer), as an int. */
 #define ARRAY_ELEMS(a) ((int)((sizeof(a))/(sizeof((a)[0]))))
 /* ALIGN: round x up to the next multiple of a; the mask trick requires a to
 * be a power of two. */
 #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
 /* IS_DISPOSABLE: non-zero when the given frame type equals X264_TYPE_B. */
 #define IS_DISPOSABLE(type) ( (type) == X264_TYPE_B )
 /* Unions for type-punning.
 * Mn: load or store n bits, aligned, native-endian
 * CPn: copy n bits, aligned, native-endian
 * we don't use memcpy for CPn because memcpy's args aren't assumed to be aligned
 *
 * Each x264_unionN_t overlays one N-bit integer (member i) with arrays of
 * narrower integers. MAY_ALIAS is defined elsewhere (presumably the compiler's
 * may_alias attribute - confirm in osdep.h). */
 typedef union { uint16_t i; uint8_t  b[2]; } MAY_ALIAS x264_union16_t;
 typedef union { uint32_t i; uint16_t w[2]; uint8_t  b[4]; } MAY_ALIAS x264_union32_t;
 typedef union { uint64_t i; uint32_t d[2]; uint16_t w[4]; uint8_t b[8]; } MAY_ALIAS x264_union64_t;
 /* 128-bit value carried as two 64-bit halves */
 typedef struct { uint64_t i[2]; } x264_uint128_t;
 typedef union { x264_uint128_t i; uint64_t q[2]; uint32_t d[4]; uint16_t w[8]; uint8_t b[16]; } MAY_ALIAS x264_union128_t;
 /* Mn(p): lvalue for the n-bit member i of the union placed at address p;
 * usable on either side of an assignment. */
 #define M16(src) (((x264_union16_t*)(src))->i)
 #define M32(src) (((x264_union32_t*)(src))->i)
 #define M64(src) (((x264_union64_t*)(src))->i)
 #define M128(src) (((x264_union128_t*)(src))->i)
 /* compound literal holding an all-zero x264_uint128_t */
 #define M128_ZERO ((x264_uint128_t){{0,0}})
 /* CPn(dst,src): assign Mn(src) to Mn(dst). The expansion is a bare assignment
 * expression without outer parentheses, so use it as a statement. */
 #define CP16(dst,src) M16(dst) = M16(src)
 #define CP32(dst,src) M32(dst) = M32(src)
 #define CP64(dst,src) M64(dst) = M64(src)
 #define CP128(dst,src) M128(dst) = M128(src)
 /* Macros for memory constraints of inline asm
 * MEM_FIX(x, t, s): lvalue covering s elements of type t starting at pointer x.
 * MEM_DYN(x, t):    same idea when the element count is not known. */
 #if defined(__GNUC__) && __GNUC__ >= 8 && !defined(__clang__) && !defined(__INTEL_COMPILER)
 /* GCC >= 8 (excluding clang and ICC, which also define __GNUC__): cast to a
 * pointer-to-array and dereference; MEM_DYN uses an array of unspecified size. */
 #define MEM_FIX(x, t, s) (*(t (*)[s])(x))
 #define MEM_DYN(x, t) (*(t (*)[])(x))
 #else
 //older versions of gcc prefer casting to structure instead of array
 #define MEM_FIX(x, t, s) (*(struct { t a[s]; } MAY_ALIAS (*))(x))
 //let's set an arbitrary large constant size
 #define MEM_DYN(x, t) MEM_FIX(x, t, 4096)
 #endif
 /****************************************************************************
 * Constants
 ****************************************************************************/
 /* Profile identifiers.
 * NOTE(review): the numeric values look like the H.264 profile_idc codes -
 * confirm against the specification before relying on that. */
 enum profile_e
 {
    PROFILE_BASELINE = 66,
    PROFILE_MAIN     = 77,
    PROFILE_HIGH    = 100,
    PROFILE_HIGH10  = 110,
    PROFILE_HIGH422 = 122,
    PROFILE_HIGH444_PREDICTIVE = 244,
 };
 /* Chroma sampling formats, numbered 0..3 in the order 4:0:0, 4:2:0, 4:2:2, 4:4:4
 * (presumably matching the bitstream's chroma_format_idc - confirm). */
 enum chroma_format_e
 {
    CHROMA_400 = 0,
    CHROMA_420 = 1,
    CHROMA_422 = 2,
    CHROMA_444 = 3,
 };
 /* Slice types. The values double as indices into slice_type_to_char[] below. */
 enum slice_type_e
 {
    SLICE_TYPE_P  = 0,
    SLICE_TYPE_B  = 1,
    SLICE_TYPE_I  = 2,
 };
 /* One-letter name for each slice_type_e value, in enum order (P, B, I).
 * Being static in a header, every including translation unit gets its own copy. */
 static const char slice_type_to_char[] = { 'P', 'B', 'I' };
 /* SEI message payload type codes.
 * NOTE(review): values are assumed to be the payloadType numbers assigned by
 * the H.264 specification - confirm there when adding new entries. */
 enum sei_payload_type_e
 {
    SEI_BUFFERING_PERIOD       = 0,
    SEI_PIC_TIMING             = 1,
    SEI_PAN_SCAN_RECT          = 2,
    SEI_FILLER                 = 3,
    SEI_USER_DATA_REGISTERED   = 4,
    SEI_USER_DATA_UNREGISTERED = 5,
    SEI_RECOVERY_POINT         = 6,
    SEI_DEC_REF_PIC_MARKING    = 7,
    SEI_FRAME_PACKING          = 45,
    SEI_MASTERING_DISPLAY      = 137,
    SEI_CONTENT_LIGHT_LEVEL    = 144,
    SEI_ALTERNATIVE_TRANSFER   = 147,
 };
 /* Compile-time limits (going by their names: B-frames, reference frames,
 * threads, lookahead threads and lookahead depth). */
 #define X264_BFRAME_MAX 16
 #define X264_REF_MAX 16
 #define X264_THREAD_MAX 128
 #define X264_LOOKAHEAD_THREAD_MAX 16
 #define X264_LOOKAHEAD_MAX 250
 // number of pixels (per thread) in progress at any given time.
 // 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
 // (16 + 3 + 3 + 2 = 24)
 #define X264_THREAD_HEIGHT 24
 /* WEIGHTP_FAKE is set when mb_tree & psy are enabled, but normal weightp is disabled
 * (such as in baseline). It checks for fades in lookahead and adjusts qp accordingly
 * to increase quality. Defined as (-1) so that if(i_weighted_pred > 0) is true only when
 * real weights are being used. */
 #define X264_WEIGHTP_FAKE (-1)
 /* scan8 cache geometry (see the layout diagram that follows): one plane takes
 * 5 rows of 8 entries, the full cache holds 3 planes, and X264_SCAN8_0 is
 * column 4 of row 1, i.e. the value of x264_scan8[0]. */
 #define X264_SCAN8_LUMA_SIZE (5*8)
 #define X264_SCAN8_SIZE (X264_SCAN8_LUMA_SIZE*3)
 #define X264_SCAN8_0 (4+1*8)
 /* Scan8 organization:
 *    0 1 2 3 4 5 6 7
 * 0  DY    y y y y y
 * 1        y Y Y Y Y
 * 2        y Y Y Y Y
 * 3        y Y Y Y Y
 * 4        y Y Y Y Y
 * 5  DU    u u u u u
 * 6        u U U U U
 * 7        u U U U U
 * 8        u U U U U
 * 9        u U U U U
 * 10 DV    v v v v v
 * 11       v V V V V
 * 12       v V V V V
 * 13       v V V V V
 * 14       v V V V V
 * DY/DU/DV are for luma/chroma DC.
 *
 * Every table entry below is written as column + row*8 in this grid.
 */
 /* Indices into x264_scan8[] of the DC entries that follow the 16*3 block
 * entries: [48] is the DY cell, [49] the DU cell (and [50] the DV cell). */
 #define LUMA_DC   48
 #define CHROMA_DC 49
 /* Block index -> scan8 position. Entries 0-15 land in rows 1-4 (Y), 16-31 in
 * rows 6-9 (U), 32-47 in rows 11-14 (V), always in columns 4-7; each line of
 * four entries covers one 2x2 group of cells. The last three entries are the
 * DC cells in column 0 of rows 0, 5 and 10. */
 static const uint8_t x264_scan8[16*3 + 3] =
 {
    4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8,
    6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8,
    4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8,
    6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8,
    4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8,
    6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8,
    4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8,
    6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8,
    4+11*8, 5+11*8, 4+12*8, 5+12*8,
    6+11*8, 7+11*8, 6+12*8, 7+12*8,
    4+13*8, 5+13*8, 4+14*8, 5+14*8,
    6+13*8, 7+13*8, 6+14*8, 7+14*8,
    0+ 0*8, 0+ 5*8, 0+10*8
 };
 /****************************************************************************
 * Includes
 ****************************************************************************/
 #include "cpu.h"
 #include "tables.h"
 /****************************************************************************
 * Inline functions
 ****************************************************************************/
 /* Clamp v to the range [i_min, i_max]. The lower bound is tested first, so
 * i_min wins if the bounds are passed in the wrong order and v < i_min. */
 static ALWAYS_INLINE int x264_clip3( int v, int i_min, int i_max )
 {
    if( v < i_min )
        return i_min;
    if( v > i_max )
        return i_max;
    return v;
 }
 /* Floating-point clamp of v to [f_min, f_max]; the lower bound is tested
 * first. A NaN input fails both comparisons and is returned unchanged. */
 static ALWAYS_INLINE double x264_clip3f( double v, double f_min, double f_max )
 {
    if( v < f_min )
        return f_min;
    if( v > f_max )
        return f_max;
    return v;
 }
 /* Not a general-purpose function: the input is multiplied by -1/6 (to convert
 * qp to qscale) before the table-driven power of two is taken.
 * Returns 0 when the scaled exponent is below the table range and 0xffff when
 * it is above it. */
 static ALWAYS_INLINE int x264_exp2fix8( float x )
 {
    /* exponent in 1/64 steps, biased by 512 (= 8*64); the .5 rounds */
    int e = x*(-64.f/6.f) + 512.5f;
    if( e > 1023 )
        return 0xffff;
    if( e < 0 )
        return 0;
    int whole = e >> 6;   /* integer part of the exponent */
    int frac  = e & 63;   /* fractional part, indexes the lookup table */
    return (x264_exp2_lut[frac]+256) << whole >> 8;
 }
 /* Table-driven log2 approximation: x is normalized by its leading-zero count,
 * the 7 bits below the leading one select the mantissa entry in x264_log2_lut,
 * and x264_log2_lz_lut[lz] supplies the term that depends on the zero count. */
 static ALWAYS_INLINE float x264_log2( uint32_t x )
 {
    int lz = x264_clz( x );
    uint32_t normalized = x << lz;            /* leading one moved to bit 31 */
    int mantissa = (normalized >> 24) & 0x7f; /* bits 30..24 */
    return x264_log2_lut[mantissa] + x264_log2_lz_lut[lz];
 }
 /* Branchless median of three ints.
 * d & (d>>31) evaluates to d when d is negative and to 0 otherwise (this
 * relies on >> of a negative int being an arithmetic shift), i.e. min(d,0). */
 static ALWAYS_INLINE int x264_median( int a, int b, int c )
 {
    int d = a - b;
    int neg = d & (d >> 31);
    a -= neg;                   /* a = max(a,b) */
    b += neg;                   /* b = min(a,b) */
    d = b - c;
    b -= d & (d >> 31);         /* b = max(b,c) */
    d = a - b;
    b += d & (d >> 31);         /* b = min(a,b) */
    return b;
 }
 /* Component-wise median of three 2-element vectors a, b and c, stored to dst.
 * Element 0 is written before element 1 is read, as in a straight-line form. */
 static ALWAYS_INLINE void x264_median_mv( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
 {
    for( int comp = 0; comp < 2; comp++ )
        dst[comp] = x264_median( a[comp], b[comp], c[comp] );
 }
 /* Sum, over every pair of consecutive entries of mvc[0..i_mvc-1], of the
 * absolute differences of both components. Returns 0 when i_mvc < 2. */
 static ALWAYS_INLINE int x264_predictor_difference( int16_t (*mvc)[2], intptr_t i_mvc )
 {
    int total = 0;
    for( intptr_t n = 1; n < i_mvc; n++ )
    {
        int d0 = mvc[n-1][0] - mvc[n][0];
        int d1 = mvc[n-1][1] - mvc[n][1];
        total += abs( d0 ) + abs( d1 );
    }
    return total;
 }
 /* For each of the two components, add the left and top values and classify
 * the sum as 0 (<= 2), 1 (3..32) or 2 (> 32). The class of component 0 is
 * returned in the low byte and the class of component 1 in the high byte. */
 static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvdtop )
 {
    int sum0 = mvdleft[0] + mvdtop[0];
    int sum1 = mvdleft[1] + mvdtop[1];
    int class0 = (sum0 > 2) + (sum0 > 32);
    int class1 = (sum1 > 2) + (sum1 > 32);
    return class0 + (class1<<8);
 }
 /****************************************************************************
 * General functions
 ****************************************************************************/
 /* x264_reduce_fraction / x264_reduce_fraction64: take a numerator *n and a
 * denominator *d by pointer and update both in place (presumably reducing the
 * fraction to lowest terms - the implementation is not in this header). */
 X264_API void x264_reduce_fraction( uint32_t *n, uint32_t *d );
 X264_API void x264_reduce_fraction64( uint64_t *n, uint64_t *d );
 /* Logging entry points: a level, a format string and its arguments, passed
 * either as a va_list (x264_log_default) or variadically (x264_log_internal). */
 X264_API void x264_log_default( void *p_unused, int i_level, const char *psz_fmt, va_list arg );
 X264_API void x264_log_internal( int i_level, const char *psz_fmt, ... );
 /* x264_malloc: will do or emulate a memalign
 * you have to use x264_free for buffers allocated with x264_malloc */
 X264_API void *x264_malloc( int64_t );
 X264_API void  x264_free( void * );
 /* x264_slurp_file: malloc space for the whole file and read it */
 X264_API char *x264_slurp_file( const char *filename );
 /* x264_param_strdup: will do strdup and save returned pointer inside
 * x264_param_t for later freeing during x264_param_cleanup
 * (unlike its neighbours this prototype carries no X264_API marker) */
 char *x264_param_strdup( x264_param_t *param, const char *src );
 /* x264_param2string: return a (malloced) string containing most of
 * the encoding options */
 X264_API char *x264_param2string( x264_param_t *p, int b_res );
 /****************************************************************************
 * Macros
 ****************************************************************************/
 /* CHECKED_MALLOC: var = x264_malloc( size ); on a NULL result jump to the
 * label `fail`, which the calling function must provide. */
 #define CHECKED_MALLOC( var, size )\
 do {\
    var = x264_malloc( size );\
    if( !var )\
        goto fail;\
 } while( 0 )
 /* CHECKED_MALLOCZERO: CHECKED_MALLOC followed by zero-filling the buffer.
 * `size` is expanded twice (allocation and memset), so it must be free of
 * side effects. */
 #define CHECKED_MALLOCZERO( var, size )\
 do {\
    CHECKED_MALLOC( var, size );\
    memset( var, 0, size );\
 } while( 0 )
 /* CHECKED_PARAM_STRDUP: var = x264_param_strdup( param, src ); on a NULL
 * result jump to the caller's `fail` label. */
 #define CHECKED_PARAM_STRDUP( var, param, src )\
 do {\
    var = x264_param_strdup( param, src );\
    if( !var )\
        goto fail;\
 } while( 0 )
 /* Macros for merging multiple allocations into a single large malloc, for improved
 * use with huge pages.
 * Usage: PREALLOC_INIT once, PREALLOC( var, size ) per buffer, then
 * PREALLOC_END( ptr ) to allocate and resolve every registered pointer. */
 /* Needs to be enough to contain any set of buffers that use combined allocations */
 #define PREALLOC_BUF_SIZE 1024
 /* Declares the bookkeeping locals: number of registered pointers, running
 * total size, and the addresses of the pointer variables to patch later. */
 #define PREALLOC_INIT\
    int    prealloc_idx = 0;\
    int64_t prealloc_size = 0;\
    uint8_t **preallocs[PREALLOC_BUF_SIZE];
 /* Temporarily stores the buffer's byte offset inside the future block in var,
 * remembers the address of var, and grows the total by size rounded up to
 * NATIVE_ALIGN. prealloc_idx is not checked against PREALLOC_BUF_SIZE. */
 #define PREALLOC( var, size )\
 do {\
    var = (void*)(intptr_t)prealloc_size;\
    preallocs[prealloc_idx++] = (uint8_t**)&var;\
    prealloc_size += ALIGN((int64_t)(size), NATIVE_ALIGN);\
 } while( 0 )
 /* Allocates the combined block into ptr (jumping to `fail` on failure via
 * CHECKED_MALLOC), then turns every stored offset into a real pointer by
 * adding the block's base address. */
 #define PREALLOC_END( ptr )\
 do {\
    CHECKED_MALLOC( ptr, prealloc_size );\
    while( prealloc_idx-- )\
        *preallocs[prealloc_idx] = (uint8_t*)((intptr_t)(*preallocs[prealloc_idx]) + (intptr_t)ptr);\
 } while( 0 )
 #endif
--- a/common/bitstream.c
+++ b/common/bitstream.c
@@ -0,0 +1,166 @@
 /*****************************************************************************
 * bitstream.c: bitstream writing
 *****************************************************************************
 * Copyright (C) 2003-2025 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "common.h"
 /* Copy [src, end) to dst, inserting a 0x03 byte before any source byte <= 0x03
 * that would otherwise directly follow two zero bytes in the output.
 * The first two source bytes are copied without any check.
 * Returns the position one past the last byte written. */
 static uint8_t *nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
 {
    for( int head = 0; head < 2 && src < end; head++ )
        *dst++ = *src++;
    for( ; src < end; src++ )
    {
        /* the test looks at the output, so inserted 0x03 bytes break zero runs */
        if( *src <= 0x03 && !(dst[-2] | dst[-1]) )
            *dst++ = 0x03;
        *dst++ = *src;
    }
    return dst;
 }
 #if HAVE_MMX
 #include "x86/bitstream.h"
 #endif
 #if HAVE_ARMV6
 #include "arm/bitstream.h"
 #endif
 #if HAVE_AARCH64
 #include "aarch64/bitstream.h"
 #endif
 /****************************************************************************
 * x264_nal_encode: write one NAL unit to dst - start code or 4-byte size
 * field, header byte, escaped payload and optional AVC-Intra padding - then
 * point nal->p_payload / nal->i_payload at the encoded result.
 ****************************************************************************/
 void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal )
 {
    uint8_t *const out_start = dst;
    uint8_t *payload     = nal->p_payload;
    uint8_t *payload_end = payload + nal->i_payload;
    if( !h->param.b_annexb )
        dst += 4; /* save room for size later */
    else
    {
        /* start code: 00 00 01, preceded by one more 00 for the long form */
        int zeros = nal->b_long_startcode ? 3 : 2;
        while( zeros-- )
            *dst++ = 0x00;
        *dst++ = 0x01;
    }
    /* nal header: top bit zero, i_ref_idc shifted to bit 5, i_type in the low bits */
    *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
    dst = h->bsf.nal_escape( dst, payload, payload_end );
    int size = dst - out_start;
    /* Apply AVC-Intra padding */
    if( h->param.i_avcintra_class )
    {
        int padding = nal->i_payload + nal->i_padding + NALU_OVERHEAD - size;
        nal->i_padding = X264_MAX( padding, 0 );
        if( padding > 0 )
        {
            memset( dst, 0, padding );
            size += padding;
        }
    }
    /* Write the size header for mp4/etc */
    if( !h->param.b_annexb )
    {
        /* Size doesn't include the size of the header we're writing now. */
        int chunk_size = size - 4;
        /* big-endian, most significant byte first */
        for( int k = 0; k < 4; k++ )
            out_start[k] = (uint8_t)(chunk_size >> (24 - 8*k));
    }
    nal->p_payload = out_start;
    nal->i_payload = size;
    x264_emms();
 }
 /* Fill *pf with the bitstream function pointers to use for the given cpu
 * flag mask. The table is zeroed first and the C nal_escape installed; each
 * later assignment overrides the previous one, so the order of the tests
 * below is the selection priority. The cabac_block_residual_* pointers are
 * only assigned inside the HAVE_MMX && ARCH_X86_64 sections and stay NULL
 * otherwise. */
 void x264_bitstream_init( uint32_t cpu, x264_bitstream_function_t *pf )
 {
    memset( pf, 0, sizeof(*pf) );
    pf->nal_escape = nal_escape_c;
 #if HAVE_MMX
 #if ARCH_X86_64
    /* installed without testing any cpu flag */
    pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2;
    pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2;
    pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2;
 #endif
    if( cpu&X264_CPU_MMX2 )
        pf->nal_escape = x264_nal_escape_mmx2;
    if( cpu&X264_CPU_SSE2 )
    {
        /* the SSE2 escape routine is only chosen when SSE2 is flagged as fast */
        if( cpu&X264_CPU_SSE2_IS_FAST )
            pf->nal_escape = x264_nal_escape_sse2;
    }
 #if ARCH_X86_64
    if( cpu&X264_CPU_LZCNT )
    {
        pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_lzcnt;
        pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_lzcnt;
        pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_lzcnt;
    }
    if( cpu&X264_CPU_SSSE3 )
    {
        /* only the two _rd variants have SSSE3 versions here */
        pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3;
        pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3;
        if( cpu&X264_CPU_LZCNT )
        {
            pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3_lzcnt;
            pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt;
        }
    }
    if( cpu&X264_CPU_AVX2 )
    {
        pf->nal_escape = x264_nal_escape_avx2;
        pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2;
    }
    if( cpu&X264_CPU_AVX512 )
    {
        pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx512;
        pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_avx512;
        pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_avx512;
    }
 #endif
 #endif
    /* the ARMv6 and AArch64 sections are identical: NEON nal_escape only */
 #if HAVE_ARMV6
    if( cpu&X264_CPU_NEON )
        pf->nal_escape = x264_nal_escape_neon;
 #endif
 #if HAVE_AARCH64
    if( cpu&X264_CPU_NEON )
        pf->nal_escape = x264_nal_escape_neon;
 #endif
 }
--- a/common/bitstream.h
+++ b/common/bitstream.h
@@ -0,0 +1,309 @@
 /*****************************************************************************
 * bitstream.h: bitstream writing
 *****************************************************************************
 * Copyright (C) 2003-2025 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_BS_H
 #define X264_BS_H
 /* One entry of a level-code table (element type of x264_level_token). */
 typedef struct
 {
    uint16_t i_bits;    /* presumably the code bits - confirm at the table's generator */
    uint8_t  i_size;    /* presumably the code length in bits - confirm */
    /* Next level table to use */
    uint8_t  i_next;
 } vlc_large_t;
 /* Bitstream writer state used by the bs_* helpers below. */
 typedef struct bs_s
 {
    uint8_t *p_start;   /* start of the buffer, rounded down to a 4-byte boundary by bs_init */
    uint8_t *p;         /* where the next 32-bit word will be stored */
    uint8_t *p_end;     /* one past the end of the buffer (set by bs_init, not checked by the writers here) */
    uintptr_t cur_bits; /* accumulator of bits not yet stored to memory */
    int     i_left;    /* number of bits still free in cur_bits */
    int     i_bits_encoded; /* RD only */
 } bs_t;
 /* Run/level description of a coefficient block.
 * NOTE(review): field meanings are inferred from their names only (index of
 * the last coefficient, bitmask of coefficient positions, list of levels) -
 * confirm against the code that fills this structure. */
 typedef struct
 {
    int32_t last;
    int32_t mask;
    ALIGNED_16( dctcoef level[18] );    /* 16-byte aligned for the vectorized producers (presumably) */
 } x264_run_level_t;
 /* Function table filled in by x264_bitstream_init() according to the CPU
 * flags it is given. */
 typedef struct
 {
    /* copy [src,end) to dst with escape bytes inserted; returns the new end of dst */
    uint8_t *(*nal_escape)( uint8_t *dst, uint8_t *src, uint8_t *end );
    /* CABAC residual coders; x264_bitstream_init only assigns these on x86-64
     * builds with HAVE_MMX, otherwise they are left NULL */
    void (*cabac_block_residual_internal)( dctcoef *l, int b_interlaced,
                                           intptr_t ctx_block_cat, x264_cabac_t *cb );
    void (*cabac_block_residual_rd_internal)( dctcoef *l, int b_interlaced,
                                              intptr_t ctx_block_cat, x264_cabac_t *cb );
    void (*cabac_block_residual_8x8_rd_internal)( dctcoef *l, int b_interlaced,
                                                  intptr_t ctx_block_cat, x264_cabac_t *cb );
 } x264_bitstream_function_t;
 /* x264_bitstream_init: fill *pf with the bitstream routines selected for the
 * cpu flag mask. The name is routed through x264_template() (defined elsewhere). */
 #define x264_bitstream_init x264_template(bitstream_init)
 void x264_bitstream_init( uint32_t cpu, x264_bitstream_function_t *pf );
 /* A larger level table size theoretically could help a bit at extremely
 * high bitrates, but the cost in cache is usually too high for it to be
 * useful.
 * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
 * FIXME: Do further testing? */
 #define LEVEL_TABLE_SIZE 128
 /* 7 tables of LEVEL_TABLE_SIZE entries each; defined in another file */
 #define x264_level_token x264_template(level_token)
 extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
 /* The longest possible set of zero run codes sums to 25 bits.  This leaves
 * plenty of room for both the code (25 bits) and size (5 bits) in a uint32_t. */
 #define x264_run_before x264_template(run_before)
 extern uint32_t x264_run_before[1<<16];
 /* Start writing at p_data (i_data bytes available).
 * The write pointer is rounded down to a 4-byte boundary. If p_data was not
 * aligned, the bytes between that boundary and p_data are loaded into
 * cur_bits so that they are stored back unchanged by the first word write. */
 static inline void bs_init( bs_t *s, void *p_data, int i_data )
 {
    uint8_t *data = (uint8_t*)p_data;
    int misalign = ((intptr_t)p_data & 3);
    s->p_start = data - misalign;
    s->p       = s->p_start;
    s->p_end   = data + i_data;
    s->i_left  = (WORD_SIZE - misalign)*8;
    if( !misalign )
    {
        s->cur_bits = 0;
        return;
    }
    /* keep only the `misalign` bytes that precede p_data in this word */
    s->cur_bits = endian_fix32( M32(s->p) );
    s->cur_bits >>= (4-misalign)*8;
 }
 /* Current position in bits relative to p_start: 8 bits for every byte already
 * stored plus the bits still held in cur_bits. */
 static inline int bs_pos( bs_t *s )
 {
    int pending_bits = (WORD_SIZE*8) - s->i_left;
    return( 8 * (s->p - s->p_start) + pending_bits );
 }
 /* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned.
 * The pending bits are shifted to the top of a 32-bit word and stored, p is
 * advanced by WORD_SIZE - i_left/8 bytes and the accumulator is marked empty.
 * The bs_align_* helpers pad to a byte boundary before calling this. */
 static inline void bs_flush( bs_t *s )
 {
    /* i_left&31: a full 64-bit accumulator with 32 free bits needs no shift */
    M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
    s->p += WORD_SIZE - (s->i_left >> 3);
    s->i_left = WORD_SIZE*8;
 }
 /* The inverse of bs_flush: prepare the bitstream to be written to again.
 * If p is not 4-byte aligned, it is moved back to the previous boundary and
 * the bytes already written in that word are reloaded into cur_bits (same
 * steps as the unaligned case of bs_init). Nothing is done when p is aligned. */
 static inline void bs_realign( bs_t *s )
 {
    int offset = ((intptr_t)s->p & 3);
    if( offset )
    {
        s->p       = (uint8_t*)s->p - offset;
        s->i_left  = (WORD_SIZE - offset)*8;
        s->cur_bits = endian_fix32( M32(s->p) );
        /* drop the bytes of the word that lie at or after the old p */
        s->cur_bits >>= (4-offset)*8;
    }
 }
 /* Append the low i_count bits of i_bits to the stream.
 * i_bits is OR-ed into the accumulator without masking, so bits above
 * i_count must be zero (or repeat what is already there, see bs_write32).
 * Memory is always written in 32-bit units through M32. */
 static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
 {
    if( WORD_SIZE == 8 )
    {
        /* 64-bit accumulator: append first, then store the oldest 32 bits
         * once no more than 32 bits of room remain */
        s->cur_bits = (s->cur_bits << i_count) | i_bits;
        s->i_left -= i_count;
        if( s->i_left <= 32 )
        {
 #if WORDS_BIGENDIAN
            M32( s->p ) = s->cur_bits >> (32 - s->i_left);
 #else
            M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
 #endif
            s->i_left += 32;
            s->p += 4;
        }
    }
    else
    {
        if( i_count < s->i_left )
        {
            /* fits with room to spare: just append */
            s->cur_bits = (s->cur_bits << i_count) | i_bits;
            s->i_left -= i_count;
        }
        else
        {
            /* fill the word with the top i_left bits of the code, store it,
             * and keep the remaining i_count bits for the next word */
            i_count -= s->i_left;
            s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
            M32( s->p ) = endian_fix( s->cur_bits );
            s->p += 4;
            s->cur_bits = i_bits;
            s->i_left = 32 - i_count;
        }
    }
 }
 /* Special case to eliminate branch in normal bs_write. */
 /* Golomb never writes an even-size code, so this is only used in slice headers. */
 /* Writes a full 32-bit value as two 16-bit halves, high half first.
 * NOTE(review): the second call passes i_bits unmasked; its upper 16 bits are
 * the very bits the first call just appended, so OR-ing them in again looks
 * harmless - confirm against bs_write before changing either function. */
 static inline void bs_write32( bs_t *s, uint32_t i_bits )
 {
    bs_write( s, 16, i_bits >> 16 );
    bs_write( s, 16, i_bits );
 }
 /* Append a single bit (i_bit must be 0 or 1; it is OR-ed in unmasked).
 * A 32-bit word is stored as soon as 32 bits have accumulated. */
 static inline void bs_write1( bs_t *s, uint32_t i_bit )
 {
    s->cur_bits <<= 1;
    s->cur_bits |= i_bit;
    s->i_left--;
    /* exactly 32 bits pending (i_left counts down from WORD_SIZE*8) */
    if( s->i_left == WORD_SIZE*8-32 )
    {
        M32( s->p ) = endian_fix32( s->cur_bits );
        s->p += 4;
        s->i_left = WORD_SIZE*8;
    }
 }
 /* Pad with zero bits up to the next byte boundary, then flush. */
 static inline void bs_align_0( bs_t *s )
 {
    int pad_bits = s->i_left&7;
    bs_write( s, pad_bits, 0 );
    bs_flush( s );
 }
 /* Pad with one bits up to the next byte boundary, then flush. */
 static inline void bs_align_1( bs_t *s )
 {
    int pad_bits = s->i_left&7;
    bs_write( s, pad_bits, (1 << pad_bits) - 1 );
    bs_flush( s );
 }
 /* Pad up to the next byte boundary with a single one bit followed by zero
 * bits, then flush. Nothing is padded when already byte aligned. */
 static inline void bs_align_10( bs_t *s )
 {
    int pad_bits = s->i_left&7;
    if( pad_bits )
        bs_write( s, pad_bits, 1 << ( pad_bits - 1 ) );
    bs_flush( s );
 }
 /* golomb functions */
 /* x264_ue_size_tab[v] = 2*floor(log2(v)) + 1 for v in 1..255, i.e. the bit
 * length of the exp-Golomb code whose value-plus-one is v; entry 0 is 1.
 * Callers index it with val+1, or with the top byte of a larger value and
 * then add 16. */
 static const uint8_t x264_ue_size_tab[256] =
 {
     1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
     9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
    11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
 };
 /* Exp-Golomb write for values too large for bs_write_ue: the code length is
 * found by narrowing val+1 down to one table byte, then the leading zeros and
 * the value are emitted with two separate bs_write calls. */
 static inline void bs_write_ue_big( bs_t *s, unsigned int val )
 {
    val++;
    int narrowed = val;
    int size = 0;
    if( narrowed >= 0x10000 )
    {
        size = 32;
        narrowed >>= 16;
    }
    if( narrowed >= 0x100 )
    {
        size += 16;
        narrowed >>= 8;
    }
    size += x264_ue_size_tab[narrowed];
    int zeros = size>>1;            /* size is 2*zeros + 1 */
    bs_write( s, zeros, 0 );
    bs_write( s, zeros+1, val );
 }
 /* Exp-Golomb write of val. Only works on values under 255, because val+1 is
 * used directly as an index into the 256-entry size table. */
 static inline void bs_write_ue( bs_t *s, int val )
 {
    int code = val+1;
    bs_write( s, x264_ue_size_tab[code], code );
 }
 /* Signed exp-Golomb write. val is first mapped to the code value
 * (val <= 0 ? -val*2+1 : val*2); the size lookup handles code values of up to
 * two bytes. */
 static inline void bs_write_se( bs_t *s, int val )
 {
    /* Faster than the conditional form: try the non-positive mapping first
     * and fall back to val*2 when it turns out negative. */
    int code = 1 - val*2;
    if( code < 0 )
        code = val*2;
    int size;
    if( code >= 0x100 )
        size = 16 + x264_ue_size_tab[code >> 8];
    else
        size = x264_ue_size_tab[code];
    bs_write( s, size, code );
 }
 /* Truncated exp-Golomb write: with x == 1 a single inverted bit is written,
 * otherwise (x > 1 expected) val is written as ue. */
 static inline void bs_write_te( bs_t *s, int x, int val )
 {
    if( x != 1 )
    {
        bs_write_ue( s, val );
        return;
    }
    bs_write1( s, 1^val );
 }
 /* Write a one bit, then zero bits up to the next byte boundary.
 * Does not flush. */
 static inline void bs_rbsp_trailing( bs_t *s )
 {
    bs_write1( s, 1 );
    /* i_left must be read after the stop bit has been written */
    int pad_bits = s->i_left&7;
    bs_write( s, pad_bits, 0 );
 }
 /* Bit length of the ue code for val; val+1 indexes the 256-entry table
 * directly, so val must stay below 255. */
 static ALWAYS_INLINE int bs_size_ue( unsigned int val )
 {
    unsigned int code = val+1;
    return x264_ue_size_tab[code];
 }
 /* Bit length of the ue code for val when val+1 may need two bytes: values
 * from 255 up are looked up by the top byte of val+1, plus 16. */
 static ALWAYS_INLINE int bs_size_ue_big( unsigned int val )
 {
    if( val >= 255 )
        return x264_ue_size_tab[(val+1)>>8] + 16;
    return x264_ue_size_tab[val+1];
 }
 /* Bit length of the se code for val, using the same signed-to-code mapping
 * as bs_write_se (val <= 0 -> 1-2*val, val > 0 -> 2*val). */
 static ALWAYS_INLINE int bs_size_se( int val )
 {
    int code = 1 - val*2;
    if( code < 0 )
        code = val*2;
    return code < 256 ? x264_ue_size_tab[code]
                      : x264_ue_size_tab[code>>8]+16;
 }
 /* Bit length of the te code: one bit when x == 1, otherwise (x > 1 expected)
 * the ue length of val. */
 static ALWAYS_INLINE int bs_size_te( int x, int val )
 {
    return x == 1 ? 1 : x264_ue_size_tab[val+1];
 }
 #endif
--- a/common/cabac.c
+++ b/common/cabac.c
@@ -0,0 +1,184 @@
 /*****************************************************************************
 * cabac.c: arithmetic coder
 *****************************************************************************
 * Copyright (C) 2003-2025 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "common.h"
 /* Precomputed initial context states, indexed [init table][qp][context];
 * table 0 comes from the I table, tables 1..3 from the three PB tables. */
 static uint8_t cabac_contexts[4][QP_MAX_SPEC+1][1024];
 /* Fill cabac_contexts for every init table and every QP in 0..QP_MAX_SPEC.
 * Only the first 460 contexts are computed unless CHROMA444 is set.
 * Each stored byte is (min(state, 127-state) << 1) | (state >> 6). */
 void x264_cabac_init( x264_t *h )
 {
    const int ctx_count = CHROMA444 ? 1024 : 460;
    for( int tab = 0; tab < 4; tab++ )
    {
        const int8_t (*init)[1024][2] = tab ? &x264_cabac_context_init_PB[tab-1]
                                            : &x264_cabac_context_init_I;
        for( int qp = 0; qp <= QP_MAX_SPEC; qp++ )
        {
            uint8_t *out = cabac_contexts[tab][qp];
            for( int ctx = 0; ctx < ctx_count; ctx++ )
            {
                /* linear model in qp: slope from [0], offset from [1] */
                int slope  = (*init)[ctx][0];
                int offset = (*init)[ctx][1];
                int state = x264_clip3( ((slope * qp) >> 4) + offset, 1, 126 );
                out[ctx] = (X264_MIN( state, 127-state ) << 1) | (state >> 6);
            }
        }
    }
 }
 /* Load cb->state from the precomputed table for this slice type, QP and
 * model: I slices use table 0, all other slice types use table i_model+1.
 * 460 bytes are copied, or 1024 when CHROMA444 is set. */
 void x264_cabac_context_init( x264_t *h, x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model )
 {
    int tab = 0;
    if( i_slice_type != SLICE_TYPE_I )
        tab = i_model + 1;
    memcpy( cb->state, cabac_contexts[tab][i_qp], CHROMA444 ? 1024 : 460 );
 }
 /* Reset the arithmetic coder state (interval, bit queue and count of
 * outstanding bytes); the output pointers are left untouched. */
 void x264_cabac_encode_init_core( x264_cabac_t *cb )
 {
    cb->i_bytes_outstanding = 0;
    cb->i_queue = -9; // the first bit will be shifted away and not written
    cb->i_range = 0x01FE;
    cb->i_low   = 0;
 }
 /* Reset the coder state and point the output at the buffer [p_data, p_end). */
 void x264_cabac_encode_init( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end )
 {
    x264_cabac_encode_init_core( cb );
    cb->p_end   = p_end;
    cb->p_start = cb->p = p_data;
 }
 /* Emit one byte from the top of i_low once at least 8 bits are queued
 * (i_queue >= 0). A 0xff byte is not written immediately but counted in
 * i_bytes_outstanding, because a later carry could still change it. */
 static inline void cabac_putbyte( x264_cabac_t *cb )
 {
    if( cb->i_queue >= 0 )
    {
        /* out holds the byte in its low 8 bits plus a possible carry above it */
        int out = cb->i_low >> (cb->i_queue+10);
        cb->i_low &= (0x400<<cb->i_queue)-1;
        cb->i_queue -= 8;
        if( (out & 0xff) == 0xff )
            cb->i_bytes_outstanding++;
        else
        {
            int carry = out >> 8;
            int bytes_outstanding = cb->i_bytes_outstanding;
            // this can't modify before the beginning of the stream because
            // that would correspond to a probability > 1.
            // it will write before the beginning of the stream, which is ok
            // because a slice header always comes before cabac data.
            // this can't carry beyond the one byte, because any 0xff bytes
            // are in bytes_outstanding and thus not written yet.
            cb->p[-1] += carry;
            // deferred bytes come out as 0xff without a carry (carry-1 == -1)
            // and as 0x00 with one (carry-1 == 0)
            while( bytes_outstanding > 0 )
            {
                *(cb->p++) = (uint8_t)(carry-1);
                bytes_outstanding--;
            }
            *(cb->p++) = (uint8_t)out;
            cb->i_bytes_outstanding = 0;
        }
    }
 }
 /* Renormalize: look up the shift for the current range, scale range and low
 * by it, account for the new bits in the queue and emit a byte if one is ready. */
 static inline void cabac_encode_renorm( x264_cabac_t *cb )
 {
    const int n = x264_cabac_renorm_shift[cb->i_range>>3];
    cb->i_queue  += n;
    cb->i_low   <<= n;
    cb->i_range <<= n;
    cabac_putbyte( cb );
 }
 /* Encode bin b with context i_ctx and update that context's state.
 * The context byte keeps the most probable symbol in bit 0 and the state
 * index in the bits above it.
 * Making custom versions of this function, even in asm, for the cases where
 * b is known to be 0 or 1, proved to be somewhat useful on x86_32 with GCC 3.4
 * but nearly useless with GCC 4.3 and worse than useless on x86_64. */
 void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b )
 {
    int i_state = cb->state[i_ctx];
    int i_range_lps = x264_cabac_range_lps[i_state>>1][(cb->i_range>>6)-4];
    int i_range_mps = cb->i_range - i_range_lps;
    if( b == (i_state & 1) )
        cb->i_range = i_range_mps;
    else
    {
        /* less probable symbol: skip the MPS sub-interval, keep the LPS part */
        cb->i_low += i_range_mps;
        cb->i_range = i_range_lps;
    }
    cb->state[i_ctx] = x264_cabac_transition[i_state][b];
    cabac_encode_renorm( cb );
 }
 /* Encode one bypass bin.
 * Note: b is negated for this function - it is used as a bit mask on
 * i_range, so the caller passes 0 or -1 (all ones). */
 void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b )
 {
    cb->i_queue += 1;
    cb->i_low <<= 1;
    cb->i_low += cb->i_range & b;
    cabac_putbyte( cb );
 }
 /* Prefix patterns for x264_cabac_encode_ue_bypass, indexed by the number of
 * prefix bits (k - exp_bits); shifted left by exp_bits and added to v they
 * form the complete code word. Entry 0 is -1.
 * NOTE(review): the exact bit layout of the entries is not derivable from
 * this file alone - confirm before editing the table. */
 static const int bypass_lut[16] =
 {
    -1,      0x2,     0x14,     0x68,     0x1d0,     0x7a0,     0x1f40,     0x7e80,
    0x1fd00, 0x7fa00, 0x1ff400, 0x7fe800, 0x1ffd000, 0x7ffa000, 0x1fff4000, 0x7ffe8000
 };
 /* Encode val as an exp-Golomb code of order exp_bits using bypass bins.
 * The whole code word is built in x and then pushed into the coder in chunks
 * of at most 8 bits: several bypass bins at once, by multiplying the chunk
 * with i_range. */
 void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val )
 {
    uint32_t v = val + (1<<exp_bits);
    int k = 31 - x264_clz( v );         /* position of the highest set bit of v */
    uint32_t x = ((uint32_t)bypass_lut[k-exp_bits]<<exp_bits) + v;
    k = 2*k+1-exp_bits;                 /* from here on k = number of bits left to write */
    int i = ((k-1)&7)+1;                /* first chunk: 1..8 bits, so the rest is a multiple of 8 */
    do {
        k -= i;
        cb->i_low <<= i;
        cb->i_low += ((x>>k)&0xff) * cb->i_range;
        cb->i_queue += i;
        cabac_putbyte( cb );
        i = 8;
    } while( k > 0 );
 }
 /* Encode a terminal bin on the non-terminating path: the range is reduced
 * by 2, low is left unchanged, and the coder is renormalized. */
 void x264_cabac_encode_terminal_c( x264_cabac_t *cb )
 {
    cb->i_range = cb->i_range - 2;
    cabac_encode_renorm( cb );
 }
 /* Terminate the arithmetic code: fold the final interval into i_low, push
 * out the remaining bytes and write any deferred 0xff bytes. */
 void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb )
 {
    cb->i_low += cb->i_range - 2;
    cb->i_low |= 1;
    cb->i_low <<= 9;
    cb->i_queue += 9;
    cabac_putbyte( cb );
    cabac_putbyte( cb );
    /* i_queue is negative here: align what is left to the next byte */
    cb->i_low <<= -cb->i_queue;
    /* one bit selected from the constant 0x35a4e4f5 by the low 5 bits of
     * h->i_frame is OR-ed in at bit 10
     * NOTE(review): the purpose of this per-frame bit is not stated in this file */
    cb->i_low |= (0x35a4e4f5 >> (h->i_frame & 31) & 1) << 10;
    cb->i_queue = 0;
    cabac_putbyte( cb );
    /* no carry can arrive any more, so deferred bytes are plain 0xff */
    while( cb->i_bytes_outstanding > 0 )
    {
        *(cb->p++) = 0xff;
        cb->i_bytes_outstanding--;
    }
 }
--- a/common/cabac.h
+++ b/common/cabac.h
@@ -0,0 +1,126 @@
 /*****************************************************************************
 * cabac.h: arithmetic coder
 *****************************************************************************
 * Copyright (C) 2003-2025 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_CABAC_H
 #define X264_CABAC_H
 /* CABAC encoder state: interval registers, output-byte bookkeeping and the
  * per-context state table.
  * NOTE(review): the comments below mention asm and an aligned memcpy, so the
  * field order and alignment are presumably relied upon elsewhere -- confirm
  * before reordering anything. */
 typedef struct
 {
    /* state */
    int i_low;   /* low end of the coding interval */
    int i_range; /* width of the coding interval */
    /* bit stream */
    int i_queue; //stored with an offset of -8 for faster asm
    int i_bytes_outstanding; /* bytes counted but not yet written; flushed as 0xff by x264_cabac_encode_flush() */
    uint8_t *p_start; /* reference point for x264_cabac_pos() */
    uint8_t *p;       /* write cursor, advanced as bytes are emitted */
    uint8_t *p_end;   /* presumably the end of the output buffer -- confirm */
    /* aligned for memcpy_aligned starting here */
    ALIGNED_64( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
    /* context */
    uint8_t state[1024]; /* one state byte per context, indexed by i_ctx */
    /* for 16-byte alignment */
    uint8_t padding[12];
 } x264_cabac_t;
 /* Every public name below is first mapped through x264_template() to its
  * bit-depth-specific symbol and then declared. */
 /* initialise the contexts given i_slice_type, the quantizer (i_qp) and the model (i_model) */
 #define x264_cabac_context_init x264_template(cabac_context_init)
 void x264_cabac_context_init( x264_t *h, x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model );
 /* encoder initialisation: the core variant takes only the state, the full
  * variant also takes the output buffer bounds */
 #define x264_cabac_encode_init_core x264_template(cabac_encode_init_core)
 void x264_cabac_encode_init_core( x264_cabac_t *cb );
 #define x264_cabac_encode_init x264_template(cabac_encode_init)
 void x264_cabac_encode_init( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end );
 /* context-coded bin, C and asm versions */
 #define x264_cabac_encode_decision_c x264_template(cabac_encode_decision_c)
 void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b );
 #define x264_cabac_encode_decision_asm x264_template(cabac_encode_decision_asm)
 void x264_cabac_encode_decision_asm( x264_cabac_t *cb, int i_ctx, int b );
 /* bypass bin, C and asm versions */
 #define x264_cabac_encode_bypass_c x264_template(cabac_encode_bypass_c)
 void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b );
 #define x264_cabac_encode_bypass_asm x264_template(cabac_encode_bypass_asm)
 void x264_cabac_encode_bypass_asm( x264_cabac_t *cb, int b );
 /* terminal bin, C and asm versions */
 #define x264_cabac_encode_terminal_c x264_template(cabac_encode_terminal_c)
 void x264_cabac_encode_terminal_c( x264_cabac_t *cb );
 #define x264_cabac_encode_terminal_asm x264_template(cabac_encode_terminal_asm)
 void x264_cabac_encode_terminal_asm( x264_cabac_t *cb );
 /* multi-bit bypass code word parameterised by exp_bits */
 #define x264_cabac_encode_ue_bypass x264_template(cabac_encode_ue_bypass)
 void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val );
 /* finish the code word and write out pending bytes */
 #define x264_cabac_encode_flush x264_template(cabac_encode_flush)
 void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb );
 /* Pick the implementation behind the generic encode names: the asm versions
  * when either HAVE_MMX or HAVE_AARCH64 is set, the C versions otherwise. */
 #if HAVE_MMX || HAVE_AARCH64
 #define x264_cabac_encode_decision x264_cabac_encode_decision_asm
 #define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm
 #define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm
 #else
 #define x264_cabac_encode_decision x264_cabac_encode_decision_c
 #define x264_cabac_encode_bypass x264_cabac_encode_bypass_c
 #define x264_cabac_encode_terminal x264_cabac_encode_terminal_c
 #endif
 /* when actually encoding, the "no update" variant is the ordinary decision encoder */
 #define x264_cabac_encode_decision_noup x264_cabac_encode_decision
 /* Position in bits: eight times the bytes between p_start and p plus the
  * outstanding bytes, plus i_queue. */
 static ALWAYS_INLINE int x264_cabac_pos( x264_cabac_t *cb )
 {
    return 8 * ((cb->p - cb->p_start) + cb->i_bytes_outstanding) + cb->i_queue;
 }
 /* internal only. these don't write the bitstream, just calculate bit cost: */
 /* Add the cost of coding bin b in context i_ctx to cb->f8_bits_encoded and
  * advance that context's state. */
 static ALWAYS_INLINE void x264_cabac_size_decision( x264_cabac_t *cb, long i_ctx, long b )
 {
    const int prev_state = cb->state[i_ctx];
    cb->f8_bits_encoded += x264_cabac_entropy[prev_state^b];
    cb->state[i_ctx] = x264_cabac_transition[prev_state][b];
 }
 /* Same as x264_cabac_size_decision() but works on a single state byte:
  * returns the cost of coding bin b and advances *state. */
 static ALWAYS_INLINE int x264_cabac_size_decision2( uint8_t *state, long b )
 {
    const int prev_state = *state;
    const int cost = x264_cabac_entropy[prev_state^b];
    *state = x264_cabac_transition[prev_state][b];
    return cost;
 }
 /* "No update" cost: adds the cost of coding bin b in context i_ctx to
  * cb->f8_bits_encoded and leaves the context state as it is. */
 static ALWAYS_INLINE void x264_cabac_size_decision_noup( x264_cabac_t *cb, long i_ctx, long b )
 {
    cb->f8_bits_encoded += x264_cabac_entropy[cb->state[i_ctx]^b];
 }
 /* "No update" cost on a single state byte: returns the cost of coding bin b
  * and does not modify *state. */
 static ALWAYS_INLINE int x264_cabac_size_decision_noup2( uint8_t *state, long b )
 {
    const int cur_state = *state;
    return x264_cabac_entropy[cur_state^b];
 }
 #endif
--- a/common/common.c
+++ b/common/common.c
@@ -0,0 +1,44 @@
 /*****************************************************************************
 * common.c: misc common functions
 *****************************************************************************
 * Copyright (C) 2003-2025 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "common.h"
 /****************************************************************************
 * x264_log:
 ****************************************************************************/
 /* printf-style logging entry point.
  *   h        - encoder handle, may be NULL
  *   i_level  - level of this message
  *   psz_fmt  - format string, followed by its arguments
  * With a NULL handle the message always goes to x264_log_default().
  * Otherwise it is dropped when i_level is above h->param.i_log_level and
  * handed to the h->param.pf_log callback when it is not. */
 void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... )
 {
    /* filtered out by the configured log level */
    if( h && i_level > h->param.i_log_level )
        return;
    va_list arg;
    va_start( arg, psz_fmt );
    if( h )
        h->param.pf_log( h->param.p_log_private, i_level, psz_fmt, arg );
    else
        x264_log_default( NULL, i_level, psz_fmt, arg );
    va_end( arg );
 }
--- a/common/common.h
+++ b/common/common.h
@@ -0,0 +1,813 @@
 /*****************************************************************************
 * common.h: misc common functions
 *****************************************************************************
 * Copyright (C) 2003-2025 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_COMMON_H
 #define X264_COMMON_H
 #include "base.h"
 /* Macros for templating function calls according to bit depth.
  * x264_template(w) passes x264, BIT_DEPTH and w to x264_glue3(), which is
  * defined elsewhere (presumably it joins them into one identifier -- confirm). */
 #define x264_template(w) x264_glue3(x264, BIT_DEPTH, w)
 /****************************************************************************
 * API Templates
 ****************************************************************************/
 /* Map the API entry points to their x264_template() names. */
 #define x264_nal_encode x264_template(nal_encode)
 #define x264_encoder_reconfig x264_template(encoder_reconfig)
 #define x264_encoder_parameters x264_template(encoder_parameters)
 #define x264_encoder_headers x264_template(encoder_headers)
 #define x264_encoder_encode x264_template(encoder_encode)
 #define x264_encoder_close x264_template(encoder_close)
 #define x264_encoder_delayed_frames x264_template(encoder_delayed_frames)
 #define x264_encoder_maximum_delayed_frames x264_template(encoder_maximum_delayed_frames)
 #define x264_encoder_intra_refresh x264_template(encoder_intra_refresh)
 #define x264_encoder_invalidate_reference x264_template(encoder_invalidate_reference)
 /* x264_encoder_open is already a macro at this point: it is used to rename
  * the external symbol and force a link failure in case of incompatible
  * libraries. The undef removes that definition so that the define below can
  * apply the same templating as above. */
 #undef x264_encoder_open
 #define x264_encoder_open x264_template(encoder_open)
 /****************************************************************************
 * Macros
 ****************************************************************************/
 /* FRAME_SIZE of 256 samples at BIT_DEPTH bits each, plus 16 (presumably the
  * bit cost of a PCM macroblock -- confirm) */
 #define X264_PCM_COST (FRAME_SIZE(256*BIT_DEPTH)+16)
 /* 6 QP steps for every bit of depth above 8 */
 #define QP_BD_OFFSET (6*(BIT_DEPTH-8))
 /* 51 plus the bit-depth offset */
 #define QP_MAX_SPEC (51+QP_BD_OFFSET)
 /* 18 steps above QP_MAX_SPEC */
 #define QP_MAX (QP_MAX_SPEC+18)
 /* largest sample value representable in BIT_DEPTH bits */
 #define PIXEL_MAX ((1 << BIT_DEPTH)-1)
 // arbitrary, but low because SATD scores are 1/4 normal
 #define X264_LOOKAHEAD_QP (12+QP_BD_OFFSET)
 /* clamp a QP to QP_MAX_SPEC */
 #define SPEC_QP(x) X264_MIN((x), QP_MAX_SPEC)
 #define NALU_OVERHEAD 5 // startcode + NAL type costs 5 bytes per frame
 #define FILLER_OVERHEAD (NALU_OVERHEAD+1)
 /* NALU_OVERHEAD, minus one when b_annexb is set, i_avcintra_class is zero and
  * i_nal-1 is non-zero. Uses a variable named h from the enclosing scope. */
 #define SEI_OVERHEAD (NALU_OVERHEAD - (h->param.b_annexb && !h->param.i_avcintra_class && (h->out.i_nal-1)))
 /* Interlacing state. With HAVE_INTERLACED these read fields of a variable
  * named h in the enclosing scope; without it they are the constant 0. */
 #if HAVE_INTERLACED
 #   define MB_INTERLACED h->mb.b_interlaced
 #   define SLICE_MBAFF h->sh.b_mbaff
 #   define PARAM_INTERLACED h->param.b_interlaced
 #else
 #   define MB_INTERLACED 0
 #   define SLICE_MBAFF 0
 #   define PARAM_INTERLACED 0
 #endif
 /* Chroma format helpers. If CHROMA_FORMAT is already defined the shifts are
  * derived from it; otherwise CHROMA_FORMAT and both shifts are read from a
  * variable named h in the enclosing scope. */
 #ifdef CHROMA_FORMAT
 #    define CHROMA_H_SHIFT (CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422)
 #    define CHROMA_V_SHIFT (CHROMA_FORMAT == CHROMA_420)
 #else
 #    define CHROMA_FORMAT h->sps->i_chroma_format_idc
 #    define CHROMA_H_SHIFT h->mb.chroma_h_shift
 #    define CHROMA_V_SHIFT h->mb.chroma_v_shift
 #endif
 /* s shifted right by both chroma shifts, or 0 when CHROMA_FORMAT is 0 */
 #define CHROMA_SIZE(s) (CHROMA_FORMAT ? (s)>>(CHROMA_H_SHIFT+CHROMA_V_SHIFT) : 0)
 /* s plus two times CHROMA_SIZE(s) */
 #define FRAME_SIZE(s) ((s)+2*CHROMA_SIZE(s))
 #define CHROMA444 (CHROMA_FORMAT == CHROMA_444)
 /* Sample and coefficient types. With HIGH_BIT_DEPTH a pixel is 16 bits and
  * four pixels fill 64 bits; otherwise a pixel is 8 bits and four fill 32. */
 #if HIGH_BIT_DEPTH
    typedef uint16_t pixel;
    typedef uint64_t pixel4;
    typedef int32_t  dctcoef;
    typedef uint32_t udctcoef;
 /* replicate x into each of the four 16-bit lanes */
 #   define PIXEL_SPLAT_X4(x) ((x)*0x0001000100010001ULL)
 /* four pixels accessed through M64 (defined elsewhere) */
 #   define MPIXEL_X4(src) M64(src)
 #else
    typedef uint8_t  pixel;
    typedef uint32_t pixel4;
    typedef int16_t  dctcoef;
    typedef uint16_t udctcoef;
 /* replicate x into each of the four 8-bit lanes */
 #   define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
 /* four pixels accessed through M32 (defined elsewhere) */
 #   define MPIXEL_X4(src) M32(src)
 #endif
 #define SIZEOF_PIXEL ((int)sizeof(pixel))
 /* copy four pixels from src to dst; expands to a bare assignment, so use it
  * as a statement rather than inside a larger expression */
 #define CPPIXEL_X4(dst,src) MPIXEL_X4(dst) = MPIXEL_X4(src)
 /****************************************************************************
 * Includes
 ****************************************************************************/
 #if HAVE_OPENCL
 #include "opencl.h"
 #endif
 #include "cabac.h"
 #include "bitstream.h"
 #include "set.h"
 #include "predict.h"
 #include "pixel.h"
 #include "mc.h"
 #include "frame.h"
 #include "dct.h"
 #include "quant.h"
 #include "threadpool.h"
 /****************************************************************************
 * General functions
 ****************************************************************************/
 /* log: printf-style message at i_level; h may be NULL */
 #define x264_log x264_template(log)
 void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
 /* entropy coder setup hooks, templated per bit depth like the rest */
 #define x264_cavlc_init x264_template(cavlc_init)
 void x264_cavlc_init( x264_t *h );
 #define x264_cabac_init x264_template(cabac_init)
 void x264_cabac_init( x264_t *h );
 /* Clamp x to the range [0, PIXEL_MAX]. */
 static ALWAYS_INLINE pixel x264_clip_pixel( int x )
 {
    /* no bit outside PIXEL_MAX is set: already in range */
    if( !(x & ~PIXEL_MAX) )
        return x;
    /* out of range: (-x)>>31 is all ones for x above the range and zero for
     * negative x (assumes a 32-bit int and an arithmetic right shift), which
     * selects PIXEL_MAX or 0 respectively */
    return ((-x)>>31) & PIXEL_MAX;
 }
 /****************************************************************************
 *
 ****************************************************************************/
 /* Slice header values. Most field names follow the corresponding H.264
  * slice header syntax elements (presumably one to one -- confirm against the
  * code that writes the header). */
 typedef struct
 {
    /* parameter sets this slice refers to */
    x264_sps_t *sps;
    x264_pps_t *pps;
    int i_type;
    int i_first_mb;
    int i_last_mb;
    int i_pps_id;
    int i_frame_num;
    int b_mbaff;
    int b_field_pic;
    int b_bottom_field;
    int i_idr_pic_id;   /* -1 if nal_type != 5 */
    int i_poc;
    int i_delta_poc_bottom;
    int i_delta_poc[2];
    int i_redundant_pic_cnt;
    int b_direct_spatial_mv_pred;
    int b_num_ref_idx_override;
    int i_num_ref_idx_l0_active;
    int i_num_ref_idx_l1_active;
    /* reference list reordering, one flag and one command array per list */
    int b_ref_pic_list_reordering[2];
    struct
    {
        int idc;
        int arg;
    } ref_pic_list_order[2][X264_REF_MAX];
    /* P-frame weighting */
    int b_weighted_pred;
    x264_weight_t weight[X264_REF_MAX*2][3];
    /* memory management control operations */
    int i_mmco_remove_from_end;
    int i_mmco_command_count;
    struct /* struct for future expansion */
    {
        int i_difference_of_pic_nums;
        int i_poc;
    } mmco[X264_REF_MAX];
    int i_cabac_init_idc;
    int i_qp;
    int i_qp_delta;
    /* NOTE(review): the name looks like a misspelling of sp_for_switch; it is
     * left alone because users outside this view may reference it */
    int b_sp_for_swidth;
    int i_qs_delta;
    /* deblocking filter */
    int i_disable_deblocking_filter_idc;
    int i_alpha_c0_offset;
    int i_beta_offset;
 } x264_slice_header_t;
 /* State of the lookahead stage: thread control flags, keyframe bookkeeping,
  * a thread handle and three synchronised frame lists.
  * NOTE(review): the per-field notes are inferred from the names; confirm
  * against the lookahead implementation. */
 typedef struct x264_lookahead_t
 {
    volatile uint8_t              b_exit_thread;      /* presumably set to ask the thread to stop */
    uint8_t                       b_thread_active;
    uint8_t                       b_analyse_keyframe;
    int                           i_last_keyframe;
    int                           i_slicetype_length;
    x264_frame_t                  *last_nonb;         /* presumably the most recent non-B frame */
    x264_pthread_t                thread_handle;
    x264_sync_frame_list_t        ifbuf;              /* presumably the input frame buffer */
    x264_sync_frame_list_t        next;
    x264_sync_frame_list_t        ofbuf;              /* presumably the output frame buffer */
 } x264_lookahead_t;
 typedef struct x264_ratecontrol_t   x264_ratecontrol_t;
 /* Five four-entry byte tables, one each for intra, nnz, nnz_chroma, mv and
  * ref data. struct x264_t points at one of these through
  * mb.left_index_table (presumably index tables for the left neighbour --
  * confirm at the use sites). */
 typedef struct x264_left_table_t
 {
    uint8_t intra[4];
    uint8_t nnz[4];
    uint8_t nnz_chroma[4];
    uint8_t mv[4];
    uint8_t ref[4];
 } x264_left_table_t;
 /* Current frame stats: bit counts, macroblock histograms and quality
  * metrics. struct x264_t embeds one as stat.frame next to int64_t
  * accumulators with matching names. */
 typedef struct
 {
    /* MV bits (MV+Ref+Block Type) */
    int i_mv_bits;
    /* Texture bits (DCT coefs) */
    int i_tex_bits;
    /* NOTE(review): the original comment here was just "?"; presumably the
     * bits that are neither MV nor texture bits -- confirm at the accounting sites */
    int i_misc_bits;
    /* MB type counts */
    int i_mb_count[19];
    int i_mb_count_i;
    int i_mb_count_p;
    int i_mb_count_skip;
    int i_mb_count_8x8dct[2];
    int i_mb_count_ref[2][X264_REF_MAX*2];
    int i_mb_partition[17];
    int i_mb_cbp[6];
    int i_mb_pred_mode[4][13];
    int i_mb_field[3];
    /* Adaptive direct mv pred */
    int i_direct_score[2];
    /* Metrics */
    int64_t i_ssd[3];
    double f_ssim;
    int i_ssim_cnt;
 } x264_frame_stat_t;
 struct x264_t
 {
    /* encoder parameters */
    x264_param_t    param;
    /* opaque pointer to bit depth independent interface */
    void            *api;
    x264_t          *thread[X264_THREAD_MAX+1];
    x264_t          *lookahead_thread[X264_LOOKAHEAD_THREAD_MAX];
    int             b_thread_active;
    int             i_thread_phase; /* which thread to use for the next frame */
    int             i_thread_idx;   /* which thread this is */
    int             i_threadslice_start; /* first row in this thread slice */
    int             i_threadslice_end; /* row after the end of this thread slice */
    int             i_threadslice_pass; /* which pass of encoding we are on */
    x264_threadpool_t *threadpool;
    x264_threadpool_t *lookaheadpool;
    x264_pthread_mutex_t mutex;
    x264_pthread_cond_t cv;
    /* bitstream output */
    struct
    {
        int         i_nal;
        int         i_nals_allocated;
        x264_nal_t  *nal;
        int         i_bitstream;    /* size of p_bitstream */
        uint8_t     *p_bitstream;   /* will hold data for all nal */
        bs_t        bs;
    } out;
    uint8_t *nal_buffer;
    int      nal_buffer_size;
    x264_t          *reconfig_h;
    int             reconfig;
    /**** thread synchronization starts here ****/
    /* frame number/poc */
    int             i_frame;
    int             i_frame_num;
    int             i_thread_frames; /* Number of different frames being encoded by threads;
                                      * 1 when sliced-threads is on. */
    int             i_nal_type;
    int             i_nal_ref_idc;
    int64_t         i_disp_fields;  /* Number of displayed fields (both coded and implied via pic_struct) */
    int             i_disp_fields_last_frame;
    int64_t         i_prev_duration; /* Duration of previous frame */
    int64_t         i_coded_fields; /* Number of coded fields (both coded and implied via pic_struct) */
    int64_t         i_cpb_delay;    /* Equal to number of fields preceding this field
                                     * since last buffering_period SEI */
    int64_t         i_coded_fields_lookahead; /* Use separate counters for lookahead */
    int64_t         i_cpb_delay_lookahead;
    int64_t         i_cpb_delay_pir_offset;
    int64_t         i_cpb_delay_pir_offset_next;
    int             b_queued_intra_refresh;
    int64_t         i_last_idr_pts;
    int             i_idr_pic_id;
    /* quantization matrix for decoding, [cqm][qp%6][coef] */
    int             (*dequant4_mf[4])[16];   /* [4][6][16] */
    int             (*dequant8_mf[4])[64];   /* [4][6][64] */
    /* quantization matrix for trellis, [cqm][qp][coef] */
    int             (*unquant4_mf[4])[16];   /* [4][QP_MAX_SPEC+1][16] */
    int             (*unquant8_mf[4])[64];   /* [4][QP_MAX_SPEC+1][64] */
    /* quantization matrix for deadzone */
    udctcoef        (*quant4_mf[4])[16];     /* [4][QP_MAX_SPEC+1][16] */
    udctcoef        (*quant8_mf[4])[64];     /* [4][QP_MAX_SPEC+1][64] */
    udctcoef        (*quant4_bias[4])[16];   /* [4][QP_MAX_SPEC+1][16] */
    udctcoef        (*quant8_bias[4])[64];   /* [4][QP_MAX_SPEC+1][64] */
    udctcoef        (*quant4_bias0[4])[16];  /* [4][QP_MAX_SPEC+1][16] */
    udctcoef        (*quant8_bias0[4])[64];  /* [4][QP_MAX_SPEC+1][64] */
    udctcoef        (*nr_offset_emergency)[4][64];
    /* mv/ref/mode cost arrays. */
    uint16_t *cost_mv[QP_MAX+1];
    uint16_t *cost_mv_fpel[QP_MAX+1][4];
    struct
    {
        uint16_t ref[QP_MAX+1][3][33];
        uint16_t i4x4_mode[QP_MAX+1][17];
    } *cost_table;
    const uint8_t   *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */
    /* Slice header */
    x264_slice_header_t sh;
    /* SPS / PPS */
    x264_sps_t      sps[1];
    x264_pps_t      pps[1];
    /* Slice header backup, for SEI_DEC_REF_PIC_MARKING */
    int b_sh_backup;
    x264_slice_header_t sh_backup;
    /* cabac context */
    x264_cabac_t    cabac;
    struct
    {
        /* Frames to be encoded (whose types have been decided) */
        x264_frame_t **current;
        /* Unused frames: 0 = fenc, 1 = fdec */
        x264_frame_t **unused[2];
        /* Unused blank frames (for duplicates) */
        x264_frame_t **blank_unused;
        /* frames used for reference + sentinels */
        x264_frame_t *reference[X264_REF_MAX+2];
        int i_last_keyframe;       /* Frame number of the last keyframe */
        int i_last_idr;            /* Frame number of the last IDR (not RP)*/
        int i_poc_last_open_gop;   /* Poc of the I frame of the last open-gop. The value
                                    * is only assigned during the period between that
                                    * I frame and the next P or I frame, else -1 */
        int i_input;    /* Number of input frames already accepted */
        int i_max_dpb;  /* Number of frames allocated in the decoded picture buffer */
        int i_max_ref0;
        int i_max_ref1;
        int i_delay;    /* Number of frames buffered for B reordering */
        int     i_bframe_delay;
        int64_t i_bframe_delay_time;
        int64_t i_first_pts;
        int64_t i_prev_reordered_pts[2];
        int64_t i_largest_pts;
        int64_t i_second_largest_pts;
        int b_have_lowres;  /* Whether 1/2 resolution luma planes are being used */
        int b_have_sub8x8_esa;
    } frames;
    /* current frame being encoded */
    x264_frame_t    *fenc;
    /* frame being reconstructed */
    x264_frame_t    *fdec;
    /* references lists */
    int             i_ref[2];
    x264_frame_t    *fref[2][X264_REF_MAX+3];
    x264_frame_t    *fref_nearest[2];
    int             b_ref_reorder[2];
    /* hrd */
    int initial_cpb_removal_delay;
    int initial_cpb_removal_delay_offset;
    int64_t i_reordered_pts_delay;
    /* Current MB DCT coeffs */
    struct
    {
        ALIGNED_64( dctcoef luma16x16_dc[3][16] );
        ALIGNED_16( dctcoef chroma_dc[2][8] );
        // FIXME share memory?
        ALIGNED_64( dctcoef luma8x8[12][64] );
        ALIGNED_64( dctcoef luma4x4[16*3][16] );
    } dct;
    /* MB table and cache for current frame/mb */
    struct
    {
        int     i_mb_width;
        int     i_mb_height;
        int     i_mb_count;                 /* number of mbs in a frame */
        /* Chroma subsampling */
        int     chroma_h_shift;
        int     chroma_v_shift;
        /* Strides */
        int     i_mb_stride;
        int     i_b8_stride;
        int     i_b4_stride;
        int     left_b8[2];
        int     left_b4[2];
        /* Current index */
        int     i_mb_x;
        int     i_mb_y;
        int     i_mb_xy;
        int     i_b8_xy;
        int     i_b4_xy;
        /* Search parameters */
        int     i_me_method;
        int     i_subpel_refine;
        int     b_chroma_me;
        int     b_trellis;
        int     b_noise_reduction;
        int     b_dct_decimate;
        int     i_psy_rd; /* Psy RD strength--fixed point value*/
        int     i_psy_trellis; /* Psy trellis strength--fixed point value*/
        int     b_interlaced;
        int     b_adaptive_mbaff; /* MBAFF+subme 0 requires non-adaptive MBAFF i.e. all field mbs */
        /* Allowed qpel MV range to stay within the picture + emulated edge pixels */
        int     mv_min[2];
        int     mv_max[2];
        int     mv_miny_row[3]; /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
        int     mv_maxy_row[3];
        /* Subpel MV range for motion search.
         * same mv_min/max but includes levels' i_mv_range. */
        int     mv_min_spel[2];
        int     mv_max_spel[2];
        int     mv_miny_spel_row[3];
        int     mv_maxy_spel_row[3];
        /* Fullpel MV range for motion search */
        ALIGNED_8( int16_t mv_limit_fpel[2][2] ); /* min_x, min_y, max_x, max_y */
        int     mv_miny_fpel_row[3];
        int     mv_maxy_fpel_row[3];
        /* neighboring MBs */
        unsigned int i_neighbour;
        unsigned int i_neighbour8[4];       /* neighbours of each 8x8 or 4x4 block that are available */
        unsigned int i_neighbour4[16];      /* at the time the block is coded */
        unsigned int i_neighbour_intra;     /* for constrained intra pred */
        unsigned int i_neighbour_frame;     /* ignoring slice boundaries */
        int     i_mb_type_top;
        int     i_mb_type_left[2];
        int     i_mb_type_topleft;
        int     i_mb_type_topright;
        int     i_mb_prev_xy;
        int     i_mb_left_xy[2];
        int     i_mb_top_xy;
        int     i_mb_topleft_xy;
        int     i_mb_topright_xy;
        int     i_mb_top_y;
        int     i_mb_topleft_y;
        int     i_mb_topright_y;
        const x264_left_table_t *left_index_table;
        int     i_mb_top_mbpair_xy;
        int     topleft_partition;
        int     b_allow_skip;
        int     field_decoding_flag;
        /**** thread synchronization ends here ****/
        /* subsequent variables are either thread-local or constant,
         * and won't be copied from one thread to another */
        /* mb table */
        uint8_t *base;                      /* base pointer for all malloced data in this mb */
        int8_t  *type;                      /* mb type */
        uint8_t *partition;                 /* mb partition */
        int8_t  *qp;                        /* mb qp */
        int16_t *cbp;                       /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x200 and 0x400: chroma dc, 0x1000 PCM (all set for PCM) */
        int8_t  (*intra4x4_pred_mode)[8];   /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
                                            /* actually has only 7 entries; set to 8 for write-combining optimizations */
        uint8_t (*non_zero_count)[16*3];    /* nzc. for I_PCM set to 16 */
        int8_t  *chroma_pred_mode;          /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */
        int16_t (*mv[2])[2];                /* mb mv. set to 0 for intra mb */
        uint8_t (*mvd[2])[8][2];            /* absolute value of mb mv difference with predict, clipped to [0,33]. set to 0 if intra. cabac only */
        int8_t   *ref[2];                   /* mb ref. set to -1 if non used (intra or Lx only) */
        int16_t (*mvr[2][X264_REF_MAX*2])[2];/* 16x16 mv for each possible ref */
        int8_t  *skipbp;                    /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
        int8_t  *mb_transform_size;         /* transform_size_8x8_flag of each mb */
        int32_t *slice_table;               /* sh->first_mb of the slice that the indexed mb is part of */
        uint8_t *field;
         /* buffer for weighted versions of the reference frames */
        pixel *p_weight_buf[X264_REF_MAX];
        /* current value */
        int     i_type;
        int     i_partition;
        ALIGNED_4( uint8_t i_sub_partition[4] );
        int     b_transform_8x8;
        int     i_cbp_luma;
        int     i_cbp_chroma;
        int     i_intra16x16_pred_mode;
        int     i_chroma_pred_mode;
        /* skip flags for i4x4 and i8x8
         * 0 = encode as normal.
         * 1 (non-RD only) = the DCT is still in h->dct, restore fdec and skip reconstruction.
         * 2 (RD only) = the DCT has since been overwritten by RD; restore that too. */
        int i_skip_intra;
        /* skip flag for motion compensation */
        /* if we've already done MC, we don't need to do it again */
        int b_skip_mc;
        /* set to true if we are re-encoding a macroblock. */
        int b_reencode_mb;
        int ip_offset; /* Used by PIR to offset the quantizer of intra-refresh blocks. */
        int b_deblock_rdo;
        int b_overflow; /* If CAVLC had a level code overflow during bitstream writing. */
        struct
        {
            /* space for p_fenc and p_fdec */
 #define FENC_STRIDE 16
 #define FDEC_STRIDE 32
            ALIGNED_64( pixel fenc_buf[48*FENC_STRIDE] );
            ALIGNED_64( pixel fdec_buf[54*FDEC_STRIDE] );
            /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
            ALIGNED_32( pixel i4x4_fdec_buf[16*16] );
            ALIGNED_32( pixel i8x8_fdec_buf[16*16] );
            ALIGNED_64( dctcoef i8x8_dct_buf[3][64] );
            ALIGNED_64( dctcoef i4x4_dct_buf[15][16] );
            uint32_t i4x4_nnz_buf[4];
            uint32_t i8x8_nnz_buf[4];
            /* Psy trellis DCT data */
            ALIGNED_64( dctcoef fenc_dct8[4][64] );
            ALIGNED_64( dctcoef fenc_dct4[16][16] );
            /* Psy RD SATD/SA8D scores cache */
            ALIGNED_64( uint32_t fenc_satd_cache[32] );
            ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
            int i4x4_cbp;
            int i8x8_cbp;
            /* pointer over mb of the frame to be compressed */
            pixel *p_fenc[3]; /* y,u,v */
            /* pointer to the actual source frame, not a block copy */
            pixel *p_fenc_plane[3];
            /* pointer over mb of the frame to be reconstructed  */
            pixel *p_fdec[3];
            /* pointer over mb of the references */
            int i_fref[2];
            /* [12]: yN, yH, yV, yHV, (NV12 ? uv : I444 ? (uN, uH, uV, uHV, vN, ...)) */
            pixel *p_fref[2][X264_REF_MAX*2][12];
            pixel *p_fref_w[X264_REF_MAX*2];  /* weighted fullpel luma */
            uint16_t *p_integral[2][X264_REF_MAX];
            /* fref stride */
            int     i_stride[3];
        } pic;
        /* cache */
        struct
        {
            /* real intra4x4_pred_mode if I_4X4 or I_8X8, I_PRED_4x4_DC if mb available, -1 if not */
            ALIGNED_16( int8_t intra4x4_pred_mode[X264_SCAN8_LUMA_SIZE] );
            /* i_non_zero_count if available else 0x80. intentionally misaligned by 8 for asm */
            ALIGNED_8( uint8_t non_zero_count[X264_SCAN8_SIZE] );
            /* -1 if unused, -2 if unavailable */
            ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
            /* 0 if not available */
            ALIGNED_16( int16_t mv[2][X264_SCAN8_LUMA_SIZE][2] );
            ALIGNED_8( uint8_t mvd[2][X264_SCAN8_LUMA_SIZE][2] );
            /* 1 if SKIP or DIRECT. set only for B-frames + CABAC */
            ALIGNED_4( int8_t skip[X264_SCAN8_LUMA_SIZE] );
            ALIGNED_4( int16_t direct_mv[2][4][2] );
            ALIGNED_4( int8_t  direct_ref[2][4] );
            int     direct_partition;
            ALIGNED_4( int16_t pskip_mv[2] );
            /* number of neighbors (top and left) that used 8x8 dct */
            int     i_neighbour_transform_size;
            int     i_neighbour_skip;
            /* neighbor CBPs */
            int     i_cbp_top;
            int     i_cbp_left;
            /* extra data required for mbaff in mv prediction */
            int16_t topright_mv[2][3][2];
            int8_t  topright_ref[2][3];
            /* current mb deblock strength */
            uint8_t (*deblock_strength)[8][4];
        } cache;
        /* */
        int     i_qp;       /* current qp */
        int     i_chroma_qp;
        int     i_last_qp;  /* last qp */
        int     i_last_dqp; /* last delta qp */
        int     b_variable_qp; /* whether qp is allowed to vary per macroblock */
        int     b_lossless;
        int     b_direct_auto_read; /* take stats for --direct auto from the 2pass log */
        int     b_direct_auto_write; /* analyse direct modes, to use and/or save */
        /* lambda values */
        int     i_trellis_lambda2[2][2]; /* [luma,chroma][inter,intra] */
        int     i_psy_rd_lambda;
        int     i_chroma_lambda2_offset;
        /* B_direct and weighted prediction */
        int16_t dist_scale_factor_buf[2][2][X264_REF_MAX*2][4];
        int16_t (*dist_scale_factor)[4];
        int8_t bipred_weight_buf[2][2][X264_REF_MAX*2][4];
        int8_t (*bipred_weight)[4];
        /* maps fref1[0]'s ref indices into the current list0 */
 #define map_col_to_list0(col) h->mb.map_col_to_list0[(col)+2]
        int8_t  map_col_to_list0[X264_REF_MAX+2];
        int ref_blind_dupe; /* The index of the blind reference frame duplicate. */
        int8_t deblock_ref_table[X264_REF_MAX*2+2];
 #define deblock_ref_table(x) h->mb.deblock_ref_table[(x)+2]
    } mb;
    /* rate control encoding only */
    x264_ratecontrol_t *rc;
    /* stats */
    struct
    {
        /* Cumulated stats */
        /* per slice info */
        int     i_frame_count[3];
        int64_t i_frame_size[3];
        double  f_frame_qp[3];
        int     i_consecutive_bframes[X264_BFRAME_MAX+1];
        /* */
        double  f_ssd_global[3];
        double  f_psnr_average[3];
        double  f_psnr_mean_y[3];
        double  f_psnr_mean_u[3];
        double  f_psnr_mean_v[3];
        double  f_ssim_mean_y[3];
        double  f_frame_duration[3];
        /* */
        int64_t i_mb_count[3][19];
        int64_t i_mb_partition[2][17];
        int64_t i_mb_count_8x8dct[2];
        int64_t i_mb_count_ref[2][2][X264_REF_MAX*2];
        int64_t i_mb_cbp[6];
        int64_t i_mb_pred_mode[4][13];
        int64_t i_mb_field[3];
        /* */
        int     i_direct_score[2];
        int     i_direct_frames[2];
        /* num p-frames weighted */
        int     i_wpred[2];
        /* Current frame stats */
        x264_frame_stat_t frame;
    } stat;
    /* 0 = luma 4x4, 1 = luma 8x8, 2 = chroma 4x4, 3 = chroma 8x8 */
    udctcoef (*nr_offset)[64];
    uint32_t (*nr_residual_sum)[64];
    uint32_t *nr_count;
    ALIGNED_32( udctcoef nr_offset_denoise[4][64] );
    ALIGNED_32( uint32_t nr_residual_sum_buf[2][4][64] );
    uint32_t nr_count_buf[2][4];
    uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */
    /* Buffers that are allocated per-thread even in sliced threads. */
    void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
    void *scratch_buffer2; /* if the first one's already in use */
    pixel *intra_border_backup[5][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
    /* Deblock strength values are stored for each 4x4 partition. In MBAFF
     * there are four extra values that need to be stored, located in [4][i]. */
    uint8_t (*deblock_strength[2])[2][8][4];
    /* CPU-dependent functions */
    x264_predict_t      predict_16x16[4+3];
    x264_predict8x8_t   predict_8x8[9+3];
    x264_predict_t      predict_4x4[9+3];
    x264_predict_t      predict_chroma[4+3];
    x264_predict_t      predict_8x8c[4+3];
    x264_predict_t      predict_8x16c[4+3];
    x264_predict_8x8_filter_t predict_8x8_filter;
    x264_pixel_function_t pixf;
    x264_mc_functions_t   mc;
    x264_dct_function_t   dctf;
    x264_zigzag_function_t zigzagf;
    x264_zigzag_function_t zigzagf_interlaced;
    x264_zigzag_function_t zigzagf_progressive;
    x264_quant_function_t quantf;
    x264_deblock_function_t loopf;
    x264_bitstream_function_t bsf;
    x264_lookahead_t *lookahead;
 #if HAVE_OPENCL
    x264_opencl_t opencl;
 #endif
 };
 /* Small record pairing an integer score with a two-component int16 vector.
 * NOTE(review): judging by the field names this is presumably a SAD cost and
 * the motion vector it belongs to -- confirm against the users of mvsad_t. */
 typedef struct
 {
    int sad;       /* integer score (presumably a sum of absolute differences) */
    int16_t mv[2]; /* two-component vector associated with the score */
 } mvsad_t;
 // included at the end because it needs x264_t
 #include "macroblock.h"
 /* Filter and clamp a list of candidate vectors.
 *
 * Every candidate in mvc[0..i_mvc) is divided by 4 with rounding
 * ((v + 2) >> 2).  Candidates whose packed rounded value is zero or equal to
 * pmv are dropped; the remaining ones are clamped component-wise to
 * [mv_limit[0], mv_limit[1]] and appended to dst.
 *
 * Returns the number of vectors written to dst. */
 static ALWAYS_INLINE int x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
 {
    int n_out = 0;
    for( int idx = 0; idx < i_mvc; idx++ )
    {
        int rx = (mvc[idx][0] + 2) >> 2;
        int ry = (mvc[idx][1] + 2) >> 2;
        uint32_t packed = pack16to32_mask( rx, ry );
        /* Only keep candidates that are non-zero and differ from pmv. */
        if( packed && packed != pmv )
        {
            dst[n_out][0] = x264_clip3( rx, mv_limit[0][0], mv_limit[1][0] );
            dst[n_out][1] = x264_clip3( ry, mv_limit[0][1], mv_limit[1][1] );
            n_out++;
        }
    }
    return n_out;
 }
 /* Filter and clamp a list of candidate vectors without rounding them.
 *
 * Candidates in mvc[0..i_mvc) whose raw 32-bit value is zero or equal to pmv
 * are dropped; the remaining ones are clamped component-wise to the limits in
 * mv_limit scaled by 4 and appended to dst.
 *
 * Returns the number of vectors written to dst. */
 static ALWAYS_INLINE int x264_predictor_clip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
 {
    int cnt = 0;
    /* Scale with a multiplication rather than "<< 2": the limits are signed
     * and may be negative, and left-shifting a negative value is undefined
     * behaviour in C.  The int16_t operands are promoted to int, so the
     * product cannot overflow. */
    int qpel_limit[4] = {mv_limit[0][0] * 4, mv_limit[0][1] * 4, mv_limit[1][0] * 4, mv_limit[1][1] * 4};
    for( int i = 0; i < i_mvc; i++ )
    {
        uint32_t mv = M32( mvc[i] );
        int mx = mvc[i][0];
        int my = mvc[i][1];
        if( !mv || mv == pmv ) continue;
        dst[cnt][0] = x264_clip3( mx, qpel_limit[0], qpel_limit[2] );
        dst[cnt][1] = x264_clip3( my, qpel_limit[1], qpel_limit[3] );
        cnt++;
    }
    return cnt;
 }
 #if ARCH_X86 || ARCH_X86_64
 #include "x86/util.h"
 #endif
 #include "rectangle.h"
 #endif
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -0,0 +1,679 @@
 /*****************************************************************************
 * cpu.c: cpu detection
 *****************************************************************************
 * Copyright (C) 2003-2025 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "base.h"
 #if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
 #include <sys/auxv.h>
 #endif
 #if HAVE_SYSCONF
 #include <unistd.h>
 #endif
 #if SYS_LINUX
 #include <sched.h>
 #endif
 #if SYS_BEOS
 #include <kernel/OS.h>
 #endif
 #if SYS_MACOSX || SYS_FREEBSD || SYS_NETBSD || SYS_OPENBSD
 #include <sys/types.h>
 #include <sys/sysctl.h>
 #endif
 #if SYS_OPENBSD
 #include <machine/cpu.h>
 #endif
 /* Table mapping CPU feature names to X264_CPU_* flag masks.
 *
 * Only the entries for the architecture selected by the ARCH_* build macros
 * are compiled in.  The table is always terminated by an entry with an empty
 * name and a zero mask.
 *
 * For x86 the helper macros MMX2, SSE2, AVX and AVX2 accumulate the flags of
 * the preceding feature levels, so each named level's mask also contains the
 * flags of the levels it builds upon.  The helpers are #undef'd again once the
 * x86 entries that need them have been emitted. */
 const x264_cpu_name_t x264_cpu_names[] =
 {
 #if ARCH_X86 || ARCH_X86_64
 //  {"MMX",         X264_CPU_MMX},  // we don't support asm on mmx1 cpus anymore
 #define MMX2 X264_CPU_MMX|X264_CPU_MMX2
    {"MMX2",        MMX2},
    {"MMXEXT",      MMX2},
    {"SSE",         MMX2|X264_CPU_SSE},
 #define SSE2 MMX2|X264_CPU_SSE|X264_CPU_SSE2
    {"SSE2Slow",    SSE2|X264_CPU_SSE2_IS_SLOW},
    {"SSE2",        SSE2},
    {"SSE2Fast",    SSE2|X264_CPU_SSE2_IS_FAST},
    {"LZCNT",       SSE2|X264_CPU_LZCNT},
    {"SSE3",        SSE2|X264_CPU_SSE3},
    {"SSSE3",       SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
    /* "SSE4.1" and "SSE4" are two names for the same mask. */
    {"SSE4.1",      SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
    {"SSE4",        SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
    {"SSE4.2",      SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
 #define AVX SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX
    {"AVX",         AVX},
    {"XOP",         AVX|X264_CPU_XOP},
    {"FMA4",        AVX|X264_CPU_FMA4},
    {"FMA3",        AVX|X264_CPU_FMA3},
    {"BMI1",        AVX|X264_CPU_LZCNT|X264_CPU_BMI1},
    {"BMI2",        AVX|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2},
 #define AVX2 AVX|X264_CPU_FMA3|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2|X264_CPU_AVX2
    {"AVX2",        AVX2},
    {"AVX512",      AVX2|X264_CPU_AVX512},
 #undef AVX2
 #undef AVX
 #undef SSE2
 #undef MMX2
    /* The remaining x86 entries are single-flag masks. */
    {"Cache32",         X264_CPU_CACHELINE_32},
    {"Cache64",         X264_CPU_CACHELINE_64},
    {"SlowAtom",        X264_CPU_SLOW_ATOM},
    {"SlowPshufb",      X264_CPU_SLOW_PSHUFB},
    {"SlowPalignr",     X264_CPU_SLOW_PALIGNR},
    {"SlowShuffle",     X264_CPU_SLOW_SHUFFLE},
    {"UnalignedStack",  X264_CPU_STACK_MOD4},
 #elif ARCH_PPC
    {"Altivec",         X264_CPU_ALTIVEC},
 #elif ARCH_ARM
    {"ARMv6",           X264_CPU_ARMV6},
    {"NEON",            X264_CPU_NEON},
    {"FastNeonMRC",     X264_CPU_FAST_NEON_MRC},
 #elif ARCH_AARCH64
    {"ARMv8",           X264_CPU_ARMV8},
    {"NEON",            X264_CPU_NEON},
    {"DotProd",         X264_CPU_DOTPROD},
    {"I8MM",            X264_CPU_I8MM},
    {"SVE",             X264_CPU_SVE},
    {"SVE2",            X264_CPU_SVE2},
 #elif ARCH_RISCV64
    {"RVV",             X264_CPU_RVV},
 #elif ARCH_MIPS
    {"MSA",             X264_CPU_MSA},
 #elif ARCH_LOONGARCH
    {"LSX",             X264_CPU_LSX},
    {"LASX",            X264_CPU_LASX},
 #endif
    /* Terminator: empty name, no flags. */
    {"", 0},
 };
 /* Look up one entry of the auxiliary vector.
 *
 * Uses getauxval() when HAVE_GETAUXVAL is set, elf_aux_info() when
 * HAVE_ELF_AUX_INFO is set, and otherwise reports 0 for every type. */
 static unsigned long x264_getauxval( unsigned long type )
 {
    unsigned long value = 0;
 #if HAVE_GETAUXVAL
    value = getauxval( type );
 #elif HAVE_ELF_AUX_INFO
    /* value keeps its initial 0 unless elf_aux_info() stores a result. */
    elf_aux_info( type, &value, sizeof(value) );
 #endif
    return value;
 }
 /* SIGILL-based probing support.  Only compiled for AltiVec on Linux or for
 * ARMv6 builds without HAVE_NEON, and only when neither getauxval() nor
 * elf_aux_info() is available. */
 #if ((HAVE_ALTIVEC && SYS_LINUX) || (HAVE_ARMV6 && !HAVE_NEON)) && !(HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO)
 #include <signal.h>
 #include <setjmp.h>
 /* Jump target filled in by sigsetjmp() in the probing x264_cpu_detect(). */
 static sigjmp_buf jmpbuf;
 /* Set to 1 by the probing code while a test instruction is being executed. */
 static volatile sig_atomic_t canjump = 0;
 /* Signal handler installed around the probe instruction.
 *
 * If canjump is clear, the default disposition for sig is restored and the
 * signal is raised again.  The handler then clears canjump and siglongjmp()s
 * to jmpbuf so that sigsetjmp() returns 1 in the prober. */
 static void sigill_handler( int sig )
 {
    if( !canjump )
    {
        signal( sig, SIG_DFL );
        raise( sig );
    }
    canjump = 0;
    siglongjmp( jmpbuf, 1 );
 }
 #endif
 #if HAVE_MMX
 /* Helpers declared here and defined elsewhere (not in this file). */
 int x264_cpu_cpuid_test( void );
 void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
 uint64_t x264_cpu_xgetbv( int xcr );
 /* Detect x86 CPU capabilities via CPUID/XGETBV.
 *
 * Returns a mask of X264_CPU_* flags.  Returns 0 when CPUID is unusable
 * (32-bit builds only) or reports no basic leaves, and returns early with no
 * flags when MMX is not reported.  Besides instruction-set flags the result
 * may carry vendor/model specific tuning flags (SSE2_IS_SLOW/FAST, SLOW_*),
 * a cacheline size flag on Intel/Cyrix CPUs without SSE4.2, and
 * X264_CPU_STACK_MOD4 when STACK_ALIGNMENT < 16. */
 uint32_t x264_cpu_detect( void )
 {
    uint32_t cpu = 0;
    uint32_t eax, ebx, ecx, edx;
    /* Leaf 0 returns the vendor string in ebx, edx, ecx order; vendor[3]
     * stays 0 and acts as the string terminator. */
    uint32_t vendor[4] = {0};
    uint32_t max_extended_cap, max_basic_cap;
 #if !ARCH_X86_64
    if( !x264_cpu_cpuid_test() )
        return 0;
 #endif
    x264_cpu_cpuid( 0, &max_basic_cap, vendor+0, vendor+2, vendor+1 );
    if( max_basic_cap == 0 )
        return 0;
    x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
    if( edx&0x00800000 )
        cpu |= X264_CPU_MMX;
    else
        return cpu;
    if( edx&0x02000000 )
        cpu |= X264_CPU_MMX2|X264_CPU_SSE;
    if( edx&0x04000000 )
        cpu |= X264_CPU_SSE2;
    if( ecx&0x00000001 )
        cpu |= X264_CPU_SSE3;
    if( ecx&0x00000200 )
        cpu |= X264_CPU_SSSE3|X264_CPU_SSE2_IS_FAST;
    if( ecx&0x00080000 )
        cpu |= X264_CPU_SSE4;
    if( ecx&0x00100000 )
        cpu |= X264_CPU_SSE42;
    if( ecx&0x08000000 ) /* XGETBV supported and XSAVE enabled by OS */
    {
        uint64_t xcr0 = x264_cpu_xgetbv( 0 );
        if( (xcr0&0x6) == 0x6 ) /* XMM/YMM state */
        {
            if( ecx&0x10000000 )
                cpu |= X264_CPU_AVX;
            if( ecx&0x00001000 )
                cpu |= X264_CPU_FMA3;
            if( max_basic_cap >= 7 )
            {
                x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx );
                if( ebx&0x00000008 )
                    cpu |= X264_CPU_BMI1;
                if( ebx&0x00000100 )
                    cpu |= X264_CPU_BMI2;
                if( ebx&0x00000020 )
                    cpu |= X264_CPU_AVX2;
                if( (xcr0&0xE0) == 0xE0 ) /* OPMASK/ZMM state */
                {
                    /* All of the tested leaf-7 ebx bits must be set. */
                    if( (ebx&0xD0030000) == 0xD0030000 )
                        cpu |= X264_CPU_AVX512;
                }
            }
        }
    }
    x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
    max_extended_cap = eax;
    if( max_extended_cap >= 0x80000001 )
    {
        x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
        if( ecx&0x00000020 )
            cpu |= X264_CPU_LZCNT;             /* Supported by Intel chips starting with Haswell */
        if( ecx&0x00000040 ) /* SSE4a, AMD only */
        {
            int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
            cpu |= X264_CPU_SSE2_IS_FAST;      /* Phenom and later CPUs have fast SSE units */
            if( family == 0x14 )
            {
                cpu &= ~X264_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
                cpu |= X264_CPU_SSE2_IS_SLOW;  /* Bobcat has 64-bit SIMD units */
                cpu |= X264_CPU_SLOW_PALIGNR;  /* palignr is insanely slow on Bobcat */
            }
            if( family == 0x16 )
            {
                cpu |= X264_CPU_SLOW_PSHUFB;   /* Jaguar's pshufb isn't that slow, but it's slow enough
                                                * compared to alternate instruction sequences that this
                                                * is equal or faster on almost all such functions. */
            }
        }
        if( cpu & X264_CPU_AVX )
        {
            if( ecx&0x00000800 ) /* XOP */
                cpu |= X264_CPU_XOP;
            if( ecx&0x00010000 ) /* FMA4 */
                cpu |= X264_CPU_FMA4;
        }
        if( !strcmp((char*)vendor, "AuthenticAMD") )
        {
            if( edx&0x00400000 )
                cpu |= X264_CPU_MMX2;
            if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) )
                cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
        }
    }
    if( !strcmp((char*)vendor, "GenuineIntel") )
    {
        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
        int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
        int model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
        if( family == 6 )
        {
            /* Detect Atom CPU */
            if( model == 28 )
            {
                cpu |= X264_CPU_SLOW_ATOM;
                cpu |= X264_CPU_SLOW_PSHUFB;
            }
            /* Conroe has a slow shuffle unit. Check the model number to make sure not
             * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
            else if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE4) && model < 23 )
                cpu |= X264_CPU_SLOW_SHUFFLE;
        }
    }
    if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
    {
        /* cacheline size is specified in 3 places, any of which may be missing */
        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
        int cache = (ebx&0xff00)>>5; // cflush size
        if( !cache && max_extended_cap >= 0x80000006 )
        {
            x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
            cache = ecx&0xff; // cacheline size
        }
        if( !cache && max_basic_cap >= 2 )
        {
            // Cache and TLB Information
            static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
            /* 0x7d was missing from this list while 0x7c appeared twice. */
            static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67,
                                                0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7f, 0x86, 0x87, 0 };
            uint32_t buf[4];
            int max, i = 0;
            do {
                x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 );
                /* The low byte of eax is the iteration count, not a descriptor. */
                max = buf[0]&0xff;
                buf[0] &= ~0xff;
                for( int j = 0; j < 4; j++ )
                    if( !(buf[j]>>31) ) /* bit 31 set: register holds no valid descriptors */
                        while( buf[j] )
                        {
                            int id = buf[j]&0xff;
                            /* Skip null descriptors: strchr() also matches the
                             * terminating 0 of the id lists, which would make
                             * every zero byte look like a 64-byte cacheline. */
                            if( id )
                            {
                                if( strchr( cache32_ids, id ) )
                                    cache = 32;
                                if( strchr( cache64_ids, id ) )
                                    cache = 64;
                            }
                            buf[j] >>= 8;
                        }
            } while( ++i < max );
        }
        if( cache == 32 )
            cpu |= X264_CPU_CACHELINE_32;
        else if( cache == 64 )
            cpu |= X264_CPU_CACHELINE_64;
        else
            x264_log_internal( X264_LOG_WARNING, "unable to determine cacheline size\n" );
    }
 #if STACK_ALIGNMENT < 16
    cpu |= X264_CPU_STACK_MOD4;
 #endif
    return cpu;
 }
 #elif HAVE_ALTIVEC
 #if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
 #define HWCAP_PPC_ALTIVEC   (1U << 28)
 /* AltiVec detection through the auxiliary vector: reports X264_CPU_ALTIVEC
 * when the HWCAP_PPC_ALTIVEC bit is set in AT_HWCAP, otherwise no flags. */
 uint32_t x264_cpu_detect( void )
 {
    const unsigned long caps = x264_getauxval( AT_HWCAP );
    uint32_t cpu = 0;
    if( caps & HWCAP_PPC_ALTIVEC )
        cpu = X264_CPU_ALTIVEC;
    return cpu;
 }
 #elif SYS_MACOSX || SYS_FREEBSD || SYS_NETBSD || SYS_OPENBSD
 /* AltiVec detection through sysctl: queries a platform-specific sysctl and
 * reports X264_CPU_ALTIVEC when the query succeeds with a non-zero value. */
 uint32_t x264_cpu_detect( void )
 {
    /* Thank you VLC */
 #if SYS_OPENBSD
    int      mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
 #elif SYS_MACOSX
    int      mib[2] = { CTL_HW, HW_VECTORUNIT };
 #endif
    int      altivec = 0;
    size_t   altivec_size = sizeof( altivec );
 #if SYS_MACOSX || SYS_OPENBSD
    int      rc = sysctl( mib, 2, &altivec, &altivec_size, NULL, 0 );
 #elif SYS_NETBSD
    int      rc = sysctlbyname( "machdep.altivec", &altivec, &altivec_size, NULL, 0 );
 #else
    int      rc = sysctlbyname( "hw.altivec", &altivec, &altivec_size, NULL, 0 );
 #endif
    /* A failed query or a zero answer both mean "no AltiVec". */
    if( rc != 0 || altivec == 0 )
        return 0;
    return X264_CPU_ALTIVEC;
 }
 #elif SYS_LINUX
 /* AltiVec detection on Linux without an auxiliary-vector API: executes a
 * vector instruction with a SIGILL handler installed and checks whether it
 * traps.  Returns X264_CPU_ALTIVEC when the probe completes and 0 when the
 * handler jumps back, or unconditionally 0 when __NO_FPRS__ is defined. */
 uint32_t x264_cpu_detect( void )
 {
 #ifdef __NO_FPRS__
    return 0;
 #else
    static void (*oldsig)( int );
    /* Install the probe handler and remember the previous one. */
    oldsig = signal( SIGILL, sigill_handler );
    /* A non-zero return means sigill_handler jumped back here. */
    if( sigsetjmp( jmpbuf, 1 ) )
    {
        signal( SIGILL, oldsig );
        return 0;
    }
    canjump = 1;
    /* Probe: write all-ones to SPR 256 (presumably VRSAVE -- confirm) and
     * execute a vand on vector register 0. */
    asm volatile( "mtspr 256, %0\n\t"
                  "vand 0, 0, 0\n\t"
                  :
                  : "r"(-1) );
    canjump = 0;
    /* Probe survived: restore the previous handler and report AltiVec. */
    signal( SIGILL, oldsig );
    return X264_CPU_ALTIVEC;
 #endif
 }
 #else
 /* No AltiVec detection method is available for this platform: report no
 * CPU flags. */
 uint32_t x264_cpu_detect( void )
 {
    const uint32_t no_flags = 0;
    return no_flags;
 }
 #endif
 #elif HAVE_ARMV6
 /* Probe helpers declared here and defined elsewhere (not in this file). */
 void x264_cpu_neon_test( void );
 int x264_cpu_fast_neon_mrc_test( void );
 #define HWCAP_ARM_NEON   (1U << 12)
 /* ARM (ARMv6 and later) capability detection.
 *
 * Always reports X264_CPU_ARMV6.  NEON is taken from AT_HWCAP when an
 * auxiliary-vector API is available; otherwise it is assumed when built with
 * HAVE_NEON, or probed by running x264_cpu_neon_test() under a SIGILL handler.
 * X264_CPU_FAST_NEON_MRC is added when x264_cpu_fast_neon_mrc_test() returns
 * non-zero (not tested when __MACH__ or _WIN32 is defined). */
 uint32_t x264_cpu_detect( void )
 {
    uint32_t flags = 0;
    flags |= X264_CPU_ARMV6;
 #if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
    unsigned long hwcap = x264_getauxval( AT_HWCAP );
    if ( hwcap & HWCAP_ARM_NEON )
        flags |= X264_CPU_NEON;
 #else
    // don't do this hack if compiled with -mfpu=neon
 #if !HAVE_NEON
    static void (* oldsig)( int );
    oldsig = signal( SIGILL, sigill_handler );
    /* A non-zero return means sigill_handler jumped back here: the NEON probe
     * trapped, so return without the NEON flag. */
    if( sigsetjmp( jmpbuf, 1 ) )
    {
        signal( SIGILL, oldsig );
        return flags;
    }
    canjump = 1;
    x264_cpu_neon_test();
    canjump = 0;
    signal( SIGILL, oldsig );
 #endif
    /* Reached when built with HAVE_NEON or when the probe did not trap. */
    flags |= X264_CPU_NEON;
 #endif
    // fast neon -> arm (Cortex-A9) detection relies on user access to the
    // cycle counter; this assumes ARMv7 performance counters.
    // NEON requires at least ARMv7, ARMv8 may require changes here, but
    // hopefully this hacky detection method will have been replaced by then.
    // Note that there is potential for a race condition if another program or
    // x264 instance disables or reinits the counters while x264 is using them,
    // which may result in incorrect detection and the counters stuck enabled.
    // right now Apple does not seem to support performance counters for this test
    // Don't test this on Windows; performance counters are readable, but
    // the PMNC is not readable.
 #if !defined(__MACH__) && !defined(_WIN32)
    flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0;
 #endif
    // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
    return flags;
 }
 #elif HAVE_RISCV64
 #define HWCAP_RISCV64_RVV     (1 << ('V' - 'A'))
 /* RISC-V capability detection.
 *
 * With an auxiliary-vector API, X264_CPU_RVV is reported when the 'V' bit is
 * set in AT_HWCAP.  Without one, the build-time HAVE_RVV setting decides. */
 uint32_t x264_cpu_detect( void )
 {
 #if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
    const unsigned long caps = x264_getauxval( AT_HWCAP );
    return ( caps & HWCAP_RISCV64_RVV ) ? X264_CPU_RVV : 0;
 #elif HAVE_RVV
    return X264_CPU_RVV;
 #else
    return 0;
 #endif
 }
 #elif HAVE_AARCH64
 #if defined(__linux__) || HAVE_ELF_AUX_INFO
 #define HWCAP_AARCH64_ASIMDDP (1U << 20)
 #define HWCAP_AARCH64_SVE     (1U << 22)
 #define HWCAP2_AARCH64_SVE2   (1U << 1)
 #define HWCAP2_AARCH64_I8MM   (1U << 13)
 /* Runtime AArch64 feature detection through the auxiliary vector: maps bits
 * of AT_HWCAP / AT_HWCAP2 to the corresponding X264_CPU_* flags. */
 static uint32_t detect_flags( void )
 {
    const unsigned long caps  = x264_getauxval( AT_HWCAP );
    const unsigned long caps2 = x264_getauxval( AT_HWCAP2 );
    uint32_t found = 0;
    found |= ( caps  & HWCAP_AARCH64_ASIMDDP ) ? X264_CPU_DOTPROD : 0;
    found |= ( caps2 & HWCAP2_AARCH64_I8MM )   ? X264_CPU_I8MM    : 0;
    found |= ( caps  & HWCAP_AARCH64_SVE )     ? X264_CPU_SVE     : 0;
    found |= ( caps2 & HWCAP2_AARCH64_SVE2 )   ? X264_CPU_SVE2    : 0;
    return found;
 }
 #elif defined(__APPLE__)
 #include <sys/sysctl.h>
 /* Query an integer sysctl by name.  Returns the value stored by
 * sysctlbyname(), or 0 when the call reports failure. */
 static int have_feature( const char *feature )
 {
    int value = 0;
    size_t value_size = sizeof(value);
    const int failed = sysctlbyname( feature, &value, &value_size, NULL, 0 );
    return failed ? 0 : value;
 }
 /* Runtime AArch64 feature detection on Apple platforms via the
 * hw.optional.arm.* sysctls. */
 static uint32_t detect_flags( void )
 {
    uint32_t found = 0;
    found |= have_feature( "hw.optional.arm.FEAT_DotProd" ) ? X264_CPU_DOTPROD : 0;
    found |= have_feature( "hw.optional.arm.FEAT_I8MM" )    ? X264_CPU_I8MM    : 0;
    /* No SVE and SVE2 feature detection available on Apple platforms. */
    return found;
 }
 #elif defined(_WIN32)
 #include <windows.h>
 /* Runtime AArch64 feature detection on Windows.  Each check is compiled in
 * only when the corresponding PF_ARM_* constant is defined. */
 static uint32_t detect_flags( void )
 {
    uint32_t found = 0;
 #ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
    found |= IsProcessorFeaturePresent( PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE ) ? X264_CPU_DOTPROD : 0;
 #endif
 #ifdef PF_ARM_SVE_INSTRUCTIONS_AVAILABLE
    found |= IsProcessorFeaturePresent( PF_ARM_SVE_INSTRUCTIONS_AVAILABLE ) ? X264_CPU_SVE : 0;
 #endif
 #ifdef PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE
    found |= IsProcessorFeaturePresent( PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE ) ? X264_CPU_SVE2 : 0;
 #endif
 #ifdef PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE
    /* There's no PF_* flag that indicates whether plain I8MM is available
     * or not. But if SVE_I8MM is available, that also implies that
     * regular I8MM is available. */
    found |= IsProcessorFeaturePresent( PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE ) ? X264_CPU_I8MM : 0;
 #endif
    return found;
 }
 #endif
 /* AArch64 capability detection.
 *
 * Always reports X264_CPU_ARMV8, plus X264_CPU_NEON when built with
 * HAVE_NEON.  Optional extensions are reported when the compiler already
 * targets them (__ARM_FEATURE_* defined) or when detect_flags() finds them
 * at runtime on a platform that provides it. */
 uint32_t x264_cpu_detect( void )
 {
    uint32_t cpu = X264_CPU_ARMV8;
 #if HAVE_NEON
    cpu |= X264_CPU_NEON;
 #endif
    // Extensions the compiler is allowed to use unconditionally must be
    // present on any CPU that runs this binary.
 #ifdef __ARM_FEATURE_DOTPROD
    cpu |= X264_CPU_DOTPROD;
 #endif
 #ifdef __ARM_FEATURE_MATMUL_INT8
    cpu |= X264_CPU_I8MM;
 #endif
 #ifdef __ARM_FEATURE_SVE
    cpu |= X264_CPU_SVE;
 #endif
 #ifdef __ARM_FEATURE_SVE2
    cpu |= X264_CPU_SVE2;
 #endif
    // Merge in whatever runtime detection reports, where it exists.
 #if defined(__linux__) || HAVE_ELF_AUX_INFO || \
    defined(__APPLE__) || defined(_WIN32)
    return cpu | detect_flags();
 #else
    return cpu;
 #endif
 }
 #elif HAVE_MSA
 /* MIPS builds with HAVE_MSA: MSA is reported unconditionally, no runtime
 * check is performed. */
 uint32_t x264_cpu_detect( void )
 {
    const uint32_t cpu = X264_CPU_MSA;
    return cpu;
 }
 #elif HAVE_LSX
 #define LA_HWCAP_LSX    ( 1U << 4 )
 #define LA_HWCAP_LASX   ( 1U << 5 )
 /* LoongArch capability detection: maps the LSX and LASX bits of AT_HWCAP to
 * X264_CPU_LSX and X264_CPU_LASX. */
 uint32_t x264_cpu_detect( void )
 {
    const uint32_t caps = (uint32_t)x264_getauxval( AT_HWCAP );
    uint32_t cpu = 0;
    cpu |= ( caps & LA_HWCAP_LSX )  ? X264_CPU_LSX  : 0;
    cpu |= ( caps & LA_HWCAP_LASX ) ? X264_CPU_LASX : 0;
    return cpu;
 }
 #else
 /* Generic fallback used when no architecture-specific detection is compiled
 * in: report no CPU flags. */
 uint32_t x264_cpu_detect( void )
 {
    const uint32_t no_flags = 0;
    return no_flags;
 }
 #endif
 /* Return the number of processors available to this process.
 *
 * The method depends on the build configuration: 1 without thread support,
 * the pthread wrapper on Windows, the affinity mask on Linux,
 * get_system_info() on BeOS, the "hw.logicalcpu" sysctl on macOS and
 * sysconf() elsewhere.  Whenever a query fails, 1 is returned so that callers
 * never see a zero or negative count from an error path. */
 int x264_cpu_num_processors( void )
 {
 #if !HAVE_THREAD
    return 1;
 #elif SYS_WINDOWS
    return x264_pthread_num_processors_np();
 #elif SYS_LINUX
    cpu_set_t p_aff;
    memset( &p_aff, 0, sizeof(p_aff) );
    if( sched_getaffinity( 0, sizeof(p_aff), &p_aff ) )
        return 1;
 #if HAVE_CPU_COUNT
    return CPU_COUNT(&p_aff);
 #else
    /* No CPU_COUNT(): count the set bits of the affinity mask by hand. */
    int np = 0;
    for( size_t bit = 0; bit < 8 * sizeof(p_aff); bit++ )
        np += (((uint8_t *)&p_aff)[bit / 8] >> (bit % 8)) & 1;
    return np;
 #endif
 #elif SYS_BEOS
    system_info info;
    get_system_info( &info );
    return info.cpu_count;
 #elif SYS_MACOSX
    int ncpu;
    size_t length = sizeof( ncpu );
    if( sysctlbyname("hw.logicalcpu", &ncpu, &length, NULL, 0) )
    {
        ncpu = 1;
    }
    return ncpu;
 #elif defined(_SC_NPROCESSORS_ONLN)
    /* sysconf() returns -1 on error; fall back to a single processor like
     * the other branches do. */
    long np = sysconf( _SC_NPROCESSORS_ONLN );
    return np > 0 ? (int)np : 1;
 #elif defined(_SC_NPROCESSORS_CONF)
    long np = sysconf( _SC_NPROCESSORS_CONF );
    return np > 0 ? (int)np : 1;
 #else
    return 1;
 #endif
 }
--- a/common/cpu.h
+++ b/common/cpu.h
@@ -0,0 +1,56 @@
 /*****************************************************************************
 * cpu.h: cpu detection
 *****************************************************************************
 * Copyright (C) 2004-2025 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_CPU_H
 #define X264_CPU_H
 /* Returns a mask of X264_CPU_* flags for the current CPU. */
 X264_API uint32_t x264_cpu_detect( void );
 /* Returns the processor count used for threading decisions. */
 X264_API int      x264_cpu_num_processors( void );
 /* Out-of-line helpers defined elsewhere; wrapped by the macros below. */
 void     x264_cpu_emms( void );
 void     x264_cpu_sfence( void );
 /* x264_emms(): expands to an inline "emms" instruction when MMX and x86
 * inline asm are available, to a call to x264_cpu_emms() when only MMX is
 * available, and to nothing otherwise. */
 #if HAVE_MMX
 /* There is no way to forbid the compiler from using float instructions
 * before the emms so miscompilation could theoretically occur in the
 * unlikely event that the compiler reorders emms and float instructions. */
 #if HAVE_X86_INLINE_ASM
 /* Clobbering memory makes the compiler less likely to reorder code. */
 #define x264_emms() asm volatile( "emms":::"memory","st","st(1)","st(2)", \
                                  "st(3)","st(4)","st(5)","st(6)","st(7)" )
 #else
 #define x264_emms() x264_cpu_emms()
 #endif
 #else
 #define x264_emms()
 #endif
 /* x264_sfence is an alias for x264_cpu_sfence. */
 #define x264_sfence x264_cpu_sfence
 /* One entry of the CPU name table: a feature name and its flag mask. */
 typedef struct
 {
    const char *name;
    uint32_t flags;
 } x264_cpu_name_t;
 /* Table of known CPU feature names; its definition ends with an entry whose
 * name is empty and whose flags are 0. */
 X264_API extern const x264_cpu_name_t x264_cpu_names[];
 #endif
--- a/common/dct.c
+++ b/common/dct.c
--- a/common/dct.h
+++ b/common/dct.h
@@ -0,0 +1,77 @@
 /*****************************************************************************
 * dct.h: transform and zigzag
 *****************************************************************************
 * Copyright (C) 2004-2025 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_DCT_H
 #define X264_DCT_H
 /* Table of transform function pointers, filled in by x264_dct_init().
 * The array sizes in the prototypes document the expected number of
 * coefficients per call.
 * NOTE(review): judging by the names, sub* entries compute the transform of
 * the difference pix1 - pix2 and add* entries add an inverse transform to
 * p_dst -- confirm against the implementations. */
 typedef struct
 {
    // pix1  stride = FENC_STRIDE
    // pix2  stride = FDEC_STRIDE
    // p_dst stride = FDEC_STRIDE
    /* single 4x4 block */
    void (*sub4x4_dct) ( dctcoef dct[16], pixel *pix1, pixel *pix2 );
    void (*add4x4_idct)( pixel *p_dst, dctcoef dct[16] );
    /* 8x8 area as four 4x4 blocks, plus DC-only variants */
    void (*sub8x8_dct)    ( dctcoef dct[4][16], pixel *pix1, pixel *pix2 );
    void (*sub8x8_dct_dc) ( dctcoef dct[4], pixel *pix1, pixel *pix2 );
    void (*add8x8_idct)   ( pixel *p_dst, dctcoef dct[4][16] );
    void (*add8x8_idct_dc)( pixel *p_dst, dctcoef dct[4] );
    void (*sub8x16_dct_dc)( dctcoef dct[8], pixel *pix1, pixel *pix2 );
    /* 16x16 area as sixteen 4x4 blocks, plus DC-only inverse */
    void (*sub16x16_dct)    ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 );
    void (*add16x16_idct)   ( pixel *p_dst, dctcoef dct[16][16] );
    void (*add16x16_idct_dc)( pixel *p_dst, dctcoef dct[16] );
    /* 64-coefficient (8x8) transform, single block and four blocks */
    void (*sub8x8_dct8) ( dctcoef dct[64], pixel *pix1, pixel *pix2 );
    void (*add8x8_idct8)( pixel *p_dst, dctcoef dct[64] );
    void (*sub16x16_dct8) ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
    void (*add16x16_idct8)( pixel *p_dst, dctcoef dct[4][64] );
    /* in-place transforms of a 16-entry DC block */
    void (*dct4x4dc) ( dctcoef d[16] );
    void (*idct4x4dc)( dctcoef d[16] );
    /* presumably gathers the DC terms of eight 4x4 blocks into dct[8] -- confirm */
    void (*dct2x4dc)( dctcoef dct[8], dctcoef dct4x4[8][16] );
 } x264_dct_function_t;
 /* Table of coefficient scan function pointers, filled in by
 * x264_zigzag_init() (one table for progressive, one for interlaced).
 * NOTE(review): the exact scan order and the meaning of the int return value
 * of the sub_* entries are not visible here -- confirm against the
 * implementations before relying on them. */
 typedef struct
 {
    /* reorder an already transformed block from dct[] into level[] */
    void (*scan_8x8)( dctcoef level[64], dctcoef dct[64] );
    void (*scan_4x4)( dctcoef level[16], dctcoef dct[16] );
    /* pixel-domain variants taking a source and a destination block */
    int  (*sub_8x8)  ( dctcoef level[64], const pixel *p_src, pixel *p_dst );
    int  (*sub_4x4)  ( dctcoef level[16], const pixel *p_src, pixel *p_dst );
    /* as sub_4x4, with an additional output written through dc */
    int  (*sub_4x4ac)( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
    void (*interleave_8x8_cavlc)( dctcoef *dst, dctcoef *src, uint8_t *nnz );
 } x264_zigzag_function_t;
 /* The public names are routed through x264_template(), which is defined
 * elsewhere (presumably to produce per-configuration symbol names -- confirm). */
 #define x264_dct_init x264_template(dct_init)
 /* Fill *dctf with the transform functions selected for the given cpu flags. */
 void x264_dct_init( uint32_t cpu, x264_dct_function_t *dctf );
 #define x264_zigzag_init x264_template(zigzag_init)
 /* Fill the progressive and interlaced scan tables for the given cpu flags. */
 void x264_zigzag_init( uint32_t cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced );
 #endif
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -0,0 +1,851 @@
 /*****************************************************************************
 * deblock.c: deblocking
 *****************************************************************************
 * Copyright (C) 2003-2025 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
 *          Henrik Gramner <henrik@gramner.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "common.h"
 /* Deblocking filter */
 /* Alpha (edge) threshold lookup, accessed through alpha_table(x), which adds 24
 * to the index. Layout: 24 leading zero entries for negative indices, 52 main
 * entries, then 12 trailing entries repeating the final value (255), giving
 * 52+12*3 = 88 entries in total.
 * NOTE(review): the 52 main entries look like the H.264 alpha' table -- confirm. */
 static const uint8_t i_alpha_table[52+12*3] =
 {
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
    80, 90,101,113,127,144,162,182,203,226,
   255,255,
   255,255,255,255,255,255,255,255,255,255,255,255,
 };
 /* Beta (gradient) threshold lookup, accessed through beta_table(x), which adds
 * 24 to the index. Same layout as i_alpha_table: 24 leading zeros, 52 main
 * entries, 12 trailing entries repeating the final value (18).
 * NOTE(review): the 52 main entries look like the H.264 beta' table -- confirm. */
 static const uint8_t i_beta_table[52+12*3] =
 {
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
    18, 18,
    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
 };
 /* Clipping-value lookup, accessed through tc0_table(x), which adds 24 to the
 * row index. Each row is then indexed by a boundary-strength byte (deblock_edge()
 * uses bS[0..3] as the column). Column 0 is always -1; deblock_luma_c() skips
 * negative values and deblock_chroma_c() skips values <= 0.
 * Row layout matches the alpha/beta tables: 24 leading rows, 52 main rows,
 * 12 trailing rows repeating the final row. */
 static const int8_t i_tc0_table[52+12*3][4] =
 {
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
 };
 /* Biased accessors: add 24 to the index so that indices down to -24 land
 * inside the padded tables (valid x range for 88 entries is -24..63). */
 #define alpha_table(x) i_alpha_table[(x)+24]
 #define beta_table(x)  i_beta_table[(x)+24]
 #define tc0_table(x)   i_tc0_table[(x)+24]
 /* From ffmpeg.
 * Filter one line of luma samples across an edge. pix points at q0 and
 * xstride is the step across the edge, so p0..p2 sit at negative offsets and
 * q0..q2 at non-negative offsets. Nothing is written unless the alpha/beta
 * activity tests pass. tc0 is the base clipping value; the p0/q0 clip range
 * grows by one for each side whose |x2 - x0| is below beta. */
 static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc0 )
 {
    const int p2 = pix[-3*xstride], p1 = pix[-2*xstride], p0 = pix[-1*xstride];
    const int q0 = pix[ 0*xstride], q1 = pix[ 1*xstride], q2 = pix[ 2*xstride];
    /* edge too strong or neighbourhood too active: leave samples alone */
    if( abs( p0 - q0 ) >= alpha || abs( p1 - p0 ) >= beta || abs( q1 - q0 ) >= beta )
        return;
    const int mid = (p0 + q0 + 1) >> 1;
    int tc = tc0;
    if( abs( p2 - p0 ) < beta )
    {
        /* p1 is only rewritten when the base clip is non-zero */
        if( tc0 )
            pix[-2*xstride] = p1 + x264_clip3( ((p2 + mid) >> 1) - p1, -tc0, tc0 );
        tc++;
    }
    if( abs( q2 - q0 ) < beta )
    {
        /* q1 is only rewritten when the base clip is non-zero */
        if( tc0 )
            pix[ 1*xstride] = q1 + x264_clip3( ((q2 + mid) >> 1) - q1, -tc0, tc0 );
        tc++;
    }
    const int delta = x264_clip3( (4*(q0 - p0) + (p1 - q1) + 4) >> 3, -tc, tc );
    pix[-1*xstride] = x264_clip_pixel( p0 + delta );    /* p0' */
    pix[ 0*xstride] = x264_clip_pixel( q0 - delta );    /* q0' */
 }
 /* Filter a 16-sample luma edge as four groups of four lines. Group i uses
 * tc0[i]; a negative tc0[i] skips that group. xstride steps across the edge,
 * ystride steps along it. */
 static inline void deblock_luma_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 )
 {
    for( int grp = 0; grp < 4; grp++, pix += 4*ystride )
    {
        if( tc0[grp] < 0 )
            continue;
        for( int line = 0; line < 4; line++ )
            deblock_edge_luma_c( pix + line*ystride, xstride, alpha, beta, tc0[grp] );
    }
 }
 /* MBAFF variant: filter 8 lines with an across-edge step of 1, two lines per
 * tc0 entry. Unlike deblock_luma_c(), no tc0 entry is skipped here. */
 static void deblock_h_luma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
 {
    for( int line = 0; line < 8; line++ )
        deblock_edge_luma_c( pix + line*stride, 1, alpha, beta, tc0[line/2] );
 }
 /* deblock_luma_c() with the across-edge step set to stride and the
 * along-edge step set to 1. */
 static void deblock_v_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
 {
    const intptr_t across = stride;
    const intptr_t along  = 1;
    deblock_luma_c( pix, across, along, alpha, beta, tc0 );
 }
 /* deblock_luma_c() with the across-edge step set to 1 and the
 * along-edge step set to stride. */
 static void deblock_h_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
 {
    const intptr_t across = 1;
    const intptr_t along  = stride;
    deblock_luma_c( pix, across, along, alpha, beta, tc0 );
 }
 /* Filter one chroma sample pair (p0/q0) across an edge. pix points at q0 and
 * xstride is the step across the edge. Only p0 and q0 are modified, by a
 * delta clipped to [-tc, tc]. */
 static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc )
 {
    const int p1 = pix[-2*xstride], p0 = pix[-1*xstride];
    const int q0 = pix[ 0*xstride], q1 = pix[ 1*xstride];
    /* bail out when any of the three activity tests fails */
    if( abs( p0 - q0 ) >= alpha || abs( p1 - p0 ) >= beta || abs( q1 - q0 ) >= beta )
        return;
    const int delta = x264_clip3( (4*(q0 - p0) + (p1 - q1) + 4) >> 3, -tc, tc );
    pix[-1*xstride] = x264_clip_pixel( p0 + delta );    /* p0' */
    pix[ 0*xstride] = x264_clip_pixel( q0 - delta );    /* q0' */
 }
 /* Filter a chroma edge as four groups of `height` rows; group i uses tc0[i]
 * and is skipped when tc0[i] <= 0. Each row filters two adjacent samples
 * (pix and pix+1) and rows are ystride apart. */
 static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 )
 {
    for( int grp = 0; grp < 4; grp++, pix += height*ystride )
    {
        if( tc0[grp] <= 0 )
            continue;
        for( int row = 0; row < height; row++ )
        {
            pixel *line = pix + row*ystride;
            deblock_edge_chroma_c( line,     xstride, alpha, beta, tc0[grp] );
            deblock_edge_chroma_c( line + 1, xstride, alpha, beta, tc0[grp] );
        }
    }
 }
 /* deblock_chroma_c() with 1 row per tc0 entry, across-edge step 2,
 * row step stride. */
 static void deblock_h_chroma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
 {
    const int rows = 1;
    const intptr_t across = 2;
    deblock_chroma_c( pix, rows, across, stride, alpha, beta, tc0 );
 }
 /* deblock_chroma_c() with 2 rows per tc0 entry, across-edge step stride,
 * row step 2. */
 static void deblock_v_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
 {
    const int rows = 2;
    const intptr_t along = 2;
    deblock_chroma_c( pix, rows, stride, along, alpha, beta, tc0 );
 }
 /* deblock_chroma_c() with 2 rows per tc0 entry, across-edge step 2,
 * row step stride. */
 static void deblock_h_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
 {
    const int rows = 2;
    const intptr_t across = 2;
    deblock_chroma_c( pix, rows, across, stride, alpha, beta, tc0 );
 }
 /* deblock_chroma_c() with 4 rows per tc0 entry, across-edge step 2,
 * row step stride. */
 static void deblock_h_chroma_422_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
 {
    const int rows = 4;
    const intptr_t across = 2;
    deblock_chroma_c( pix, rows, across, stride, alpha, beta, tc0 );
 }
 /* Intra luma filter for one line of samples across an edge (pix points at q0,
 * xstride steps across the edge). When |p0-q0| is below (alpha>>2)+2, each side
 * whose |x2-x0| is below beta gets the three-sample filter (which also reads
 * x3); otherwise that side only has x0 replaced. */
 static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta )
 {
    const int p2 = pix[-3*xstride], p1 = pix[-2*xstride], p0 = pix[-1*xstride];
    const int q0 = pix[ 0*xstride], q1 = pix[ 1*xstride], q2 = pix[ 2*xstride];
    if( abs( p0 - q0 ) >= alpha || abs( p1 - p0 ) >= beta || abs( q1 - q0 ) >= beta )
        return;
    /* the three-sample filter is only eligible for a small step across the edge */
    const int small_step = abs( p0 - q0 ) < ((alpha >> 2) + 2);
    if( small_step && abs( p2 - p0 ) < beta ) /* p0', p1', p2' */
    {
        const int p3 = pix[-4*xstride];
        pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
        pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
        pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
    }
    else /* p0' */
        pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
    if( small_step && abs( q2 - q0 ) < beta ) /* q0', q1', q2' */
    {
        const int q3 = pix[3*xstride];
        pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
        pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
        pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
    }
    else /* q0' */
        pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
 }
 /* Apply the intra luma edge filter to 16 consecutive lines, ystride apart. */
 static inline void deblock_luma_intra_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta )
 {
    for( int line = 0; line < 16; line++ )
        deblock_edge_luma_intra_c( pix + line*ystride, xstride, alpha, beta );
 }
 /* MBAFF variant: intra luma edge filter on 8 lines, across-edge step 1. */
 static void deblock_h_luma_intra_mbaff_c( pixel *pix, intptr_t ystride, int alpha, int beta )
 {
    for( int line = 0; line < 8; line++ )
        deblock_edge_luma_intra_c( pix + line*ystride, 1, alpha, beta );
 }
 /* deblock_luma_intra_c() with across-edge step stride, along-edge step 1. */
 static void deblock_v_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
 {
    const intptr_t across = stride;
    const intptr_t along  = 1;
    deblock_luma_intra_c( pix, across, along, alpha, beta );
 }
 /* deblock_luma_intra_c() with across-edge step 1, along-edge step stride. */
 static void deblock_h_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
 {
    const intptr_t across = 1;
    const intptr_t along  = stride;
    deblock_luma_intra_c( pix, across, along, alpha, beta );
 }
 /* Intra chroma filter for one sample pair across an edge: when the alpha/beta
 * tests pass, p0 and q0 are replaced by fixed weighted averages (no clipping
 * value is involved). pix points at q0; xstride steps across the edge. */
 static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta )
 {
    const int p1 = pix[-2*xstride], p0 = pix[-1*xstride];
    const int q0 = pix[ 0*xstride], q1 = pix[ 1*xstride];
    if( abs( p0 - q0 ) >= alpha || abs( p1 - p0 ) >= beta || abs( q1 - q0 ) >= beta )
        return;
    pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
    pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
 }
 /* Apply the intra chroma edge filter to `height` rows of `width` adjacent
 * samples. The pointer advances by one per sample and by a further ystride-2
 * per row, so consecutive rows start (width + ystride - 2) samples apart --
 * this equals ystride only when width is 2. */
 static ALWAYS_INLINE void deblock_chroma_intra_c( pixel *pix, int width, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta )
 {
    const intptr_t row_step = ystride - 2 + width;
    for( int row = 0; row < height; row++ )
        for( int col = 0; col < width; col++ )
            deblock_edge_chroma_intra_c( pix + row*row_step + col, xstride, alpha, beta );
 }
 /* deblock_chroma_intra_c() over 4 rows of 2 samples, across-edge step 2,
 * row step stride. */
 static void deblock_h_chroma_intra_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta )
 {
    const int width = 2, height = 4;
    const intptr_t across = 2;
    deblock_chroma_intra_c( pix, width, height, across, stride, alpha, beta );
 }
 /* deblock_chroma_intra_c() over 16 rows of 1 sample, across-edge step stride.
 * With width 1 and a row step argument of 2, the pointer effectively advances
 * one sample per row, i.e. 16 consecutive samples are filtered. */
 static void deblock_v_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
 {
    const int width = 1, height = 16;
    const intptr_t along = 2;
    deblock_chroma_intra_c( pix, width, height, stride, along, alpha, beta );
 }
 /* deblock_chroma_intra_c() over 8 rows of 2 samples, across-edge step 2,
 * row step stride. */
 static void deblock_h_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
 {
    const int width = 2, height = 8;
    const intptr_t across = 2;
    deblock_chroma_intra_c( pix, width, height, across, stride, alpha, beta );
 }
 /* deblock_chroma_intra_c() over 16 rows of 2 samples, across-edge step 2,
 * row step stride. */
 static void deblock_h_chroma_422_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
 {
    const int width = 2, height = 16;
    const intptr_t across = 2;
    deblock_chroma_intra_c( pix, width, height, across, stride, alpha, beta );
 }
 /* Compute boundary strengths for the four edges in each direction and store
 * them in bs[dir][edge][0..3] (edges 4..7 of bs are not written here).
 * For every 4x4 position the current cell is compared with its neighbour
 * across the edge (one step of 1 for dir 0, 8 for dir 1 in the scan8 layout):
 *   2 - either cell has a non-zero nnz entry
 *   1 - list-0 refs differ, or an mv component differs by >= 4 (x) or
 *       >= mvy_limit (y); when bframe is set, the same tests on list 1
 *   0 - otherwise */
 static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit,
                                int bframe )
 {
    for( int dir = 0; dir < 2; dir++ )
    {
        const int along  = dir ? 1 : 8;   /* step between the 4 positions on one edge */
        const int across = dir ? 8 : 1;   /* step from one edge to the next / to the neighbour */
        for( int edge = 0; edge < 4; edge++ )
            for( int i = 0; i < 4; i++ )
            {
                const int cur = X264_SCAN8_0 + edge*across + i*along;
                const int nbr = cur - across;
                int strength;
                if( nnz[cur] || nnz[nbr] )
                    strength = 2;
                else
                {
                    int motion_differs = ref[0][cur] != ref[0][nbr] ||
                                         abs( mv[0][cur][0] - mv[0][nbr][0] ) >= 4 ||
                                         abs( mv[0][cur][1] - mv[0][nbr][1] ) >= mvy_limit;
                    /* list 1 is only consulted for B-frames, and only if list 0 matched */
                    if( !motion_differs && bframe )
                        motion_differs = ref[1][cur] != ref[1][nbr] ||
                                         abs( mv[1][cur][0] - mv[1][nbr][0] ) >= 4 ||
                                         abs( mv[1][cur][1] - mv[1][nbr][1] ) >= mvy_limit;
                    strength = motion_differs;
                }
                bs[dir][edge][i] = strength;
            }
    }
 }
 /* Look up the thresholds for one non-intra edge and call pf_inter on it.
 * alpha is looked up at i_qp+a, beta at i_qp+b, both scaled by
 * 1 << (BIT_DEPTH-8). The filter is skipped when all four strengths in bS are
 * zero or either threshold is zero. Each tc[] entry is the scaled tc0 table
 * value for the matching bS byte, plus b_chroma. The h argument is unused. */
 static ALWAYS_INLINE void deblock_edge( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp,
                                        int a, int b, int b_chroma, x264_deblock_inter_t pf_inter )
 {
    const int idx_alpha = i_qp + a;
    const int idx_beta  = i_qp + b;
    const int alpha = alpha_table(idx_alpha) << (BIT_DEPTH-8);
    const int beta  = beta_table(idx_beta) << (BIT_DEPTH-8);
    if( M32(bS) && alpha && beta )
    {
        int8_t tc[4];
        for( int i = 0; i < 4; i++ )
            tc[i] = (tc0_table(idx_alpha)[bS[i]] * (1 << (BIT_DEPTH-8))) + b_chroma;
        pf_inter( pix, i_stride, alpha, beta, tc );
    }
 }
 /* Look up the thresholds for one intra edge and call pf_intra on it, unless
 * either threshold is zero. The h, bS and b_chroma arguments are unused; the
 * parameter list mirrors deblock_edge() so the FILTER macros can paste either
 * name. */
 static ALWAYS_INLINE void deblock_edge_intra( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp,
                                              int a, int b, int b_chroma, x264_deblock_intra_t pf_intra )
 {
    const int alpha = alpha_table(i_qp + a) << (BIT_DEPTH-8);
    const int beta  = beta_table(i_qp + b) << (BIT_DEPTH-8);
    if( alpha && beta )
        pf_intra( pix, i_stride, alpha, beta );
 }
 /* Set up the neighbour state in h->mb that the deblocking loop reads for the
 * macroblock at (mb_x, mb_y): i_mb_xy, b_interlaced, i_mb_top_y, i_mb_top_xy,
 * i_mb_left_xy[0..1] and the MB_LEFT / MB_TOP bits of i_neighbour.
 * A neighbour is flagged as available if it lies inside the picture and either
 * disable_deblocking_filter_idc != 2, or it has the same slice_table entry as
 * the current macroblock. */
 static ALWAYS_INLINE void macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
 {
    int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
    h->mb.i_neighbour = 0;
    h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
    /* NOTE(review): MB_INTERLACED below presumably reads h->mb.b_interlaced, so
     * this assignment has to stay ahead of its first use -- confirm. */
    h->mb.b_interlaced = PARAM_INTERLACED && h->mb.field[h->mb.i_mb_xy];
    h->mb.i_mb_top_y = mb_y - (1 << MB_INTERLACED);
    h->mb.i_mb_top_xy = mb_x + h->mb.i_mb_stride*h->mb.i_mb_top_y;
    /* default: both left neighbours are the MB immediately to the left */
    h->mb.i_mb_left_xy[1] =
    h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1;
    if( SLICE_MBAFF )
    {
        if( mb_y&1 )
        {
            /* odd row, left MB has the other field/frame mode: first left neighbour is one row up */
            if( mb_x && h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED )
                h->mb.i_mb_left_xy[0] -= h->mb.i_mb_stride;
        }
        else
        {
            /* even row, current MB is field and the MB at top_xy is frame: use the row below it */
            if( h->mb.i_mb_top_xy >= 0 && MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy] )
            {
                h->mb.i_mb_top_xy += h->mb.i_mb_stride;
                h->mb.i_mb_top_y++;
            }
            /* even row, left MB has the other mode: second left neighbour is one row down */
            if( mb_x && h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED )
                h->mb.i_mb_left_xy[1] += h->mb.i_mb_stride;
        }
    }
    if( mb_x > 0 && (deblock_on_slice_edges ||
        h->mb.slice_table[h->mb.i_mb_left_xy[0]] == h->mb.slice_table[h->mb.i_mb_xy]) )
        h->mb.i_neighbour |= MB_LEFT;
    if( mb_y > MB_INTERLACED && (deblock_on_slice_edges
        || h->mb.slice_table[h->mb.i_mb_top_xy] == h->mb.slice_table[h->mb.i_mb_xy]) )
        h->mb.i_neighbour |= MB_TOP;
 }
 /* Run the deblocking filter over macroblock row mb_y of h->fdec.
 * Per macroblock the order is: left MB edge (if MB_LEFT is set), internal
 * vertical edges 1..3, top MB edge (if MB_TOP is set), internal horizontal
 * edges 1..3. Strengths come from h->deblock_strength.
 * When SLICE_MBAFF is non-zero (assumed to be 0 or 1) the loop visits
 * (mb_x, mb_y) and then (mb_x, mb_y^1) before advancing mb_x, so a vertical
 * pair of rows is processed. */
 void x264_frame_deblock_row( x264_t *h, int mb_y )
 {
    int b_interlaced = SLICE_MBAFF;
    int a = h->sh.i_alpha_c0_offset - QP_BD_OFFSET;
    int b = h->sh.i_beta_offset - QP_BD_OFFSET;
    /* macroblocks with qp <= qp_thresh only get their first edges filtered (see first_edge_only) */
    int qp_thresh = 15 - X264_MIN( a, b ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset );
    int stridey   = h->fdec->i_stride[0];
    int strideuv  = h->fdec->i_stride[1];
    int chroma_format = CHROMA_FORMAT;
    int chroma444 = CHROMA444;
    int chroma_height = 16 >> CHROMA_V_SHIFT;
    /* offset from the first chroma plane to the second: a plane distance for 4:4:4, otherwise 1 */
    intptr_t uvdiff = chroma444 ? h->fdec->plane[2] - h->fdec->plane[1] : 1;
    for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
    {
        x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
        macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y );
        int mb_xy = h->mb.i_mb_xy;
        int transform_8x8 = h->mb.mb_transform_size[mb_xy];
        int intra_cur = IS_INTRA( h->mb.type[mb_xy] );
        uint8_t (*bs)[8][4] = h->deblock_strength[mb_y&1][h->param.b_sliced_threads?mb_xy:mb_x];
        pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey  + 16*mb_x;
        pixel *pixuv = CHROMA_FORMAT ? h->fdec->plane[1] + chroma_height*mb_y*strideuv + 16*mb_x : NULL;
        /* taken for an odd mb_y when MB_INTERLACED is non-zero: move back to the
         * second line of the macroblock row above */
        if( mb_y & MB_INTERLACED )
        {
            pixy -= 15*stridey;
            if( CHROMA_FORMAT )
                pixuv -= (chroma_height-1)*strideuv;
        }
        /* line strides are doubled when MB_INTERLACED is non-zero */
        int stride2y  = stridey << MB_INTERLACED;
        int stride2uv = strideuv << MB_INTERLACED;
        int qp = h->mb.qp[mb_xy];
        int qpc = h->chroma_qp_table[qp];
        int first_edge_only = (h->mb.partition[mb_xy] == D_16x16 && !h->mb.cbp[mb_xy] && !intra_cur) || qp <= qp_thresh;
        /* FILTER( intra, dir, edge, qp, chroma_qp ): filter edge `edge` (0..3) in
         * direction `dir`; `intra` is either empty or _intra and is pasted onto
         * the helper and function-table names.
         *  - luma (and both 4:4:4 chroma planes, via the luma functions): skipped
         *    on odd edges when transform_8x8 is set
         *  - 4:2:0 chroma: even edges only, and only when luma is filtered
         *  - 4:2:2 chroma: every edge for dir 1, even edges for dir 0 */
        #define FILTER( intra, dir, edge, qp, chroma_qp )\
        do\
        {\
            if( !(edge & 1) || !transform_8x8 )\
            {\
                deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\
                                     stride2y, bs[dir][edge], qp, a, b, 0,\
                                     h->loopf.deblock_luma##intra[dir] );\
                if( chroma_format == CHROMA_444 )\
                {\
                    deblock_edge##intra( h, pixuv          + 4*edge*(dir?stride2uv:1),\
                                         stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
                                         h->loopf.deblock_luma##intra[dir] );\
                    deblock_edge##intra( h, pixuv + uvdiff + 4*edge*(dir?stride2uv:1),\
                                         stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
                                         h->loopf.deblock_luma##intra[dir] );\
                }\
                else if( chroma_format == CHROMA_420 && !(edge & 1) )\
                {\
                    deblock_edge##intra( h, pixuv + edge*(dir?2*stride2uv:4),\
                                         stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\
                                         h->loopf.deblock_chroma##intra[dir] );\
                }\
            }\
            if( chroma_format == CHROMA_422 && (dir || !(edge & 1)) )\
            {\
                deblock_edge##intra( h, pixuv + edge*(dir?4*stride2uv:4),\
                                     stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\
                                     h->loopf.deblock_chroma##intra[dir] );\
            }\
        } while( 0 )
        if( h->mb.i_neighbour & MB_LEFT )
        {
            /* left neighbour has the other field/frame mode: the left edge is
             * filtered in two halves (bs[0][0] and bs[0][4]), each against its own
             * left neighbour and with the *_mbaff functions */
            if( b_interlaced && h->mb.field[h->mb.i_mb_left_xy[0]] != MB_INTERLACED )
            {
                int luma_qp[2];
                int chroma_qp[2];
                int left_qp[2];
                x264_deblock_inter_t luma_deblock = h->loopf.deblock_luma_mbaff;
                x264_deblock_inter_t chroma_deblock = h->loopf.deblock_chroma_mbaff;
                x264_deblock_intra_t luma_intra_deblock = h->loopf.deblock_luma_intra_mbaff;
                x264_deblock_intra_t chroma_intra_deblock = h->loopf.deblock_chroma_intra_mbaff;
                /* b_chroma argument for the chroma calls: 0 for 4:4:4, otherwise 1 */
                int c = chroma444 ? 0 : 1;
                left_qp[0] = h->mb.qp[h->mb.i_mb_left_xy[0]];
                luma_qp[0] = (qp + left_qp[0] + 1) >> 1;
                chroma_qp[0] = (qpc + h->chroma_qp_table[left_qp[0]] + 1) >> 1;
                if( intra_cur || IS_INTRA( h->mb.type[h->mb.i_mb_left_xy[0]] ) )
                {
                    deblock_edge_intra( h, pixy,           2*stridey,  bs[0][0], luma_qp[0],   a, b, 0, luma_intra_deblock );
                    if( chroma_format )
                    {
                        deblock_edge_intra( h, pixuv,          2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock );
                        if( chroma444 )
                            deblock_edge_intra( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock );
                    }
                }
                else
                {
                    deblock_edge( h, pixy,           2*stridey,  bs[0][0], luma_qp[0],   a, b, 0, luma_deblock );
                    if( chroma_format )
                    {
                        deblock_edge( h, pixuv,          2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock );
                        if( chroma444 )
                            deblock_edge( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock );
                    }
                }
                /* start of the second half: stride<<off, i.e. 16*stride (luma) when
                 * MB_INTERLACED is non-zero, otherwise a single line down */
                int offy = MB_INTERLACED ? 4 : 0;
                int offuv = MB_INTERLACED ? 4-CHROMA_V_SHIFT : 0;
                left_qp[1] = h->mb.qp[h->mb.i_mb_left_xy[1]];
                luma_qp[1] = (qp + left_qp[1] + 1) >> 1;
                chroma_qp[1] = (qpc + h->chroma_qp_table[left_qp[1]] + 1) >> 1;
                if( intra_cur || IS_INTRA( h->mb.type[h->mb.i_mb_left_xy[1]] ) )
                {
                    deblock_edge_intra( h, pixy           + (stridey<<offy),   2*stridey,  bs[0][4], luma_qp[1],   a, b, 0, luma_intra_deblock );
                    if( chroma_format )
                    {
                        deblock_edge_intra( h, pixuv          + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_intra_deblock );
                        if( chroma444 )
                            deblock_edge_intra( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_intra_deblock );
                    }
                }
                else
                {
                    deblock_edge( h, pixy           + (stridey<<offy),   2*stridey,  bs[0][4], luma_qp[1],   a, b, 0, luma_deblock );
                    if( chroma_format )
                    {
                        deblock_edge( h, pixuv          + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_deblock );
                        if( chroma444 )
                            deblock_edge( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], a, b, c, chroma_deblock );
                    }
                }
            }
            else
            {
                int qpl = h->mb.qp[h->mb.i_mb_xy-1];
                int qp_left = (qp + qpl + 1) >> 1;
                int qpc_left = (qpc + h->chroma_qp_table[qpl] + 1) >> 1;
                int intra_left = IS_INTRA( h->mb.type[h->mb.i_mb_xy-1] );
                int intra_deblock = intra_cur || intra_left;
                /* Any MB that was coded, or that analysis decided to skip, has quality commensurate with its QP.
                 * But if deblocking affects neighboring MBs that were force-skipped, blur might accumulate there.
                 * So reset their effective QP to max, to indicate that lack of guarantee. */
                if( h->fdec->mb_info && M32( bs[0][0] ) )
                {
                    /* NOTE: this macro carries its own trailing ';', is never #undef'd,
                     * and is reused in the MB_TOP branch further down. */
 #define RESET_EFFECTIVE_QP(xy) h->fdec->effective_qp[xy] |= 0xff * !!(h->fdec->mb_info[xy] & X264_MBINFO_CONSTANT);
                    RESET_EFFECTIVE_QP(mb_xy);
                    RESET_EFFECTIVE_QP(h->mb.i_mb_left_xy[0]);
                }
                if( intra_deblock )
                    FILTER( _intra, 0, 0, qp_left, qpc_left );
                else
                    FILTER(       , 0, 0, qp_left, qpc_left );
            }
        }
        /* internal vertical edges */
        if( !first_edge_only )
        {
            FILTER( , 0, 1, qp, qpc );
            FILTER( , 0, 2, qp, qpc );
            FILTER( , 0, 3, qp, qpc );
        }
        if( h->mb.i_neighbour & MB_TOP )
        {
            /* frame MB on an even row below a field MB: the top edge is filtered
             * twice (j = 0, 1), each pass using bs[1][4*j], a line stride of
             * 2*stride and the MB at mbn_xy as the neighbour */
            if( b_interlaced && !(mb_y&1) && !MB_INTERLACED && h->mb.field[h->mb.i_mb_top_xy] )
            {
                int mbn_xy = mb_xy - 2 * h->mb.i_mb_stride;
                for( int j = 0; j < 2; j++, mbn_xy += h->mb.i_mb_stride )
                {
                    int qpt = h->mb.qp[mbn_xy];
                    int qp_top = (qp + qpt + 1) >> 1;
                    int qpc_top = (qpc + h->chroma_qp_table[qpt] + 1) >> 1;
                    int intra_top = IS_INTRA( h->mb.type[mbn_xy] );
                    /* intra on either side: strength 3 everywhere, still via the non-intra filter */
                    if( intra_cur || intra_top )
                        M32( bs[1][4*j] ) = 0x03030303;
                    // deblock the first horizontal edge of the even rows, then the first horizontal edge of the odd rows
                    deblock_edge( h, pixy      + j*stridey,  2* stridey, bs[1][4*j], qp_top, a, b, 0, h->loopf.deblock_luma[1] );
                    if( chroma444 )
                    {
                        deblock_edge( h, pixuv          + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 0, h->loopf.deblock_luma[1] );
                        deblock_edge( h, pixuv + uvdiff + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 0, h->loopf.deblock_luma[1] );
                    }
                    else if( chroma_format )
                        deblock_edge( h, pixuv          + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 1, h->loopf.deblock_chroma[1] );
                }
            }
            else
            {
                int qpt = h->mb.qp[h->mb.i_mb_top_xy];
                int qp_top = (qp + qpt + 1) >> 1;
                int qpc_top = (qpc + h->chroma_qp_table[qpt] + 1) >> 1;
                int intra_top = IS_INTRA( h->mb.type[h->mb.i_mb_top_xy] );
                int intra_deblock = intra_cur || intra_top;
                /* This edge has been modified, reset effective qp to max. */
                if( h->fdec->mb_info && M32( bs[1][0] ) )
                {
                    RESET_EFFECTIVE_QP(mb_xy);
                    RESET_EFFECTIVE_QP(h->mb.i_mb_top_xy);
                }
                /* the intra filter is used only without MBAFF, or when both this
                 * MB and the top MB are frame MBs */
                if( (!b_interlaced || (!MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy])) && intra_deblock )
                {
                    FILTER( _intra, 1, 0, qp_top, qpc_top );
                }
                else
                {
                    /* otherwise an intra edge uses strength 3 with the non-intra filter */
                    if( intra_deblock )
                        M32( bs[1][0] ) = 0x03030303;
                    FILTER(       , 1, 0, qp_top, qpc_top );
                }
            }
        }
        /* internal horizontal edges */
        if( !first_edge_only )
        {
            FILTER( , 1, 1, qp, qpc );
            FILTER( , 1, 2, qp, qpc );
            FILTER( , 1, 3, qp, qpc );
        }
        #undef FILTER
    }
 }
 /* For deblock-aware RD.
 * Filters only the internal edges (1..3 in each direction) of the current
 * macroblock, in place in h->mb.pic.p_fdec with stride FDEC_STRIDE. Luma is
 * always filtered; the two chroma planes are filtered only for CHROMA444.
 * Returns without filtering when the MB is a non-intra 16x16 partition with no
 * luma cbp, or when its qp is <= qp_thresh.
 * TODO:
 *  deblock macroblock edges
 *  support analysis partitions smaller than 16x16
 *  deblock chroma for 4:2:0/4:2:2
 *  handle duplicate refs correctly
 */
 void x264_macroblock_deblock( x264_t *h )
 {
    int a = h->sh.i_alpha_c0_offset - QP_BD_OFFSET;
    int b = h->sh.i_beta_offset - QP_BD_OFFSET;
    int qp_thresh = 15 - X264_MIN( a, b ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset );
    int intra_cur = IS_INTRA( h->mb.i_type );
    int qp = h->mb.i_qp;
    int qpc = h->mb.i_chroma_qp;
    if( (h->mb.i_partition == D_16x16 && !h->mb.i_cbp_luma && !intra_cur) || qp <= qp_thresh )
        return;
    uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength;
    if( intra_cur )
    {
        /* intra MB: strength 3 on edges 1..3 of both directions
         * (M32 covers edge 1, M64 covers edges 2 and 3) */
        M32( bs[0][1] ) = 0x03030303;
        M64( bs[0][2] ) = 0x0303030303030303ULL;
        M32( bs[1][1] ) = 0x03030303;
        M64( bs[1][2] ) = 0x0303030303030303ULL;
    }
    else
        h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv,
                                   bs, 4 >> MB_INTERLACED, h->sh.i_type == SLICE_TYPE_B );
    int transform_8x8 = h->mb.b_transform_8x8;
    /* FILTER( dir, edge ): filter internal edge `edge` in direction `dir` on luma,
     * and on both chroma planes (with the luma functions and qpc) for CHROMA444 */
    #define FILTER( dir, edge )\
    do\
    {\
        deblock_edge( h, h->mb.pic.p_fdec[0] + 4*edge*(dir?FDEC_STRIDE:1),\
                      FDEC_STRIDE, bs[dir][edge], qp, a, b, 0,\
                      h->loopf.deblock_luma[dir] );\
        if( CHROMA444 )\
        {\
            deblock_edge( h, h->mb.pic.p_fdec[1] + 4*edge*(dir?FDEC_STRIDE:1),\
                          FDEC_STRIDE, bs[dir][edge], qpc, a, b, 0,\
                          h->loopf.deblock_luma[dir] );\
            deblock_edge( h, h->mb.pic.p_fdec[2] + 4*edge*(dir?FDEC_STRIDE:1),\
                          FDEC_STRIDE, bs[dir][edge], qpc, a, b, 0,\
                          h->loopf.deblock_luma[dir] );\
        }\
    } while( 0 )
    /* with the 8x8 transform only the middle edge (2) is filtered */
    if( !transform_8x8 ) FILTER( 0, 1 );
    FILTER( 0, 2 );
    if( !transform_8x8 ) FILTER( 0, 3 );
    if( !transform_8x8 ) FILTER( 1, 1 );
    FILTER( 1, 2 );
    if( !transform_8x8 ) FILTER( 1, 3 );
    #undef FILTER
 }
 #if HAVE_MMX
 #include "x86/deblock.h"
 #endif
 #if HAVE_ALTIVEC
 #include "ppc/deblock.h"
 #endif
 #if HAVE_ARMV6
 #include "arm/deblock.h"
 #endif
 #if HAVE_AARCH64
 #include "aarch64/deblock.h"
 #endif
 #if HAVE_MSA
 #include "mips/deblock.h"
 #endif
 #if HAVE_LSX
 #include "loongarch/deblock.h"
 #endif
 void x264_deblock_init( uint32_t cpu, x264_deblock_function_t *pf, int b_mbaff )
 {
    pf->deblock_luma[1] = deblock_v_luma_c;
    pf->deblock_luma[0] = deblock_h_luma_c;
    pf->deblock_chroma[1] = deblock_v_chroma_c;
    pf->deblock_h_chroma_420 = deblock_h_chroma_c;
    pf->deblock_h_chroma_422 = deblock_h_chroma_422_c;
    pf->deblock_luma_intra[1] = deblock_v_luma_intra_c;
    pf->deblock_luma_intra[0] = deblock_h_luma_intra_c;
    pf->deblock_chroma_intra[1] = deblock_v_chroma_intra_c;
    pf->deblock_h_chroma_420_intra = deblock_h_chroma_intra_c;
    pf->deblock_h_chroma_422_intra = deblock_h_chroma_422_intra_c;
    pf->deblock_luma_mbaff = deblock_h_luma_mbaff_c;
    pf->deblock_chroma_420_mbaff = deblock_h_chroma_mbaff_c;
    pf->deblock_luma_intra_mbaff = deblock_h_luma_intra_mbaff_c;
    pf->deblock_chroma_420_intra_mbaff = deblock_h_chroma_intra_mbaff_c;
    pf->deblock_strength = deblock_strength_c;
 #if HAVE_MMX
    if( cpu&X264_CPU_MMX2 )
    {
 #if ARCH_X86
        pf->deblock_luma[1] = x264_deblock_v_luma_mmx2;
        pf->deblock_luma[0] = x264_deblock_h_luma_mmx2;
        pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2;
        pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2;
        pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_mmx2;
        pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_mmx2;
        pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_mmx2;
        pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2;
        pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2;
        pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2;
        pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_mmx2;
        pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2;
 #endif
 #if !HIGH_BIT_DEPTH
        pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2;
 #endif
        if( cpu&X264_CPU_SSE2 )
        {
            pf->deblock_strength = x264_deblock_strength_sse2;
            pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2;
            pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2;
            pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_sse2;
            pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_sse2;
            pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
            pf->deblock_luma[0] = x264_deblock_h_luma_sse2;
            pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
            pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
            if( !(cpu&X264_CPU_STACK_MOD4) )
            {
                pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2;
                pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2;
                pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_sse2;
 #if HIGH_BIT_DEPTH
                pf->deblock_chroma_420_intra_mbaff= x264_deblock_h_chroma_intra_mbaff_sse2;
 #endif
            }
        }
        if( cpu&X264_CPU_SSSE3 )
            pf->deblock_strength = x264_deblock_strength_ssse3;
        if( cpu&X264_CPU_AVX )
        {
            pf->deblock_strength = x264_deblock_strength_avx;
            pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
            pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_avx;
            pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_avx;
            pf->deblock_luma[1] = x264_deblock_v_luma_avx;
            pf->deblock_luma[0] = x264_deblock_h_luma_avx;
            pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx;
            pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
            if( !(cpu&X264_CPU_STACK_MOD4) )
            {
                pf->deblock_chroma[1] = x264_deblock_v_chroma_avx;
                pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx;
                pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_avx;
 #if HIGH_BIT_DEPTH
                pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_avx;
                pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_avx;
 #endif
            }
        }
        if( cpu&X264_CPU_AVX2 )
        {
            pf->deblock_strength = x264_deblock_strength_avx2;
        }
        if( cpu&X264_CPU_AVX512 )
        {
            pf->deblock_strength = x264_deblock_strength_avx512;
        }
    }
 #endif
 #if !HIGH_BIT_DEPTH
 #if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        pf->deblock_luma[1] = x264_deblock_v_luma_altivec;
        pf->deblock_luma[0] = x264_deblock_h_luma_altivec;
    }
 #endif // HAVE_ALTIVEC
 #if HAVE_ARMV6 || HAVE_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        pf->deblock_luma[1] = x264_deblock_v_luma_neon;
        pf->deblock_luma[0] = x264_deblock_h_luma_neon;
        pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
        pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
        pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon;
        pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon;
        pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
        pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
        pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
        pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon;
        pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_neon;
        pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_neon;
        pf->deblock_strength     = x264_deblock_strength_neon;
    }
 #if HAVE_SVE
    if ( cpu&X264_CPU_SVE )
    {
        pf->deblock_chroma[1] = x264_deblock_v_chroma_sve;
    }
 #endif
 #endif
 #if HAVE_MSA
    if( cpu&X264_CPU_MSA )
    {
        pf->deblock_luma[1] = x264_deblock_v_luma_msa;
        pf->deblock_luma[0] = x264_deblock_h_luma_msa;
        pf->deblock_chroma[1] = x264_deblock_v_chroma_msa;
        pf->deblock_h_chroma_420 = x264_deblock_h_chroma_msa;
        pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_msa;
        pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_msa;
        pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_msa;
        pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_msa;
        pf->deblock_strength = x264_deblock_strength_msa;
    }
 #endif
 #if HAVE_LSX
    if( cpu&X264_CPU_LSX )
    {
        pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_lsx;
        pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_lsx;
        pf->deblock_strength = x264_deblock_strength_lsx;
    }
    if( cpu&X264_CPU_LASX )
    {
        pf->deblock_luma[1] = x264_deblock_v_luma_lasx;
        pf->deblock_luma[0] = x264_deblock_h_luma_lasx;
        pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_lasx;
        pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_lasx;
        pf->deblock_strength = x264_deblock_strength_lasx;
    }
 #endif
 #endif // !HIGH_BIT_DEPTH
    /* These functions are equivalent, so don't duplicate them. */
    pf->deblock_chroma_422_mbaff = pf->deblock_h_chroma_420;
    pf->deblock_chroma_422_intra_mbaff = pf->deblock_h_chroma_420_intra;
 }
--- a/common/frame.c
+++ b/common/frame.c
@@ -0,0 +1,898 @@
 /*****************************************************************************
 * frame.c: frame handling
 *****************************************************************************
 * Copyright (C) 2003-2025 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "common.h"
 /* Returns ALIGN( x, align ), increased by one further `align` step when the
  * aligned value has no bits set under the (disalign-1) mask. */
 static int align_stride( int x, int align, int disalign )
 {
    int stride = ALIGN( x, align );
    int on_disalign_boundary = !(stride & (disalign-1));
    return on_disalign_boundary ? stride + align : stride;
 }
 /* Returns x unchanged unless it has no bits set under the (disalign-1) mask,
  * in which case X264_MAX( 128, NATIVE_ALIGN ) / SIZEOF_PIXEL is added. */
 static int align_plane_size( int x, int disalign )
 {
    int mask = disalign - 1;
    if( x & mask )
        return x;
    return x + X264_MAX( 128, NATIVE_ALIGN ) / SIZEOF_PIXEL;
 }
 /* Maps an external colorspace value (masked with X264_CSP_MASK) to the
  * colorspace used for internal frame storage, or X264_CSP_NONE when the value
  * falls in none of the handled ranges. */
 static int frame_internal_csp( int external_csp )
 {
    const int masked = external_csp & X264_CSP_MASK;
    int internal = X264_CSP_NONE;
    if( masked == X264_CSP_I400 )
        internal = X264_CSP_I400;
    else if( masked >= X264_CSP_I420 && masked < X264_CSP_I422 )
        internal = X264_CSP_NV12;
    else if( masked >= X264_CSP_I422 && masked < X264_CSP_I444 )
        internal = X264_CSP_NV16;
    else if( masked >= X264_CSP_I444 && masked <= X264_CSP_RGB )
        internal = X264_CSP_I444;
    return internal;
 }
 /* Allocates and initialises a new frame for encoder h.
  *
  * b_fdec selects which set of per-frame side buffers is reserved: the
  * "fdec frame" set when non-zero, the "fenc frame" set otherwise.
  * Every buffer is first reserved with PREALLOC and then materialised by
  * PREALLOC_END( frame->base ); x264_frame_delete() releases frame->base.
  *
  * Returns the new frame, or NULL on failure (unsupported colorspace,
  * allocation failure, or mutex/condition-variable initialisation failure).
  * On failure everything acquired so far is released. */
 static x264_frame_t *frame_new( x264_t *h, int b_fdec )
 {
    x264_frame_t *frame;
    int i_csp = frame_internal_csp( h->param.i_csp );
    int i_mb_count = h->mb.i_mb_count;
    int i_stride, i_width, i_lines, luma_plane_count;
    int i_padv = PADV << PARAM_INTERLACED;
    int align = NATIVE_ALIGN / SIZEOF_PIXEL;
 #if ARCH_X86 || ARCH_X86_64
    /* On x86 the stride alignment is chosen from the runtime CPU flags. */
    if( h->param.cpu&X264_CPU_CACHELINE_64 || h->param.cpu&X264_CPU_AVX512 )
        align = 64 / SIZEOF_PIXEL;
    else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX )
        align = 32 / SIZEOF_PIXEL;
    else
        align = 16 / SIZEOF_PIXEL;
 #endif
 #if ARCH_PPC
    int disalign = (1<<9) / SIZEOF_PIXEL;
 #else
    int disalign = (1<<10) / SIZEOF_PIXEL;
 #endif
    CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
    PREALLOC_INIT
    /* allocate frame data (+64 for extra data for me) */
    i_width  = h->mb.i_mb_width*16;
    i_lines  = h->mb.i_mb_height*16;
    i_stride = align_stride( i_width + PADH2, align, disalign );
    if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
    {
        /* One luma plane plus one chroma plane; the chroma plane keeps the
         * luma stride and is half height only for NV12. */
        luma_plane_count = 1;
        frame->i_plane = 2;
        for( int i = 0; i < 2; i++ )
        {
            frame->i_width[i] = i_width >> i;
            frame->i_lines[i] = i_lines >> (i && i_csp == X264_CSP_NV12);
            frame->i_stride[i] = i_stride;
        }
    }
    else if( i_csp == X264_CSP_I444 )
    {
        /* Three full-size planes, all handled like luma. */
        luma_plane_count = 3;
        frame->i_plane = 3;
        for( int i = 0; i < 3; i++ )
        {
            frame->i_width[i] = i_width;
            frame->i_lines[i] = i_lines;
            frame->i_stride[i] = i_stride;
        }
    }
    else if( i_csp == X264_CSP_I400 )
    {
        luma_plane_count = 1;
        frame->i_plane = 1;
        frame->i_width[0] = i_width;
        frame->i_lines[0] = i_lines;
        frame->i_stride[0] = i_stride;
    }
    else
        goto fail;
    frame->i_csp = i_csp;
    frame->i_width_lowres = frame->i_width[0]/2;
    frame->i_lines_lowres = frame->i_lines[0]/2;
    frame->i_stride_lowres = align_stride( frame->i_width_lowres + PADH2, align, disalign<<1 );
    for( int i = 0; i < h->param.i_bframe + 2; i++ )
        for( int j = 0; j < h->param.i_bframe + 2; j++ )
            PREALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
    frame->i_poc = -1;
    frame->i_type = X264_TYPE_AUTO;
    frame->i_qpplus1 = X264_QP_AUTO;
    frame->i_pts = -1;
    frame->i_frame = -1;
    frame->i_frame_num = -1;
    frame->i_lines_completed = -1;
    frame->b_fdec = b_fdec;
    frame->i_pic_struct = PIC_STRUCT_AUTO;
    frame->i_field_cnt = -1;
    frame->i_duration =
    frame->i_cpb_duration =
    frame->i_dpb_output_delay =
    frame->i_cpb_delay = 0;
    frame->i_coded_fields_lookahead =
    frame->i_cpb_delay_lookahead = -1;
    frame->orig = frame;
    if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
    {
        int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
        int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*chroma_padv));
        PREALLOC( frame->buffer[1], chroma_plane_size * SIZEOF_PIXEL );
        if( PARAM_INTERLACED )
            PREALLOC( frame->buffer_fld[1], chroma_plane_size * SIZEOF_PIXEL );
    }
    /* all 4 luma planes allocated together, since the cacheline split code
     * requires them to be in-phase wrt cacheline alignment. */
    for( int p = 0; p < luma_plane_count; p++ )
    {
        int64_t luma_plane_size = align_plane_size( frame->i_stride[p] * (frame->i_lines[p] + 2*i_padv), disalign );
        if( h->param.analyse.i_subpel_refine && b_fdec )
            luma_plane_size *= 4;
        /* FIXME: Don't allocate both buffers in non-adaptive MBAFF. */
        PREALLOC( frame->buffer[p], luma_plane_size * SIZEOF_PIXEL );
        if( PARAM_INTERLACED )
            PREALLOC( frame->buffer_fld[p], luma_plane_size * SIZEOF_PIXEL );
    }
    frame->b_duplicate = 0;
    if( b_fdec ) /* fdec frame */
    {
        PREALLOC( frame->mb_type, i_mb_count * sizeof(int8_t) );
        PREALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t) );
        PREALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
        PREALLOC( frame->mv16x16, 2*(i_mb_count+1) * sizeof(int16_t) );
        PREALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
        if( h->param.i_bframe )
        {
            PREALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
            PREALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
        }
        else
        {
            frame->mv[1]  = NULL;
            frame->ref[1] = NULL;
        }
        PREALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
        PREALLOC( frame->f_row_qp, i_lines/16 * sizeof(float) );
        PREALLOC( frame->f_row_qscale, i_lines/16 * sizeof(float) );
        if( h->param.analyse.i_me_method >= X264_ME_ESA )
            PREALLOC( frame->buffer[3], frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
        if( PARAM_INTERLACED )
            PREALLOC( frame->field, i_mb_count * sizeof(uint8_t) );
        if( h->param.analyse.b_mb_info )
            PREALLOC( frame->effective_qp, i_mb_count * sizeof(uint8_t) );
    }
    else /* fenc frame */
    {
        if( h->frames.b_have_lowres )
        {
            int64_t luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
            PREALLOC( frame->buffer_lowres, 4 * luma_plane_size * SIZEOF_PIXEL );
            for( int j = 0; j <= !!h->param.i_bframe; j++ )
                for( int i = 0; i <= h->param.i_bframe; i++ )
                {
                    PREALLOC( frame->lowres_mvs[j][i], 2*i_mb_count*sizeof(int16_t) );
                    PREALLOC( frame->lowres_mv_costs[j][i], i_mb_count*sizeof(int) );
                }
            PREALLOC( frame->i_propagate_cost, i_mb_count * sizeof(uint16_t) );
            for( int j = 0; j <= h->param.i_bframe+1; j++ )
                for( int i = 0; i <= h->param.i_bframe+1; i++ )
                    PREALLOC( frame->lowres_costs[j][i], i_mb_count * sizeof(uint16_t) );
        }
        if( h->param.rc.i_aq_mode )
        {
            PREALLOC( frame->f_qp_offset, i_mb_count * sizeof(float) );
            PREALLOC( frame->f_qp_offset_aq, i_mb_count * sizeof(float) );
            if( h->frames.b_have_lowres )
                PREALLOC( frame->i_inv_qscale_factor, i_mb_count * sizeof(uint16_t) );
        }
        /* mbtree asm can overread the input buffers, make sure we don't read outside of allocated memory. */
        if( h->frames.b_have_lowres )
            prealloc_size += NATIVE_ALIGN;
    }
    PREALLOC_END( frame->base );
    /* From here on the reserved buffers are real pointers; derive the plane
     * pointers, which start past the vertical and horizontal padding. */
    if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
    {
        int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
        frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH_ALIGN;
        if( PARAM_INTERLACED )
            frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH_ALIGN;
    }
    for( int p = 0; p < luma_plane_count; p++ )
    {
        int64_t luma_plane_size = align_plane_size( frame->i_stride[p] * (frame->i_lines[p] + 2*i_padv), disalign );
        if( h->param.analyse.i_subpel_refine && b_fdec )
        {
            /* Four consecutive planes share one buffer; plane 0 doubles as
             * the main plane pointer. */
            for( int i = 0; i < 4; i++ )
            {
                frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH_ALIGN;
                if( PARAM_INTERLACED )
                    frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH_ALIGN;
            }
            frame->plane[p] = frame->filtered[p][0];
            frame->plane_fld[p] = frame->filtered_fld[p][0];
        }
        else
        {
            frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH_ALIGN;
            if( PARAM_INTERLACED )
                frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH_ALIGN;
        }
    }
    if( b_fdec )
    {
        /* Zero the extra leading mv16x16 entry, then advance the pointer past
         * it so that index -1 is valid. */
        M32( frame->mv16x16[0] ) = 0;
        frame->mv16x16++;
        if( h->param.analyse.i_me_method >= X264_ME_ESA )
            frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH_ALIGN;
    }
    else
    {
        if( h->frames.b_have_lowres )
        {
            int64_t luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
            for( int i = 0; i < 4; i++ )
                frame->lowres[i] = frame->buffer_lowres + frame->i_stride_lowres * PADV + PADH_ALIGN + i * luma_plane_size;
            for( int j = 0; j <= !!h->param.i_bframe; j++ )
                for( int i = 0; i <= h->param.i_bframe; i++ )
                    memset( frame->lowres_mvs[j][i], 0, 2*i_mb_count*sizeof(int16_t) );
            frame->i_intra_cost = frame->lowres_costs[0][0];
            memset( frame->i_intra_cost, -1, i_mb_count * sizeof(uint16_t) );
            if( h->param.rc.i_aq_mode )
                /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
                memset( frame->i_inv_qscale_factor, 0, i_mb_count * sizeof(uint16_t) );
        }
    }
    if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
        goto fail;
    if( x264_pthread_cond_init( &frame->cv, NULL ) )
    {
        /* Do not leave an initialised mutex behind in a frame that is about
         * to be freed. */
        x264_pthread_mutex_destroy( &frame->mutex );
        goto fail;
    }
 #if HAVE_OPENCL
    frame->opencl.ocl = h->opencl.ocl;
 #endif
    return frame;
 fail:
    /* frame is NULL if its own allocation failed. Otherwise frame->base is
     * non-NULL only once PREALLOC_END has succeeded (the struct is presumably
     * zero-filled by CHECKED_MALLOCZERO), so release it here to avoid leaking
     * the buffer block when a later initialisation step fails. */
    if( frame )
        x264_free( frame->base );
    x264_free( frame );
    return NULL;
 }
 /* Releases a frame together with every resource it owns. */
 void x264_frame_delete( x264_frame_t *frame )
 {
    /* Duplicate frames are blank copies of real frames (including pointers),
     * so freeing those pointers would cause a double free later: only the
     * frame structure itself is released for them. */
    if( frame->b_duplicate )
    {
        x264_free( frame );
        return;
    }

    x264_free( frame->base );

    /* Per-frame parameter set, released through its own free callback. */
    if( frame->param && frame->param->param_free )
    {
        x264_param_cleanup( frame->param );
        frame->param->param_free( frame->param );
    }

    if( frame->mb_info_free )
        frame->mb_info_free( frame->mb_info );

    /* Extra SEI: each payload first, then the payload array. */
    if( frame->extra_sei.sei_free )
    {
        for( int n = 0; n < frame->extra_sei.num_payloads; n++ )
            frame->extra_sei.sei_free( frame->extra_sei.payloads[n].payload );
        frame->extra_sei.sei_free( frame->extra_sei.payloads );
    }

    x264_pthread_mutex_destroy( &frame->mutex );
    x264_pthread_cond_destroy( &frame->cv );
 #if HAVE_OPENCL
    x264_opencl_frame_delete( frame );
 #endif

    x264_free( frame );
 }
 /* Fetches the start pointer and stride of one input plane of src into *pix
  * and *stride. With X264_CSP_VFLIP set, the pointer is moved to the last row
  * of the (shifted) picture height and the stride is negated.
  * Returns 0 on success, or -1 after logging an error when the (shifted)
  * picture width exceeds the absolute stride. */
 static int get_plane_ptr( x264_t *h, x264_picture_t *src, uint8_t **pix, int *stride, int plane, int xshift, int yshift )
 {
    int plane_width  = h->param.i_width >> xshift;
    int plane_height = h->param.i_height >> yshift;
    uint8_t *start = src->img.plane[plane];
    int step = src->img.i_stride[plane];

    if( src->img.i_csp & X264_CSP_VFLIP )
    {
        start += (plane_height-1) * step;
        step = -step;
    }

    /* The outputs are stored even when the check below fails. */
    *pix = start;
    *stride = step;

    if( plane_width > abs(step) )
    {
        x264_log( h, X264_LOG_ERROR, "Input picture width (%d) is greater than stride (%d)\n", plane_width, step );
        return -1;
    }
    return 0;
 }
 /* From here on get_plane_ptr(...) expands to a call that makes the enclosing
  * function return -1 when the lookup fails. */
 #define get_plane_ptr(...) do { if( get_plane_ptr(__VA_ARGS__) < 0 ) return -1; } while( 0 )
 /* Copies the user-supplied picture src into the internal frame dst.
  *
  * The picture metadata (forced type, qp, pts, param, pic_struct, extra SEI,
  * opaque pointer, mb_info) is copied first, then the pixel data is converted
  * into dst's planes by the h->mc.plane_copy* routine matching the input
  * colorspace.
  *
  * Returns 0 on success, or -1 after logging an error when the colorspace does
  * not map to dst->i_csp, the bit depth of the input does not match this
  * build, v210 input is used with BIT_DEPTH != 10, or get_plane_ptr() rejects
  * a plane. Note that get_plane_ptr is a macro here and returns -1 from this
  * function on failure. */
 int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
 {
    int i_csp = src->img.i_csp & X264_CSP_MASK;
    if( dst->i_csp != frame_internal_csp( i_csp ) )
    {
        x264_log( h, X264_LOG_ERROR, "Invalid input colorspace\n" );
        return -1;
    }
 #if HIGH_BIT_DEPTH
    if( !(src->img.i_csp & X264_CSP_HIGH_DEPTH) )
    {
        x264_log( h, X264_LOG_ERROR, "This build of x264 requires high depth input. Rebuild to support 8-bit input.\n" );
        return -1;
    }
 #else
    if( src->img.i_csp & X264_CSP_HIGH_DEPTH )
    {
        x264_log( h, X264_LOG_ERROR, "This build of x264 requires 8-bit input. Rebuild to support high depth input.\n" );
        return -1;
    }
 #endif
    if( BIT_DEPTH != 10 && i_csp == X264_CSP_V210 )
    {
        x264_log( h, X264_LOG_ERROR, "v210 input is only compatible with bit-depth of 10 bits\n" );
        return -1;
    }
    /* An out-of-range forced frame type is only warned about and replaced by
     * X264_TYPE_AUTO. */
    if( src->i_type < X264_TYPE_AUTO || src->i_type > X264_TYPE_KEYFRAME )
    {
        x264_log( h, X264_LOG_WARNING, "forced frame type (%d) at %d is unknown\n", src->i_type, h->frames.i_input );
        dst->i_forced_type = X264_TYPE_AUTO;
    }
    else
        dst->i_forced_type = src->i_type;
    dst->i_type     = dst->i_forced_type;
    dst->i_qpplus1  = src->i_qpplus1;
    dst->i_pts      = dst->i_reordered_pts = src->i_pts;
    dst->param      = src->param;
    dst->i_pic_struct = src->i_pic_struct;
    dst->extra_sei  = src->extra_sei;
    dst->opaque     = src->opaque;
    /* mb_info and its free callback are taken over only when b_mb_info is enabled. */
    dst->mb_info    = h->param.analyse.b_mb_info ? src->prop.mb_info : NULL;
    dst->mb_info_free = h->param.analyse.b_mb_info ? src->prop.mb_info_free : NULL;
    uint8_t *pix[3];
    int stride[3];
    if( i_csp == X264_CSP_YUYV || i_csp == X264_CSP_UYVY )
    {
        /* p selects which destination plane is passed first: plane 1 for
         * UYVY, plane 0 for YUYV. */
        int p = i_csp == X264_CSP_UYVY;
        h->mc.plane_copy_deinterleave_yuyv( dst->plane[p], dst->i_stride[p], dst->plane[p^1], dst->i_stride[p^1],
                                            (pixel*)src->img.plane[0], src->img.i_stride[0]/SIZEOF_PIXEL, h->param.i_width, h->param.i_height );
    }
    else if( i_csp == X264_CSP_V210 )
    {
         stride[0] = src->img.i_stride[0];
         pix[0] = src->img.plane[0];
         h->mc.plane_copy_deinterleave_v210( dst->plane[0], dst->i_stride[0],
                                             dst->plane[1], dst->i_stride[1],
                                             (uint32_t *)pix[0], stride[0]/(int)sizeof(uint32_t), h->param.i_width, h->param.i_height );
    }
    else if( i_csp >= X264_CSP_BGR )
    {
         stride[0] = src->img.i_stride[0];
         pix[0] = src->img.plane[0];
         /* Vertical flip: start at the last row and walk backwards. */
         if( src->img.i_csp & X264_CSP_VFLIP )
         {
             pix[0] += (h->param.i_height-1) * stride[0];
             stride[0] = -stride[0];
         }
         /* b swaps destination planes 1 and 2 for RGB relative to BGR/BGRA;
          * the pixel step is 4 for BGRA and 3 otherwise. */
         int b = i_csp==X264_CSP_RGB;
         h->mc.plane_copy_deinterleave_rgb( dst->plane[1+b], dst->i_stride[1+b],
                                            dst->plane[0], dst->i_stride[0],
                                            dst->plane[2-b], dst->i_stride[2-b],
                                            (pixel*)pix[0], stride[0]/SIZEOF_PIXEL, i_csp==X264_CSP_BGRA ? 4 : 3, h->param.i_width, h->param.i_height );
    }
    else
    {
        /* Planar and semi-planar inputs: plane 0 is copied as is, the
         * remaining planes depend on the layout. */
        int v_shift = CHROMA_V_SHIFT;
        get_plane_ptr( h, src, &pix[0], &stride[0], 0, 0, 0 );
        h->mc.plane_copy( dst->plane[0], dst->i_stride[0], (pixel*)pix[0],
                          stride[0]/SIZEOF_PIXEL, h->param.i_width, h->param.i_height );
        if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
        {
            get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift );
            h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
                              stride[1]/SIZEOF_PIXEL, h->param.i_width, h->param.i_height>>v_shift );
        }
        else if( i_csp == X264_CSP_NV21 )
        {
            get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift );
            h->mc.plane_copy_swap( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
                                   stride[1]/SIZEOF_PIXEL, h->param.i_width>>1, h->param.i_height>>v_shift );
        }
        else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_I422 || i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16 )
        {
            /* YV12/YV16 take input plane 2 first and plane 1 second. */
            int uv_swap = i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16;
            get_plane_ptr( h, src, &pix[1], &stride[1], uv_swap ? 2 : 1, 1, v_shift );
            get_plane_ptr( h, src, &pix[2], &stride[2], uv_swap ? 1 : 2, 1, v_shift );
            h->mc.plane_copy_interleave( dst->plane[1], dst->i_stride[1],
                                         (pixel*)pix[1], stride[1]/SIZEOF_PIXEL,
                                         (pixel*)pix[2], stride[2]/SIZEOF_PIXEL,
                                         h->param.i_width>>1, h->param.i_height>>v_shift );
        }
        else if( i_csp == X264_CSP_I444 || i_csp == X264_CSP_YV24 )
        {
            /* YV24 maps input planes 2 and 1 to destination planes 1 and 2. */
            get_plane_ptr( h, src, &pix[1], &stride[1], i_csp==X264_CSP_I444 ? 1 : 2, 0, 0 );
            get_plane_ptr( h, src, &pix[2], &stride[2], i_csp==X264_CSP_I444 ? 2 : 1, 0, 0 );
            h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
                              stride[1]/SIZEOF_PIXEL, h->param.i_width, h->param.i_height );
            h->mc.plane_copy( dst->plane[2], dst->i_stride[2], (pixel*)pix[2],
                              stride[2]/SIZEOF_PIXEL, h->param.i_width, h->param.i_height );
        }
    }
    return 0;
 }
 /* Fills `len` elements of `size` bytes each, starting at dst, with the
  * element read from src.
  *
  * The element is widened into 16-, 32- and (when WORD_SIZE == 8) 64-bit
  * patterns so that most of the fill is done with wide stores; narrower
  * stores handle the head and the tail. */
 static ALWAYS_INLINE void pixel_memset( pixel *dst, pixel *src, int len, int size )
 {
    uint8_t *dstp = (uint8_t*)dst;
    /* v1: one pixel; v2: 16-bit pattern; v4: 32-bit pattern. Patterns wider
     * than `size` are built by replication, the others are read from src. */
    uint32_t v1 = *src;
    uint32_t v2 = size == 1 ? v1 + (v1 <<  8) : M16( src );
    uint32_t v4 = size <= 2 ? v2 + (v2 << 16) : M32( src );
    int i = 0;
    /* From here on len and i count bytes. */
    len *= size;
    /* Align the input pointer if it isn't already */
    /* NOTE(review): the &2 and &4 tests below look at dstp itself rather than
     * dstp+i, and none of the head stores is bounded by len - confirm this is
     * intended for odd addresses and very short fills. */
    if( (intptr_t)dstp & (WORD_SIZE - 1) )
    {
        if( size <= 2 && ((intptr_t)dstp & 3) )
        {
            if( size == 1 && ((intptr_t)dstp & 1) )
                dstp[i++] = v1;
            if( (intptr_t)dstp & 2 )
            {
                M16( dstp+i ) = v2;
                i += 2;
            }
        }
        if( WORD_SIZE == 8 && (intptr_t)dstp & 4 )
        {
            M32( dstp+i ) = v4;
            i += 4;
        }
    }
    /* Main copy loop */
    if( WORD_SIZE == 8 )
    {
        uint64_t v8 = v4 + ((uint64_t)v4<<32);
        for( ; i < len - 7; i+=8 )
            M64( dstp+i ) = v8;
    }
    /* Remaining 32-bit chunks (the whole body when WORD_SIZE != 8). */
    for( ; i < len - 3; i+=4 )
        M32( dstp+i ) = v4;
    /* Finish up the last few bytes */
    if( size <= 2 )
    {
        if( i < len - 1 )
        {
            M16( dstp+i ) = v2;
            i += 2;
        }
        if( size == 1 && i != len )
            dstp[i] = v1;
    }
 }
 /* Replicates the edge pixels of a plane into its surrounding padding.
  *
  * pix points at the top-left pixel of an i_width x i_height area with row
  * pitch i_stride. Every row gets i_padh pixels of padding on both sides; with
  * b_chroma set the replicated unit is a pair of pixels instead of a single
  * one. When b_pad_top / b_pad_bottom are set, the first / last padded row is
  * additionally copied into i_padv rows above / below the area. */
 static ALWAYS_INLINE void plane_expand_border( pixel *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom, int b_chroma )
 {
    const int unit_count = i_padh>>b_chroma;
    const int unit_size = SIZEOF_PIXEL<<b_chroma;
    const size_t padded_row_bytes = (i_width+2*i_padh) * SIZEOF_PIXEL;

    for( int row = 0; row < i_height; row++ )
    {
        pixel *line = pix + row*i_stride;
        /* left band */
        pixel_memset( line - i_padh, line, unit_count, unit_size );
        /* right band */
        pixel_memset( line + i_width, line + i_width-1-b_chroma, unit_count, unit_size );
    }

    /* upper band: copies of the first padded row */
    if( b_pad_top )
    {
        pixel *first = pix - i_padh;
        for( int n = 1; n <= i_padv; n++ )
            memcpy( first - n*i_stride, first, padded_row_bytes );
    }

    /* lower band: copies of the last padded row */
    if( b_pad_bottom )
    {
        pixel *last = pix - i_padh + (i_height-1)*i_stride;
        for( int n = 1; n <= i_padv; n++ )
            memcpy( last + n*i_stride, last, padded_row_bytes );
    }
 }
 /* Extends the left/right (and, for the first and last rows, top/bottom)
  * borders of every plane of frame for the macroblock row mb_y.
  *
  * Does nothing when `mb_y & SLICE_MBAFF` is non-zero. With SLICE_MBAFF set,
  * the two fields in plane_fld are extended separately before the frame plane
  * itself. */
 void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y )
 {
    /* Top padding is added on the first row, bottom padding on the last one;
     * b_start/b_end flag the first/last row handled by this thread slice. */
    int pad_top = mb_y == 0;
    int pad_bot = mb_y == h->mb.i_mb_height - (1 << SLICE_MBAFF);
    int b_start = mb_y == h->i_threadslice_start;
    int b_end   = mb_y == h->i_threadslice_end - (1 << SLICE_MBAFF);
    if( mb_y & SLICE_MBAFF )
        return;
    for( int i = 0; i < frame->i_plane; i++ )
    {
        /* The chroma shifts only apply to planes other than plane 0. */
        int h_shift = i && CHROMA_H_SHIFT;
        int v_shift = i && CHROMA_V_SHIFT;
        int stride = frame->i_stride[i];
        int width = 16*h->mb.i_mb_width;
        int height = (pad_bot ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> v_shift;
        int padh = PADH;
        int padv = PADV >> v_shift;
        // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
        if( b_end && !b_start )
            height += 4 >> (v_shift + SLICE_MBAFF);
        pixel *pix;
        /* Unless this is the slice's first row, start 4 luma rows higher. */
        int starty = 16*mb_y - 4*!b_start;
        if( SLICE_MBAFF )
        {
            // border samples for each field are extended separately
            pix = frame->plane_fld[i] + (starty*stride >> v_shift);
            plane_expand_border( pix, stride*2, width, height, padh, padv, pad_top, pad_bot, h_shift );
            plane_expand_border( pix+stride, stride*2, width, height, padh, padv, pad_top, pad_bot, h_shift );
            /* Recompute the height for the frame plane: 32 rows unless this
             * is the last row. */
            height = (pad_bot ? 16*(h->mb.i_mb_height - mb_y) : 32) >> v_shift;
            if( b_end && !b_start )
                height += 4 >> v_shift;
            pix = frame->plane[i] + (starty*stride >> v_shift);
            plane_expand_border( pix, stride, width, height, padh, padv, pad_top, pad_bot, h_shift );
        }
        else
        {
            pix = frame->plane[i] + (starty*stride >> v_shift);
            plane_expand_border( pix, stride, width, height, padh, padv, pad_top, pad_bot, h_shift );
        }
    }
 }
 /* Extends the borders of the filtered planes 1..3 of frame for the
  * macroblock row mb_y. Plane 0 of each set is not touched here. All three
  * colour planes are processed when CHROMA444 is set, otherwise only plane 0.
  * b_end marks the last row and enables the bottom padding; the top padding
  * is enabled when mb_y is 0. */
 void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
 {
    /* During filtering, 8 extra pixels were filtered on each edge, but up to
     * 3 of the horizontal ones may be wrong, so the border is expanded from
     * the last correctly filtered pixel. */
    int b_start = !mb_y;
    int width = 16*h->mb.i_mb_width + 8;
    int height = b_end ? (16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF) + 16 : 16;
    int padh = PADH - 4;
    int padv = PADV - 8;
    for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
        for( int i = 1; i < 4; i++ )
        {
            int stride = frame->i_stride[p];
            // buffer: 8 luma, to match the hpel filter
            pixel *pix;
            if( SLICE_MBAFF )
            {
                /* Each field is extended separately, using a doubled stride. */
                pix = frame->filtered_fld[p][i] + (16*mb_y - 16) * stride - 4;
                plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, 0 );
                plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, 0 );
            }
            pix = frame->filtered[p][i] + (16*mb_y - 8) * stride - 4;
            plane_expand_border( pix, stride, width, height << SLICE_MBAFF, padh, padv, b_start, b_end, 0 );
        }
 }
 /* Extends all four borders of each of the four lowres planes of frame by
  * PADH columns and PADV rows. */
 void x264_frame_expand_border_lowres( x264_frame_t *frame )
 {
    const int stride = frame->i_stride_lowres;
    const int width  = frame->i_width_lowres;
    const int lines  = frame->i_lines_lowres;
    for( int plane = 0; plane < 4; plane++ )
        plane_expand_border( frame->lowres[plane], stride, width, lines, PADH, PADV, 1, 1, 0 );
 }
 /* Extends all four borders of one whole plane of frame, with the plane
  * height and the vertical padding scaled down by CHROMA_V_SHIFT. */
 void x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane )
 {
    int v_shift = CHROMA_V_SHIFT;
    int width   = 16*h->mb.i_mb_width;
    int height  = 16*h->mb.i_mb_height>>v_shift;
    int padv    = PADV>>v_shift;
    plane_expand_border( frame->plane[plane], frame->i_stride[plane], width, height,
                         PADH, padv, 1, 1, CHROMA_H_SHIFT );
 }
 /* Pads every plane of frame from the configured picture size
  * (h->param.i_width x h->param.i_height) up to the macroblock-aligned size
  * (16*i_mb_width x 16*i_mb_height) by replicating the last pixels of each
  * row and then the last row(s) of the picture. */
 void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
 {
    const int pic_width = h->param.i_width;
    const int pad_x = (h->mb.i_mb_width * 16 - h->param.i_width);

    for( int p = 0; p < frame->i_plane; p++ )
    {
        /* The chroma shifts only apply to planes other than plane 0. */
        const int h_shift = p && CHROMA_H_SHIFT;
        const int v_shift = p && CHROMA_V_SHIFT;
        const int rows  = h->param.i_height >> v_shift;
        const int pad_y = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;

        /* Right side: repeat the last pixel (or pixel pair when h_shift is
         * set) of every picture row. */
        if( pad_x )
        {
            for( int y = 0; y < rows; y++ )
                pixel_memset( &frame->plane[p][y*frame->i_stride[p] + pic_width],
                              &frame->plane[p][y*frame->i_stride[p] + pic_width - 1-h_shift],
                              pad_x>>h_shift, SIZEOF_PIXEL<<h_shift );
        }

        /* Bottom: copy a picture row into each missing row. The source is
         * the last row, or the one before it when PARAM_INTERLACED is set and
         * the destination row index is even. */
        if( pad_y )
        {
            for( int y = rows; y < rows + pad_y; y++ )
            {
                int src_row = rows-(~y&PARAM_INTERLACED)-1;
                memcpy( &frame->plane[p][y*frame->i_stride[p]],
                        &frame->plane[p][src_row*frame->i_stride[p]],
                        (pic_width + pad_x) * SIZEOF_PIXEL );
            }
        }
    }
 }
 /* For the 16-pixel-wide column starting at macroblock column mb_x, copies
  * the last picture row of every h->fenc plane into the rows between the
  * configured picture height and the macroblock-aligned height.
  * mb_y is not used. */
 void x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y )
 {
    for( int p = 0; p < h->fenc->i_plane; p++ )
    {
        int v_shift = p && CHROMA_V_SHIFT;
        int stride = h->fenc->i_stride[p];
        int rows = h->param.i_height >> v_shift;
        int missing = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
        pixel *column = h->fenc->plane[p] + 16*mb_x;
        pixel *last_row = column + (rows-1)*stride;
        for( int n = 0; n < missing; n++ )
            memcpy( column + (rows+n)*stride, last_row, 16*SIZEOF_PIXEL );
    }
 }
 /* threading */
 /* Stores i_lines_completed in frame->i_lines_completed and broadcasts on
  * frame->cv; both steps are done while holding frame->mutex. */
 void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
 {
    x264_pthread_mutex_lock( &frame->mutex );
    frame->i_lines_completed = i_lines_completed;
    x264_pthread_cond_broadcast( &frame->cv );
    x264_pthread_mutex_unlock( &frame->mutex );
 }
 /* Blocks on frame->cv until frame->i_lines_completed has reached
  * i_lines_completed. A negative i_lines_completed never waits.
  * Returns the value of frame->i_lines_completed last read under the lock. */
 int x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
 {
    int completed;
    x264_pthread_mutex_lock( &frame->mutex );
    for( ;; )
    {
        completed = frame->i_lines_completed;
        if( completed >= i_lines_completed || i_lines_completed < 0 )
            break;
        x264_pthread_cond_wait( &frame->cv, &frame->mutex );
    }
    x264_pthread_mutex_unlock( &frame->mutex );
    return completed;
 }
 /* Stores pass in h->i_threadslice_pass while holding h->mutex, and
  * broadcasts on h->cv only when pass is greater than 0. */
 void x264_threadslice_cond_broadcast( x264_t *h, int pass )
 {
    x264_pthread_mutex_lock( &h->mutex );
    h->i_threadslice_pass = pass;
    if( pass > 0 )
        x264_pthread_cond_broadcast( &h->cv );
    x264_pthread_mutex_unlock( &h->mutex );
 }
 /* Blocks on h->cv until h->i_threadslice_pass is at least pass. */
 void x264_threadslice_cond_wait( x264_t *h, int pass )
 {
    x264_pthread_mutex_lock( &h->mutex );
    for( ;; )
    {
        if( h->i_threadslice_pass >= pass )
            break;
        x264_pthread_cond_wait( &h->cv, &h->mutex );
    }
    x264_pthread_mutex_unlock( &h->mutex );
 }
 /* Accounts for one more slice in frame when a slice limit is configured.
  * Returns 0 when there is no limit (i_slice_count_max == 0) or the count
  * read before the increment is still below it, and -1 otherwise. With sliced
  * threads the counter is bumped through x264_pthread_fetch_and_add. */
 int x264_frame_new_slice( x264_t *h, x264_frame_t *frame )
 {
    if( !h->param.i_slice_count_max )
        return 0;

    int previous_count;
    if( !h->param.b_sliced_threads )
        previous_count = frame->i_slice_count++;
    else
        previous_count = x264_pthread_fetch_and_add( &frame->i_slice_count, 1, &frame->mutex );

    return previous_count >= h->param.i_slice_count_max ? -1 : 0;
 }
 /* list operators */
 /* Append `frame` to a NULL-terminated list by overwriting the first NULL
 * slot. The slot that follows is not touched, so the caller's array must
 * already hold a NULL there. */
 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
 {
    x264_frame_t **slot = list;
    while( *slot )
        slot++;
    *slot = frame;
 }
 /* Remove and return the last entry of a NULL-terminated list.
 * The list must be non-empty (asserted). */
 x264_frame_t *x264_frame_pop( x264_frame_t **list )
 {
    x264_frame_t **last = list;
    x264_frame_t *popped;
    assert( list[0] );
    /* Advance until the entry after `last` is the terminator. */
    while( last[1] )
        last++;
    popped = *last;
    *last = NULL;
    return popped;
 }
 /* Insert `frame` at the front of a NULL-terminated list, moving every
 * existing entry one slot towards the end. */
 void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame )
 {
    int count;
    for( count = 0; list[count]; count++ )
        ;
    /* Shift from the tail so that no entry is overwritten before it is moved. */
    for( int k = count; k > 0; k-- )
        list[k] = list[k-1];
    list[0] = frame;
 }
 /* Remove and return the first entry of a NULL-terminated list, moving the
 * remaining entries (and the terminator) one slot towards the front.
 * The list is expected to be non-empty (asserted after the shift). */
 x264_frame_t *x264_frame_shift( x264_frame_t **list )
 {
    x264_frame_t *first = list[0];
    x264_frame_t **cur = list;
    while( *cur )
    {
        cur[0] = cur[1];
        cur++;
    }
    assert(first);
    return first;
 }
 /* Drop one reference to `frame`. When the count reaches zero the frame is
 * appended to h->frames.unused[], indexed by the frame's own b_fdec flag. */
 void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
 {
    assert( frame->i_reference_count > 0 );
    if( --frame->i_reference_count != 0 )
        return;
    x264_frame_push( h->frames.unused[frame->b_fdec], frame );
 }
 /* Hand out a frame of the requested kind: reuse the last entry of
 * h->frames.unused[b_fdec] when that list is non-empty, otherwise allocate
 * one with frame_new(). Per-use state is reset before the frame is returned.
 * Returns NULL if no frame could be obtained. */
 x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
 {
    x264_frame_t **pool = h->frames.unused[b_fdec];
    x264_frame_t *frame = pool[0] ? x264_frame_pop( pool ) : frame_new( h, b_fdec );
    if( !frame )
        return NULL;

    /* The caller holds the only reference. */
    frame->i_reference_count = 1;
    frame->i_slice_count = h->param.b_sliced_threads ? h->param.i_threads : 1;

    frame->b_last_minigop_bframe = 0;
    frame->b_intra_calculated = 0;
    frame->b_scenecut = 1;
    frame->b_keyframe = 0;
    frame->b_corrupt = 0;

    memset( frame->weight, 0, sizeof(frame->weight) );
    memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
    return frame;
 }
 /* Drop one reference to a blank frame. When the count reaches zero the
 * frame is appended to h->frames.blank_unused. */
 void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
 {
    assert( frame->i_reference_count > 0 );
    if( --frame->i_reference_count != 0 )
        return;
    x264_frame_push( h->frames.blank_unused, frame );
 }
 /* Hand out a blank frame: reuse the last entry of h->frames.blank_unused
 * when available, otherwise allocate a bare x264_frame_t with x264_malloc().
 * Only b_duplicate and i_reference_count are set here; a freshly allocated
 * frame is otherwise left uninitialised. Returns NULL on allocation failure. */
 x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
 {
    x264_frame_t *blank;
    if( !h->frames.blank_unused[0] )
        blank = x264_malloc( sizeof(x264_frame_t) );
    else
        blank = x264_frame_pop( h->frames.blank_unused );
    if( !blank )
        return NULL;
    blank->i_reference_count = 1;
    blank->b_duplicate = 1;
    return blank;
 }
 /* Apply the weight `w` to a whole plane, writing the result to dst.
 * The plane is processed in horizontal strips of up to 16 rows; within a
 * strip, 16-wide calls cover the row and one 8-wide call handles whatever
 * remains on the right. `h` is not used. */
 void x264_weight_scale_plane( x264_t *h, pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
                              int i_width, int i_height, x264_weight_t *w )
 {
    /* Weight horizontal strips of height 16. This was found to be the optimal height
     * in terms of the cache loads. */
    for( ; i_height > 0; i_height -= 16, dst += 16 * i_dst_stride, src += 16 * i_src_stride )
    {
        /* The final strip may be shorter than 16 rows. */
        int strip_rows = X264_MIN( i_height, 16 );
        int x = 0;
        while( x < i_width-8 )
        {
            w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, strip_rows );
            x += 16;
        }
        if( x < i_width )
            w->weightfn[ 8>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, strip_rows );
    }
 }
 /* Delete every frame in a NULL-terminated list and then free the list
 * array itself. A NULL list is accepted and ignored. */
 void x264_frame_delete_list( x264_frame_t **list )
 {
    if( list )
    {
        for( x264_frame_t **cur = list; *cur; cur++ )
            x264_frame_delete( *cur );
        x264_free( list );
    }
 }
 /* Initialise a synchronized frame list that can hold up to max_size frames.
 *
 * The backing array is zero-filled and has max_size+1 slots, so it stays
 * NULL-terminated even when full. Returns 0 on success and -1 on failure
 * (negative max_size, allocation failure, or failure to create the mutex or
 * either condition variable).
 *
 * On failure everything acquired so far is released again: primitives that
 * were already initialised are destroyed, the array is freed, and
 * slist->list is reset to NULL so that a later x264_frame_delete_list() on
 * it is harmless. */
 int x264_sync_frame_list_init( x264_sync_frame_list_t *slist, int max_size )
 {
    if( max_size < 0 )
        return -1;
    slist->i_max_size = max_size;
    slist->i_size = 0;
    CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );
    if( x264_pthread_mutex_init( &slist->mutex, NULL ) )
        goto fail_list;
    if( x264_pthread_cond_init( &slist->cv_fill, NULL ) )
        goto fail_mutex;
    if( x264_pthread_cond_init( &slist->cv_empty, NULL ) )
        goto fail_cv_fill;
    return 0;
    /* Unwind in reverse order of acquisition. */
 fail_cv_fill:
    x264_pthread_cond_destroy( &slist->cv_fill );
 fail_mutex:
    x264_pthread_mutex_destroy( &slist->mutex );
 fail_list:
    x264_free( slist->list );
    slist->list = NULL;
 fail:
    return -1;
 }
 /* Tear down a synchronized frame list: destroy its mutex and both
 * condition variables, then delete every frame still stored in the list
 * together with the list array itself. */
 void x264_sync_frame_list_delete( x264_sync_frame_list_t *slist )
 {
    x264_pthread_mutex_destroy( &slist->mutex );
    x264_pthread_cond_destroy( &slist->cv_fill );
    x264_pthread_cond_destroy( &slist->cv_empty );
    x264_frame_delete_list( slist->list );
 }
 /* Append `frame` to the synchronized list, blocking on cv_empty while the
 * list already holds i_max_size frames. Waiters on cv_fill are woken after
 * the mutex has been released. */
 void x264_sync_frame_list_push( x264_sync_frame_list_t *slist, x264_frame_t *frame )
 {
    x264_pthread_mutex_lock( &slist->mutex );
    while( slist->i_size == slist->i_max_size )
        x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
    slist->list[ slist->i_size++ ] = frame;
    x264_pthread_mutex_unlock( &slist->mutex );
    x264_pthread_cond_broadcast( &slist->cv_fill );
 }
 /* Remove and return the most recently stored frame (the entry at index
 * i_size-1), blocking on cv_fill while the list is empty. The vacated slot
 * is reset to NULL and waiters on cv_empty are woken before the mutex is
 * released. */
 x264_frame_t *x264_sync_frame_list_pop( x264_sync_frame_list_t *slist )
 {
    x264_frame_t *frame;
    x264_pthread_mutex_lock( &slist->mutex );
    while( !slist->i_size )
        x264_pthread_cond_wait( &slist->cv_fill, &slist->mutex );
    frame = slist->list[ --slist->i_size ];
    slist->list[ slist->i_size ] = NULL;
    x264_pthread_cond_broadcast( &slist->cv_empty );
    x264_pthread_mutex_unlock( &slist->mutex );
    return frame;
 }
--- a/common/frame.h
+++ b/common/frame.h
@@ -0,0 +1,297 @@
 /*****************************************************************************
 * frame.h: frame handling
 *****************************************************************************
 * Copyright (C) 2003-2025 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_FRAME_H
 #define X264_FRAME_H
 /* number of pixels past the edge of the frame, for motion estimation/compensation */
 #define PADH 32
 #define PADV 32
 /* the larger of PADH and NATIVE_ALIGN / SIZEOF_PIXEL */
 #define PADH_ALIGN X264_MAX( PADH, NATIVE_ALIGN / SIZEOF_PIXEL )
 /* aligned horizontal padding plus the plain horizontal padding */
 #define PADH2 (PADH_ALIGN + PADH)
 /* Per-frame state: timing and type information, the pixel planes and their
 * filtered/lowres companions, motion and lookahead analysis data, rate
 * control bookkeeping, and the synchronisation objects used by the
 * threading helpers. */
 typedef struct x264_frame
 {
    /* */
    uint8_t *base;       /* Base pointer for all malloced data in this frame. */
    int     i_poc;
    int     i_delta_poc[2];
    int     i_type;
    int     i_forced_type;
    int     i_qpplus1;
    int64_t i_pts;
    int64_t i_dts;
    int64_t i_reordered_pts;
    int64_t i_duration;  /* in SPS time_scale units (i.e 2 * timebase units) used for vfr */
    float   f_duration;  /* in seconds */
    int64_t i_cpb_duration;
    int64_t i_cpb_delay; /* in SPS time_scale units (i.e 2 * timebase units) */
    int64_t i_dpb_output_delay;
    x264_param_t *param;
    int     i_frame;     /* Presentation frame number */
    int     i_coded;     /* Coded frame number */
    int64_t i_field_cnt; /* Presentation field count */
    int     i_frame_num; /* 7.4.3 frame_num */
    int     b_kept_as_ref;
    int     i_pic_struct;
    int     b_keyframe;
    uint8_t b_fdec;
    uint8_t b_last_minigop_bframe; /* this frame is the last b in a sequence of bframes */
    uint8_t i_bframes;   /* number of bframes following this nonb in coded order */
    float   f_qp_avg_rc; /* QPs as decided by ratecontrol */
    float   f_qp_avg_aq; /* QPs as decided by AQ in addition to ratecontrol */
    float   f_crf_avg;   /* Average effective CRF for this frame */
    int     i_poc_l0ref0; /* poc of first refframe in L0, used to check if direct temporal is possible */
    /* YUV buffer */
    int     i_csp; /* Internal csp */
    int     i_plane;
    int     i_stride[3];
    int     i_width[3];
    int     i_lines[3];
    int     i_stride_lowres;
    int     i_width_lowres;
    int     i_lines_lowres;
    pixel *plane[3];
    pixel *plane_fld[3];
    pixel *filtered[3][4]; /* plane[0], H, V, HV */
    pixel *filtered_fld[3][4];
    pixel *lowres[4]; /* half-size copy of input frame: Orig, H, V, HV */
    uint16_t *integral;
    /* for unrestricted mv we allocate more data than needed
     * allocated data are stored in buffer */
    pixel *buffer[4];
    pixel *buffer_fld[4];
    pixel *buffer_lowres;
    x264_weight_t weight[X264_REF_MAX][3]; /* [ref_index][plane] */
    pixel *weighted[X264_REF_MAX]; /* plane[0] weighted of the reference frames */
    int b_duplicate;
    struct x264_frame *orig;
    /* motion data */
    int8_t  *mb_type;
    uint8_t *mb_partition;
    int16_t (*mv[2])[2];
    int16_t (*mv16x16)[2];
    int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
    uint8_t *field;
    uint8_t *effective_qp;
    /* Stored as (lists_used << LOWRES_COST_SHIFT) + (cost).
     * Doesn't need special addressing for intra cost because
     * lists_used is guaranteed to be zero in that case. */
    uint16_t (*lowres_costs[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
    #define LOWRES_COST_MASK ((1<<14)-1)
    #define LOWRES_COST_SHIFT 14
    int     *lowres_mv_costs[2][X264_BFRAME_MAX+1];
    int8_t  *ref[2];
    int     i_ref[2];
    int     ref_poc[2][X264_REF_MAX];
    int16_t inv_ref_poc[2]; // inverse values of ref0 poc to avoid divisions in temporal MV prediction
    /* for adaptive B-frame decision.
     * contains the SATD cost of the lowres frame encoded in various modes
     * FIXME: how big an array do we need? */
    int     i_cost_est[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
    int     i_cost_est_aq[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
    int     i_satd; // the i_cost_est of the selected frametype
    int     i_intra_mbs[X264_BFRAME_MAX+2];
    int     *i_row_satds[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
    int     *i_row_satd;
    int     *i_row_bits;
    float   *f_row_qp;
    float   *f_row_qscale;
    float   *f_qp_offset;
    float   *f_qp_offset_aq;
    int     b_intra_calculated;
    uint16_t *i_intra_cost;
    uint16_t *i_propagate_cost;
    uint16_t *i_inv_qscale_factor;
    int     b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */
    float   f_weighted_cost_delta[X264_BFRAME_MAX+2];
    uint32_t i_pixel_sum[3];
    uint64_t i_pixel_ssd[3];
    /* hrd */
    x264_hrd_t hrd_timing;
    /* vbv */
    uint8_t i_planned_type[X264_LOOKAHEAD_MAX+1];
    int i_planned_satd[X264_LOOKAHEAD_MAX+1];
    double f_planned_cpb_duration[X264_LOOKAHEAD_MAX+1];
    int64_t i_coded_fields_lookahead;
    int64_t i_cpb_delay_lookahead;
    /* threading */
    int     i_lines_completed; /* in pixels */
    int     i_lines_weighted; /* FIXME: this only supports weighting of one reference frame */
    int     i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */
    x264_pthread_mutex_t mutex;
    x264_pthread_cond_t  cv;
    int     i_slice_count; /* Atomically written to/read from with slice threads */
    /* periodic intra refresh */
    float   f_pir_position;
    int     i_pir_start_col;
    int     i_pir_end_col;
    int     i_frames_since_pir;
    /* interactive encoder control */
    int     b_corrupt;
    /* user sei */
    x264_sei_t extra_sei;
    /* user data */
    void *opaque;
    /* user frame properties */
    uint8_t *mb_info;
    void (*mb_info_free)( void* );
 #if HAVE_OPENCL
    x264_frame_opencl_t opencl;
 #endif
 } x264_frame_t;
 /* synchronized frame list */
 typedef struct
 {
   x264_frame_t **list;  /* array of frame pointers */
   int i_max_size;       /* capacity requested when the list was initialised */
   int i_size;           /* number of frames currently stored */
   x264_pthread_mutex_t     mutex;
   x264_pthread_cond_t      cv_fill;  /* event signaling that the list became fuller */
   x264_pthread_cond_t      cv_empty; /* event signaling that the list became emptier */
 } x264_sync_frame_list_t;
 /* Deblocking kernel signatures. The "inter" form takes a tc0 array in
 * addition to the alpha/beta thresholds; the "intra" form takes only the
 * thresholds. */
 typedef void (*x264_deblock_inter_t)( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 typedef void (*x264_deblock_intra_t)( pixel *pix, intptr_t stride, int alpha, int beta );
 /* Table of deblocking function pointers, filled in by x264_deblock_init(). */
 typedef struct
 {
    x264_deblock_inter_t deblock_luma[2];
    x264_deblock_inter_t deblock_chroma[2];
    x264_deblock_inter_t deblock_h_chroma_420;
    x264_deblock_inter_t deblock_h_chroma_422;
    x264_deblock_intra_t deblock_luma_intra[2];
    x264_deblock_intra_t deblock_chroma_intra[2];
    x264_deblock_intra_t deblock_h_chroma_420_intra;
    x264_deblock_intra_t deblock_h_chroma_422_intra;
    x264_deblock_inter_t deblock_luma_mbaff;
    x264_deblock_inter_t deblock_chroma_mbaff;
    x264_deblock_inter_t deblock_chroma_420_mbaff;
    x264_deblock_inter_t deblock_chroma_422_mbaff;
    x264_deblock_intra_t deblock_luma_intra_mbaff;
    x264_deblock_intra_t deblock_chroma_intra_mbaff;
    x264_deblock_intra_t deblock_chroma_420_intra_mbaff;
    x264_deblock_intra_t deblock_chroma_422_intra_mbaff;
    /* Fills bs[2][8][4] from the nnz, ref and mv inputs. */
    void (*deblock_strength)( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                              int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit,
                              int bframe );
 } x264_deblock_function_t;
 /* Frame API prototypes. Each name is first remapped through
 * x264_template() (defined elsewhere) and then declared. */

 /* frame lifetime, picture copy and border expansion */
 #define x264_frame_delete x264_template(frame_delete)
 void          x264_frame_delete( x264_frame_t *frame );
 #define x264_frame_copy_picture x264_template(frame_copy_picture)
 int           x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src );
 #define x264_frame_expand_border x264_template(frame_expand_border)
 void          x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y );
 #define x264_frame_expand_border_filtered x264_template(frame_expand_border_filtered)
 void          x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
 #define x264_frame_expand_border_lowres x264_template(frame_expand_border_lowres)
 void          x264_frame_expand_border_lowres( x264_frame_t *frame );
 #define x264_frame_expand_border_chroma x264_template(frame_expand_border_chroma)
 void          x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane );
 #define x264_frame_expand_border_mod16 x264_template(frame_expand_border_mod16)
 void          x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame );
 #define x264_expand_border_mbpair x264_template(expand_border_mbpair)
 void          x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y );
 /* deblocking, filtering and lowres initialisation */
 #define x264_frame_deblock_row x264_template(frame_deblock_row)
 void          x264_frame_deblock_row( x264_t *h, int mb_y );
 #define x264_macroblock_deblock x264_template(macroblock_deblock)
 void          x264_macroblock_deblock( x264_t *h );
 #define x264_frame_filter x264_template(frame_filter)
 void          x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
 #define x264_frame_init_lowres x264_template(frame_init_lowres)
 void          x264_frame_init_lowres( x264_t *h, x264_frame_t *frame );
 #define x264_deblock_init x264_template(deblock_init)
 void          x264_deblock_init( uint32_t cpu, x264_deblock_function_t *pf, int b_mbaff );
 /* threading */
 #define x264_frame_cond_broadcast x264_template(frame_cond_broadcast)
 void          x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed );
 #define x264_frame_cond_wait x264_template(frame_cond_wait)
 int           x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed );
 #define x264_frame_new_slice x264_template(frame_new_slice)
 int           x264_frame_new_slice( x264_t *h, x264_frame_t *frame );
 #define x264_threadslice_cond_broadcast x264_template(threadslice_cond_broadcast)
 void          x264_threadslice_cond_broadcast( x264_t *h, int pass );
 #define x264_threadslice_cond_wait x264_template(threadslice_cond_wait)
 void          x264_threadslice_cond_wait( x264_t *h, int pass );
 /* list operators on NULL-terminated frame arrays (declared with X264_API) */
 #define x264_frame_push x264_template(frame_push)
 X264_API void          x264_frame_push( x264_frame_t **list, x264_frame_t *frame );
 #define x264_frame_pop x264_template(frame_pop)
 X264_API x264_frame_t *x264_frame_pop( x264_frame_t **list );
 #define x264_frame_unshift x264_template(frame_unshift)
 X264_API void          x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame );
 #define x264_frame_shift x264_template(frame_shift)
 X264_API x264_frame_t *x264_frame_shift( x264_frame_t **list );
 /* unused-frame pools and plane weighting */
 #define x264_frame_push_unused x264_template(frame_push_unused)
 void          x264_frame_push_unused( x264_t *h, x264_frame_t *frame );
 #define x264_frame_push_blank_unused x264_template(frame_push_blank_unused)
 void          x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame );
 #define x264_frame_pop_blank_unused x264_template(frame_pop_blank_unused)
 x264_frame_t *x264_frame_pop_blank_unused( x264_t *h );
 #define x264_weight_scale_plane x264_template(weight_scale_plane)
 void x264_weight_scale_plane( x264_t *h, pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
                              int i_width, int i_height, x264_weight_t *w );
 #define x264_frame_pop_unused x264_template(frame_pop_unused)
 x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec );
 #define x264_frame_delete_list x264_template(frame_delete_list)
 void          x264_frame_delete_list( x264_frame_t **list );
 /* synchronized frame list */
 #define x264_sync_frame_list_init x264_template(sync_frame_list_init)
 int           x264_sync_frame_list_init( x264_sync_frame_list_t *slist, int nelem );
 #define x264_sync_frame_list_delete x264_template(sync_frame_list_delete)
 void          x264_sync_frame_list_delete( x264_sync_frame_list_t *slist );
 #define x264_sync_frame_list_push x264_template(sync_frame_list_push)
 void          x264_sync_frame_list_push( x264_sync_frame_list_t *slist, x264_frame_t *frame );
 #define x264_sync_frame_list_pop x264_template(sync_frame_list_pop)
 x264_frame_t *x264_sync_frame_list_pop( x264_sync_frame_list_t *slist );
 #endif
--- a/common/loongarch/dct-a.S
+++ b/common/loongarch/dct-a.S
--- a/common/loongarch/dct.h
+++ b/common/loongarch/dct.h
@@ -0,0 +1,95 @@
 /*****************************************************************************
 * dct.h: loongarch transform and zigzag
 *****************************************************************************
 * Copyright (C) 2023-2025 x264 project
 *
 * Authors: Peng Zhou <zhoupeng@loongson.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_LOONGARCH_DCT_H
 #define X264_LOONGARCH_DCT_H
 /* Prototypes for the LoongArch transform and zigzag routines. Each name is
 * remapped through x264_template() (defined elsewhere) before it is
 * declared. x264_sub8x8_dct8_lsx and x264_add4x4_idct_lsx used to be
 * defined and declared twice in this header; each now appears once, in the
 * _lsx group. */

 /* routines with the _lasx suffix */
 #define x264_sub8x8_dct_lasx x264_template(sub8x8_dct_lasx)
 void x264_sub8x8_dct_lasx( int16_t p_dst[4][16], uint8_t *p_src, uint8_t *p_ref );
 #define x264_sub16x16_dct_lasx x264_template(sub16x16_dct_lasx)
 void x264_sub16x16_dct_lasx( int16_t p_dst[16][16], uint8_t *p_src, uint8_t *p_ref );
 #define x264_sub16x16_dct8_lasx x264_template(sub16x16_dct8_lasx)
 void x264_sub16x16_dct8_lasx( int16_t pi_dct[4][64], uint8_t *p_pix1,
                              uint8_t *p_pix2 );
 #define x264_add8x8_idct_lasx x264_template(add8x8_idct_lasx)
 void x264_add8x8_idct_lasx( uint8_t *p_dst, int16_t pi_dct[4][16] );
 #define x264_add16x16_idct_lasx x264_template(add16x16_idct_lasx)
 void x264_add16x16_idct_lasx( uint8_t *p_dst, int16_t pi_dct[16][16] );
 #define x264_add8x8_idct8_lasx x264_template(add8x8_idct8_lasx)
 void x264_add8x8_idct8_lasx( uint8_t *p_dst, int16_t pi_dct[64] );
 #define x264_add8x8_idct_dc_lasx x264_template(add8x8_idct_dc_lasx)
 void x264_add8x8_idct_dc_lasx( uint8_t *p_dst, int16_t dct[4] );
 #define x264_add16x16_idct_dc_lasx x264_template(add16x16_idct_dc_lasx)
 void x264_add16x16_idct_dc_lasx( uint8_t *p_dst, int16_t dct[16] );
 #define x264_idct4x4dc_lasx x264_template(idct4x4dc_lasx)
 void x264_idct4x4dc_lasx( int16_t d[16] );
 #define x264_dct4x4dc_lasx x264_template(dct4x4dc_lasx)
 void x264_dct4x4dc_lasx( int16_t d[16] );
 #define x264_zigzag_scan_4x4_frame_lasx x264_template(zigzag_scan_4x4_frame_lasx)
 void x264_zigzag_scan_4x4_frame_lasx( int16_t level[16], int16_t dct[16] );
 /* routines with the _lsx suffix */
 #define x264_sub4x4_dct_lsx x264_template(sub4x4_dct_lsx)
 void x264_sub4x4_dct_lsx( int16_t p_dst[16], uint8_t *p_src, uint8_t *p_ref );
 #define x264_sub8x8_dct_lsx x264_template(sub8x8_dct_lsx)
 void x264_sub8x8_dct_lsx( int16_t p_dst[4][16], uint8_t *p_src, uint8_t *p_ref );
 #define x264_sub16x16_dct_lsx x264_template(sub16x16_dct_lsx)
 void x264_sub16x16_dct_lsx( int16_t p_dst[16][16], uint8_t *p_src, uint8_t *p_ref );
 #define x264_sub8x8_dct8_lsx x264_template(sub8x8_dct8_lsx)
 void x264_sub8x8_dct8_lsx( int16_t pi_dct[64], uint8_t *p_pix1, uint8_t *p_pix2 );
 #define x264_sub16x16_dct8_lsx x264_template(sub16x16_dct8_lsx)
 void x264_sub16x16_dct8_lsx( int16_t pi_dct[4][64], uint8_t *p_pix1,
                              uint8_t *p_pix2 );
 #define x264_add4x4_idct_lsx x264_template(add4x4_idct_lsx)
 void x264_add4x4_idct_lsx( uint8_t *p_dst, int16_t pi_dct[16] );
 #define x264_add8x8_idct_lsx x264_template(add8x8_idct_lsx)
 void x264_add8x8_idct_lsx( uint8_t *p_dst, int16_t pi_dct[4][16] );
 #define x264_add16x16_idct_lsx x264_template(add16x16_idct_lsx)
 void x264_add16x16_idct_lsx( uint8_t *p_dst, int16_t pi_dct[16][16] );
 #define x264_add8x8_idct8_lsx x264_template(add8x8_idct8_lsx)
 void x264_add8x8_idct8_lsx( uint8_t *p_dst, int16_t pi_dct[64] );
 #define x264_add8x8_idct_dc_lsx x264_template(add8x8_idct_dc_lsx)
 void x264_add8x8_idct_dc_lsx( uint8_t *p_dst, int16_t dct[4] );
 #define x264_add16x16_idct_dc_lsx x264_template(add16x16_idct_dc_lsx)
 void x264_add16x16_idct_dc_lsx( uint8_t *p_dst, int16_t dct[16] );
 #define x264_idct4x4dc_lsx x264_template(idct4x4dc_lsx)
 void x264_idct4x4dc_lsx( int16_t d[16] );
 #define x264_dct4x4dc_lsx x264_template(dct4x4dc_lsx)
 void x264_dct4x4dc_lsx( int16_t d[16] );
 #define x264_zigzag_scan_4x4_frame_lsx x264_template(zigzag_scan_4x4_frame_lsx)
 void x264_zigzag_scan_4x4_frame_lsx( int16_t level[16], int16_t dct[16] );
 #endif
--- a/common/loongarch/deblock-a.S
+++ b/common/loongarch/deblock-a.S
--- a/common/loongarch/deblock.h
+++ b/common/loongarch/deblock.h
@@ -0,0 +1,54 @@
 /*****************************************************************************
 * deblock.h: loongarch deblock
 *****************************************************************************
 * Copyright (C) 2023-2025 x264 project
 *
 * Authors: Hao Chen <chenhao@loongson.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_LOONGARCH_DEBLOCK_H
 #define X264_LOONGARCH_DEBLOCK_H
 /* Prototypes for the LoongArch deblocking routines. They take uint8_t
 * pixels and are only declared when HIGH_BIT_DEPTH is 0. Each name is
 * remapped through x264_template() (defined elsewhere) before it is
 * declared. */
 #if !HIGH_BIT_DEPTH
 #define x264_deblock_v_luma_lasx x264_template(deblock_v_luma_lasx)
 void x264_deblock_v_luma_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 #define x264_deblock_h_luma_lasx x264_template(deblock_h_luma_lasx)
 void x264_deblock_h_luma_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 #define x264_deblock_v_luma_intra_lsx x264_template(deblock_v_luma_intra_lsx)
 void x264_deblock_v_luma_intra_lsx( uint8_t *pix, intptr_t stride, int alpha, int beta );
 #define x264_deblock_h_luma_intra_lsx x264_template(deblock_h_luma_intra_lsx)
 void x264_deblock_h_luma_intra_lsx( uint8_t *pix, intptr_t stride, int alpha, int beta );
 #define x264_deblock_v_luma_intra_lasx x264_template(deblock_v_luma_intra_lasx)
 void x264_deblock_v_luma_intra_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta );
 #define x264_deblock_h_luma_intra_lasx x264_template(deblock_h_luma_intra_lasx)
 void x264_deblock_h_luma_intra_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta );
 /* Same parameter list as the deblock_strength member of x264_deblock_function_t. */
 #define x264_deblock_strength_lsx x264_template(deblock_strength_lsx)
 void x264_deblock_strength_lsx( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                int mvy_limit, int bframe );
 #define x264_deblock_strength_lasx x264_template(deblock_strength_lasx)
 void x264_deblock_strength_lasx( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                 int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                 int mvy_limit, int bframe );
 #endif
 #endif
--- a/common/loongarch/loongson_asm.S
+++ b/common/loongarch/loongson_asm.S
@@ -0,0 +1,770 @@
 /*********************************************************************
 * Copyright (c) 2022-2024 Loongson Technology Corporation Limited
 * Contributed by Xiwei Gu <guxiwei-hf@loongson.cn>
 *                Shiyou Yin <yinshiyou-hf@loongson.cn>
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *********************************************************************/
 /*
 * This file is a LoongArch assembly helper file and is available under the
 * ISC license. It provides a large number of macros and aliases to simplify
 * writing assembly code, especially for LSX and LASX optimizations.
 *
 * Anyone can modify it or add new features for their own purposes.
 * Contributing a patch will be appreciated as it might be useful for
 * others as well. Send patches to the Loongson contributors mentioned above.
 *
 * MAJOR version: Usage changes, incompatible with previous version.
 * MINOR version: Add new macros/functions, or bug fixes.
 * MICRO version: Comment changes or implementation changes.
 */
 /* Helper-file version, following the MAJOR/MINOR/MICRO scheme. */
 #define LML_VERSION_MAJOR 0
 #define LML_VERSION_MINOR 4
 #define LML_VERSION_MICRO 0
 /* Prefix pasted in front of every symbol emitted by `function`; empty here. */
 #define ASM_PREF
 /* Default value of the `align` argument of the `function` and `const` macros. */
 #define DEFAULT_ALIGN    5
 /*
 *============================================================================
 * macros for specific project, set them as needed.
 * Following LoongML macros for your reference.
 *============================================================================
 */
 /*
 * function name, align
 *   Switches to .text, applies `.align \align`, and emits a global label
 *   ASM_PREF\name of type @function.
 *   It also defines a one-shot `endfunc` macro for closing the function.
 *   `endfunc` emits `jirl $r0, $r1, 0x0` (presumably the return to the
 *   caller; confirm against the LoongArch ABI), records the symbol size, and
 *   then purges itself so the next `function` can define it again.
 */
 .macro function name, align=DEFAULT_ALIGN
 .macro endfunc
    jirl    $r0, $r1, 0x0
    .size ASM_PREF\name, . - ASM_PREF\name
    .purgem endfunc
 .endm
 .text ;
 .align \align ;
 .globl ASM_PREF\name ;
 .type  ASM_PREF\name, @function ;
 ASM_PREF\name: ;
 .endm
 /*
 * const name, align
 *   Switches to .rodata, applies `.align \align`, and emits the label \name
 *   (no ASM_PREF prefix and no .globl).
 *   It also defines a one-shot `endconst` macro that records the symbol size
 *   and then purges itself.
 */
 .macro  const name, align=DEFAULT_ALIGN
    .macro endconst
    .size  \name, . - \name
    .purgem endconst
    .endm
 .section .rodata
 .align   \align
 \name:
 .endm
 /*
 *============================================================================
 * LoongArch register alias
 *============================================================================
 */
 /* General-purpose register names: map each bare name onto its `$`-prefixed
 * form so assembly sources can omit the `$`. */
 #define a0 $a0
 #define a1 $a1
 #define a2 $a2
 #define a3 $a3
 #define a4 $a4
 #define a5 $a5
 #define a6 $a6
 #define a7 $a7
 #define t0 $t0
 #define t1 $t1
 #define t2 $t2
 #define t3 $t3
 #define t4 $t4
 #define t5 $t5
 #define t6 $t6
 #define t7 $t7
 #define t8 $t8
 #define s0 $s0
 #define s1 $s1
 #define s2 $s2
 #define s3 $s3
 #define s4 $s4
 #define s5 $s5
 #define s6 $s6
 #define s7 $s7
 #define s8 $s8
 #define zero $zero
 #define sp   $sp
 #define ra   $ra
 /* Floating-point register names: the fa/ft/fs names and the numbered
 * f0..f31 names, each mapped onto its `$`-prefixed form. */
 #define fa0  $fa0
 #define fa1  $fa1
 #define fa2  $fa2
 #define fa3  $fa3
 #define fa4  $fa4
 #define fa5  $fa5
 #define fa6  $fa6
 #define fa7  $fa7
 #define ft0  $ft0
 #define ft1  $ft1
 #define ft2  $ft2
 #define ft3  $ft3
 #define ft4  $ft4
 #define ft5  $ft5
 #define ft6  $ft6
 #define ft7  $ft7
 #define ft8  $ft8
 #define ft9  $ft9
 #define ft10 $ft10
 #define ft11 $ft11
 #define ft12 $ft12
 #define ft13 $ft13
 #define ft14 $ft14
 #define ft15 $ft15
 #define fs0  $fs0
 #define fs1  $fs1
 #define fs2  $fs2
 #define fs3  $fs3
 #define fs4  $fs4
 #define fs5  $fs5
 #define fs6  $fs6
 #define fs7  $fs7
 #define f0  $f0
 #define f1  $f1
 #define f2  $f2
 #define f3  $f3
 #define f4  $f4
 #define f5  $f5
 #define f6  $f6
 #define f7  $f7
 #define f8  $f8
 #define f9  $f9
 #define f10 $f10
 #define f11 $f11
 #define f12 $f12
 #define f13 $f13
 #define f14 $f14
 #define f15 $f15
 #define f16 $f16
 #define f17 $f17
 #define f18 $f18
 #define f19 $f19
 #define f20 $f20
 #define f21 $f21
 #define f22 $f22
 #define f23 $f23
 #define f24 $f24
 #define f25 $f25
 #define f26 $f26
 #define f27 $f27
 #define f28 $f28
 #define f29 $f29
 #define f30 $f30
 #define f31 $f31
 /* vr0..vr31 vector register names mapped onto their `$`-prefixed forms
 * (presumably the LSX registers; confirm against the ISA manual). */
 #define vr0 $vr0
 #define vr1 $vr1
 #define vr2 $vr2
 #define vr3 $vr3
 #define vr4 $vr4
 #define vr5 $vr5
 #define vr6 $vr6
 #define vr7 $vr7
 #define vr8 $vr8
 #define vr9 $vr9
 #define vr10 $vr10
 #define vr11 $vr11
 #define vr12 $vr12
 #define vr13 $vr13
 #define vr14 $vr14
 #define vr15 $vr15
 #define vr16 $vr16
 #define vr17 $vr17
 #define vr18 $vr18
 #define vr19 $vr19
 #define vr20 $vr20
 #define vr21 $vr21
 #define vr22 $vr22
 #define vr23 $vr23
 #define vr24 $vr24
 #define vr25 $vr25
 #define vr26 $vr26
 #define vr27 $vr27
 #define vr28 $vr28
 #define vr29 $vr29
 #define vr30 $vr30
 #define vr31 $vr31
 /* xr0..xr31 vector register names mapped onto their `$`-prefixed forms
 * (presumably the LASX registers; confirm against the ISA manual). */
 #define xr0 $xr0
 #define xr1 $xr1
 #define xr2 $xr2
 #define xr3 $xr3
 #define xr4 $xr4
 #define xr5 $xr5
 #define xr6 $xr6
 #define xr7 $xr7
 #define xr8 $xr8
 #define xr9 $xr9
 #define xr10 $xr10
 #define xr11 $xr11
 #define xr12 $xr12
 #define xr13 $xr13
 #define xr14 $xr14
 #define xr15 $xr15
 #define xr16 $xr16
 #define xr17 $xr17
 #define xr18 $xr18
 #define xr19 $xr19
 #define xr20 $xr20
 #define xr21 $xr21
 #define xr22 $xr22
 #define xr23 $xr23
 #define xr24 $xr24
 #define xr25 $xr25
 #define xr26 $xr26
 #define xr27 $xr27
 #define xr28 $xr28
 #define xr29 $xr29
 #define xr30 $xr30
 #define xr31 $xr31
 /*
 *============================================================================
 * LSX/LASX synthesized instructions
 *============================================================================
 */
 /*
 * Description : Dot product of adjacent (even/odd) vector element pairs
 * Arguments   : Inputs  - vj, vk (xj, xk for the LASX variants)
 *               Outputs - vd (xd for the LASX variants)
 *               Return Type - twice the input element size
 *                             (.h.bu / .h.bu.b : byte -> halfword,
 *                              .w.h            : halfword -> word)
 * Details     : Each macro issues the even-element widening multiply
 *               (*mulwev) and then accumulates the odd-element widening
 *               multiply (*maddwod) into the same destination.
 *               The destination is written by the first instruction and the
 *               sources are read again by the second one, so the destination
 *               must not be the same register as either source.
 */
 .macro vdp2.h.bu vd, vj, vk
    vmulwev.h.bu      \vd,    \vj,    \vk
    vmaddwod.h.bu     \vd,    \vj,    \vk
 .endm
 .macro vdp2.h.bu.b vd, vj, vk
    vmulwev.h.bu.b    \vd,    \vj,    \vk
    vmaddwod.h.bu.b   \vd,    \vj,    \vk
 .endm
 .macro vdp2.w.h vd, vj, vk
    vmulwev.w.h       \vd,    \vj,    \vk
    vmaddwod.w.h      \vd,    \vj,    \vk
 .endm
 /* LASX counterparts of the three macros above. */
 .macro xvdp2.h.bu xd, xj, xk
    xvmulwev.h.bu    \xd,    \xj,    \xk
    xvmaddwod.h.bu   \xd,    \xj,    \xk
 .endm
 .macro xvdp2.h.bu.b xd, xj, xk
    xvmulwev.h.bu.b    \xd,  \xj,    \xk
    xvmaddwod.h.bu.b   \xd,  \xj,    \xk
 .endm
 .macro xvdp2.w.h xd, xj, xk
    xvmulwev.w.h       \xd,  \xj,    \xk
    xvmaddwod.w.h      \xd,  \xj,    \xk
 .endm
 /*
 * Description : Dot product & addition of byte or halfword vector elements
 * Arguments   : Inputs  - vj, vk (xj, xk for the LASX variants)
 *               In/Out  - vd (xd): accumulator, read and updated
 *               Return Type - twice size of input
 * Details     : Both instructions are multiply-accumulate forms (*maddwev
 *               for the even elements, *maddwod for the odd ones), so the
 *               products are added to the value already held in the
 *               destination.
 *               The destination is updated by the first instruction before
 *               the sources are read again, so it must not be the same
 *               register as either source.
 */
 .macro vdp2add.h.bu vd, vj, vk
    vmaddwev.h.bu     \vd,    \vj,    \vk
    vmaddwod.h.bu     \vd,    \vj,    \vk
 .endm
 .macro vdp2add.h.bu.b vd, vj, vk
    vmaddwev.h.bu.b   \vd,    \vj,    \vk
    vmaddwod.h.bu.b   \vd,    \vj,    \vk
 .endm
 .macro vdp2add.w.h vd, vj, vk
    vmaddwev.w.h      \vd,    \vj,    \vk
    vmaddwod.w.h      \vd,    \vj,    \vk
 .endm
 /* LASX counterparts (no xvdp2add.h.bu variant is defined here). */
 .macro xvdp2add.h.bu.b xd, xj, xk
    xvmaddwev.h.bu.b   \xd,  \xj,    \xk
    xvmaddwod.h.bu.b   \xd,  \xj,    \xk
 .endm
 .macro xvdp2add.w.h xd, xj, xk
    xvmaddwev.w.h      \xd,  \xj,    \xk
    xvmaddwod.w.h      \xd,  \xj,    \xk
 .endm
 /*
 * Description : Clamp each element vj[i] to the range vk[i] ~ va[i]
 * clip: vd = min( max( vj, vk ), va )
 * Arguments   : Inputs  - vj (value), vk (lower bound), va (upper bound)
 *               Outputs - vd
 * The lower bound is applied first (max), then the upper bound (min) on the
 * intermediate result held in vd; vd must therefore not be the same register
 * as va.
 */
 .macro vclip.h  vd,  vj, vk, va
    vmax.h    \vd,  \vj,   \vk
    vmin.h    \vd,  \vd,   \va
 .endm
 .macro vclip.w  vd,  vj, vk, va
    vmax.w    \vd,  \vj,   \vk
    vmin.w    \vd,  \vd,   \va
 .endm
 /* LASX counterparts. */
 .macro xvclip.h  xd,  xj, xk, xa
    xvmax.h    \xd,  \xj,   \xk
    xvmin.h    \xd,  \xd,   \xa
 .endm
 .macro xvclip.w  xd,  xj, xk, xa
    xvmax.w    \xd,  \xj,   \xk
    xvmin.w    \xd,  \xd,   \xa
 .endm
 /*
 * Description : Clamp each element vj[i] to the range 0 ~ 255
 * clip255: vd = min( max( vj, 0 ), 255 )
 * Arguments   : Inputs  - vj
 *               Outputs - vd (may be the same register as vj)
 * Negative values are first raised to 0 with the immediate max, then the
 * unsigned saturate with immediate 7 caps the result (presumably to
 * 8 unsigned bits, matching the 255 limit stated above).
 */
 .macro vclip255.h  vd, vj
    vmaxi.h   \vd,   \vj,  0
    vsat.hu   \vd,   \vd,  7
 .endm
 .macro vclip255.w  vd, vj
    vmaxi.w   \vd,   \vj,  0
    vsat.wu   \vd,   \vd,  7
 .endm
 /* LASX counterparts. */
 .macro xvclip255.h  xd, xj
    xvmaxi.h   \xd,   \xj,  0
    xvsat.hu   \xd,   \xd,  7
 .endm
 .macro xvclip255.w  xd, xj
    xvmaxi.w   \xd,   \xj,  0
    xvsat.wu   \xd,   \xd,  7
 .endm
 /*
 * Description : Store one element of a vector at an indexed address
 * vd : Data vector to be stored
 * rk : Address of data storage (updated: rk += ra)
 * ra : Offset of address
 * si : Index of data in vd
 *
 * Note: the offset is added to rk itself before the store, so rk keeps the
 * incremented address after the macro; the element is stored at the new rk
 * with a zero immediate offset.
 */
 .macro vstelmx.b vd, rk, ra, si
    add.d      \rk,  \rk,  \ra
    vstelm.b   \vd,  \rk,  0, \si
 .endm
 .macro vstelmx.h vd, rk, ra, si
    add.d      \rk,  \rk,  \ra
    vstelm.h   \vd,  \rk,  0, \si
 .endm
 .macro vstelmx.w vd, rk, ra, si
    add.d      \rk,  \rk,  \ra
    vstelm.w   \vd,  \rk,  0, \si
 .endm
 .macro vstelmx.d  vd, rk, ra, si
    add.d      \rk,  \rk,  \ra
    vstelm.d   \vd,  \rk,  0, \si
 .endm
 /*
 * Register-to-register copies, implemented as a bitwise OR of the source
 * with itself: vmov uses vor.v, xmov uses xvor.v.
 */
 .macro vmov xd, xj
    vor.v  \xd,  \xj,  \xj
 .endm
 .macro xmov xd, xj
    xvor.v  \xd,  \xj,  \xj
 .endm
 /*
 * LASX doubleword variant of the indexed element store: rk is advanced by ra
 * (and stays advanced), then element si of xd is stored at the new rk.
 */
 .macro xvstelmx.d  xd, rk, ra, si
    add.d      \rk, \rk,  \ra
    xvstelm.d  \xd, \rk,  0, \si
 .endm
 /*
 *============================================================================
 * LSX/LASX custom macros
 *============================================================================
 */
 /*
 * Load 4 float, double, V128, v256 elements with stride.
 *
 * Arguments   : src     - base address register
 *               stride, stride2, stride3 - registers holding the offsets of
 *                         the 2nd, 3rd and 4th element from src (the macros
 *                         do not compute them; presumably the caller passes
 *                         1x, 2x and 3x the row stride)
 *               out0..out3 - destination registers
 * out0 is loaded from src with a zero immediate offset; out1..out3 use the
 * register-indexed load forms. src itself is not modified.
 */
 .macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    fld.s     \out0,    \src,    0
    fldx.s    \out1,    \src,    \stride
    fldx.s    \out2,    \src,    \stride2
    fldx.s    \out3,    \src,    \stride3
 .endm
 .macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    fld.d     \out0,    \src,    0
    fldx.d    \out1,    \src,    \stride
    fldx.d    \out2,    \src,    \stride2
    fldx.d    \out3,    \src,    \stride3
 .endm
 .macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    vld     \out0,    \src,    0
    vldx    \out1,    \src,    \stride
    vldx    \out2,    \src,    \stride2
    vldx    \out3,    \src,    \stride3
 .endm
 .macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    xvld    \out0,    \src,    0
    xvldx   \out1,    \src,    \stride
    xvldx   \out2,    \src,    \stride2
    xvldx   \out3,    \src,    \stride3
 .endm
 /*
 * Description : Transpose 4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1 (clobbered)
 * Details     : Only the low-half interleave (vilvl.h) is applied to the
 *               inputs. All inputs are read before the first output is
 *               written.
 *               NOTE(review): out1 and out3 are built from the upper
 *               doubleword of out0 and out2, so presumably only the low
 *               64 bits of each output carry a transposed row - confirm
 *               against the callers.
 */
 .macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                          tmp0, tmp1
    /* Interleave halfwords of rows 0/1 and of rows 2/3. */
    vilvl.h   \tmp0,  \in1,   \in0
    vilvl.h   \tmp1,  \in3,   \in2
    /* Interleave words of the two pairs into out0 (low) and out2 (high). */
    vilvl.w   \out0,  \tmp1,  \tmp0
    vilvh.w   \out2,  \tmp1,  \tmp0
    /* Derive out1 / out3 from the upper doubleword of out0 / out2. */
    vilvh.d   \out1,  \out0,  \out0
    vilvh.d   \out3,  \out0,  \out2
 .endm
 /*
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1 (clobbered)
 * Details     : out1 and out3 are used as intermediates and are written
 *               while in2/in3 are still to be read, so out1 must not be the
 *               same register as in2 or in3; tmp0/tmp1 must not overlap any
 *               input.
 * Example     :
 *               1, 2, 3, 4            1, 5, 9,13
 *               5, 6, 7, 8    to      2, 6,10,14
 *               9,10,11,12  =====>    3, 7,11,15
 *              13,14,15,16            4, 8,12,16
 */
 .macro LSX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
                          tmp0, tmp1
    /* Word interleave of rows 0/1 (low -> tmp0, high -> out1). */
    vilvl.w    \tmp0,   \in1,    \in0
    vilvh.w    \out1,   \in1,    \in0
    /* Word interleave of rows 2/3 (low -> tmp1, high -> out3). */
    vilvl.w    \tmp1,   \in3,    \in2
    vilvh.w    \out3,   \in3,    \in2
    /* Doubleword interleave; out3 and out1 are overwritten last because */
    /* they still hold intermediates needed by the preceding lines.      */
    vilvl.d    \out0,   \tmp1,   \tmp0
    vilvl.d    \out2,   \out3,   \out1
    vilvh.d    \out3,   \out3,   \out1
    vilvh.d    \out1,   \tmp1,   \tmp0
 .endm
 /*
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *               Scratch - tmp0 .. tmp7 (clobbered)
 * Details     : Every input is read before the first output is written, so
 *               the outputs may reuse the input registers. The tmp registers
 *               are live across the whole macro and should be distinct from
 *               both inputs and outputs.
 */
 .macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1,   \
                          out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
                          tmp3, tmp4, tmp5, tmp6, tmp7
    /* Low halves: interleave rows (4,6), (5,7), (0,2), (1,3). */
    vilvl.h      \tmp0,    \in6,   \in4
    vilvl.h      \tmp1,    \in7,   \in5
    vilvl.h      \tmp2,    \in2,   \in0
    vilvl.h      \tmp3,    \in3,   \in1
    /* Second interleave level of the low halves. */
    vilvl.h      \tmp4,    \tmp1,  \tmp0
    vilvh.h      \tmp5,    \tmp1,  \tmp0
    vilvl.h      \tmp6,    \tmp3,  \tmp2
    vilvh.h      \tmp7,    \tmp3,  \tmp2
    /* High halves: same row pairing; this is the last read of the inputs. */
    vilvh.h      \tmp0,    \in6,   \in4
    vilvh.h      \tmp1,    \in7,   \in5
    vilvh.h      \tmp2,    \in2,   \in0
    vilvh.h      \tmp3,    \in3,   \in1
    /* Combine doublewords of the low-half results into out0..out3. */
    vpickev.d    \out0,    \tmp4,  \tmp6
    vpickod.d    \out1,    \tmp4,  \tmp6
    vpickev.d    \out2,    \tmp5,  \tmp7
    vpickod.d    \out3,    \tmp5,  \tmp7
    /* Second interleave level of the high halves. */
    vilvl.h      \tmp4,    \tmp1,  \tmp0
    vilvh.h      \tmp5,    \tmp1,  \tmp0
    vilvl.h      \tmp6,    \tmp3,  \tmp2
    vilvh.h      \tmp7,    \tmp3,  \tmp2
    /* Combine doublewords of the high-half results into out4..out7. */
    vpickev.d    \out4,    \tmp4,  \tmp6
    vpickod.d    \out5,    \tmp4,  \tmp6
    vpickev.d    \out6,    \tmp5,  \tmp7
    vpickod.d    \out7,    \tmp5,  \tmp7
 .endm
 /*
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
 *                         in8, in9, in10, in11, in12, in13, in14, in15
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *               Scratch - tmp0 .. tmp7 (clobbered)
 * Details     : Sixteen input rows are combined into eight output vectors.
 *               Only the low-half byte interleave (xvilvl.b) is applied to
 *               the inputs, and all of them are read before the first
 *               output is written, so outputs may reuse input registers.
 *               out0..out7 also serve as intermediates between the byte and
 *               word interleave stages.
 */
 .macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3, out4, out5, out6, out7,\
                            tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
    /* Stage 1: byte interleave of rows n and n+2. */
    xvilvl.b   \tmp0,    \in2,     \in0
    xvilvl.b   \tmp1,    \in3,     \in1
    xvilvl.b   \tmp2,    \in6,     \in4
    xvilvl.b   \tmp3,    \in7,     \in5
    xvilvl.b   \tmp4,    \in10,    \in8
    xvilvl.b   \tmp5,    \in11,    \in9
    xvilvl.b   \tmp6,    \in14,    \in12
    xvilvl.b   \tmp7,    \in15,    \in13
    /* Stage 2: byte interleave of the stage-1 pairs (4 rows per vector). */
    xvilvl.b   \out0,    \tmp1,    \tmp0
    xvilvh.b   \out1,    \tmp1,    \tmp0
    xvilvl.b   \out2,    \tmp3,    \tmp2
    xvilvh.b   \out3,    \tmp3,    \tmp2
    xvilvl.b   \out4,    \tmp5,    \tmp4
    xvilvh.b   \out5,    \tmp5,    \tmp4
    xvilvl.b   \out6,    \tmp7,    \tmp6
    xvilvh.b   \out7,    \tmp7,    \tmp6
    /* Stage 3: word interleave (8 rows per vector). */
    xvilvl.w   \tmp0,    \out2,    \out0
    xvilvh.w   \tmp2,    \out2,    \out0
    xvilvl.w   \tmp4,    \out3,    \out1
    xvilvh.w   \tmp6,    \out3,    \out1
    xvilvl.w   \tmp1,    \out6,    \out4
    xvilvh.w   \tmp3,    \out6,    \out4
    xvilvl.w   \tmp5,    \out7,    \out5
    xvilvh.w   \tmp7,    \out7,    \out5
    /* Stage 4: doubleword interleave produces the final vectors. */
    xvilvl.d   \out0,    \tmp1,    \tmp0
    xvilvh.d   \out1,    \tmp1,    \tmp0
    xvilvl.d   \out2,    \tmp3,    \tmp2
    xvilvh.d   \out3,    \tmp3,    \tmp2
    xvilvl.d   \out4,    \tmp5,    \tmp4
    xvilvh.d   \out5,    \tmp5,    \tmp4
    xvilvl.d   \out6,    \tmp7,    \tmp6
    xvilvh.d   \out7,    \tmp7,    \tmp6
 .endm
 /*
 * Description : Transpose 4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1 (clobbered)
 * Details     : Same instruction sequence as LSX_TRANSPOSE4x4_H, using the
 *               xv* (LASX) forms. All inputs are read before the first
 *               output is written.
 *               NOTE(review): out1 and out3 are built from the upper
 *               doubleword of out0 and out2, so presumably only the low
 *               doubleword (per 128-bit lane) of each output carries a
 *               transposed row - confirm against the callers.
 */
 .macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    /* Interleave halfwords of rows 0/1 and of rows 2/3. */
    xvilvl.h   \tmp0,  \in1,   \in0
    xvilvl.h   \tmp1,  \in3,   \in2
    /* Interleave words of the two pairs into out0 (low) and out2 (high). */
    xvilvl.w   \out0,  \tmp1,  \tmp0
    xvilvh.w   \out2,  \tmp1,  \tmp0
    /* Derive out1 / out3 from the upper doubleword of out0 / out2. */
    xvilvh.d   \out1,  \out0,  \out0
    xvilvh.d   \out3,  \out0,  \out2
 .endm
 /*
 * Description : Transpose 4x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1 (clobbered)
 * Details     : Rows are paired as (0,2) and (1,3). All inputs are read
 *               before the first output is written. Each final output is
 *               formed by interleaving a doubleword of the intermediate with
 *               itself, so the same doubleword appears twice in the result.
 */
 .macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.h      \tmp0,    \in2,   \in0
    xvilvl.h      \tmp1,    \in3,   \in1
    /* out2 / out3 temporarily hold the fully interleaved halfwords. */
    xvilvl.h      \out2,    \tmp1,  \tmp0
    xvilvh.h      \out3,    \tmp1,  \tmp0
    /* Split them by doubleword; out2 and out3 are overwritten only after */
    /* their other half has been extracted.                               */
    xvilvl.d      \out0,    \out2,  \out2
    xvilvh.d      \out1,    \out2,  \out2
    xvilvl.d      \out2,    \out3,  \out3
    xvilvh.d      \out3,    \out3,  \out3
 .endm
 /*
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *               Scratch - tmp0 .. tmp7 (clobbered)
 * Details     : Same instruction sequence as LSX_TRANSPOSE8x8_H, using the
 *               xv* (LASX) forms. Every input is read before the first
 *               output is written, so the outputs may reuse the input
 *               registers; the tmp registers should be distinct from both.
 */
 .macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7,         \
                           out0, out1, out2, out3, out4, out5, out6, out7, \
                           tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
    /* Low halves: interleave rows (4,6), (5,7), (0,2), (1,3). */
    xvilvl.h     \tmp0,   \in6,     \in4
    xvilvl.h     \tmp1,   \in7,     \in5
    xvilvl.h     \tmp2,   \in2,     \in0
    xvilvl.h     \tmp3,   \in3,     \in1
    /* Second interleave level of the low halves. */
    xvilvl.h     \tmp4,   \tmp1,    \tmp0
    xvilvh.h     \tmp5,   \tmp1,    \tmp0
    xvilvl.h     \tmp6,   \tmp3,    \tmp2
    xvilvh.h     \tmp7,   \tmp3,    \tmp2
    /* High halves: same row pairing; this is the last read of the inputs. */
    xvilvh.h     \tmp0,   \in6,     \in4
    xvilvh.h     \tmp1,   \in7,     \in5
    xvilvh.h     \tmp2,   \in2,     \in0
    xvilvh.h     \tmp3,   \in3,     \in1
    /* Combine doublewords of the low-half results into out0..out3. */
    xvpickev.d   \out0,   \tmp4,    \tmp6
    xvpickod.d   \out1,   \tmp4,    \tmp6
    xvpickev.d   \out2,   \tmp5,    \tmp7
    xvpickod.d   \out3,   \tmp5,    \tmp7
    /* Second interleave level of the high halves. */
    xvilvl.h     \tmp4,   \tmp1,    \tmp0
    xvilvh.h     \tmp5,   \tmp1,    \tmp0
    xvilvl.h     \tmp6,   \tmp3,    \tmp2
    xvilvh.h     \tmp7,   \tmp3,    \tmp2
    /* Combine doublewords of the high-half results into out4..out7. */
    xvpickev.d   \out4,   \tmp4,    \tmp6
    xvpickod.d   \out5,   \tmp4,    \tmp6
    xvpickev.d   \out6,   \tmp5,    \tmp7
    xvpickod.d   \out7,   \tmp5,    \tmp7
 .endm
 /*
 * Description : Transpose 2x4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1, tmp2 (clobbered)
 * Details     : out1 and out3 double as intermediates. out1 is written
 *               while in2/in3 are still to be read, so out1 must not be the
 *               same register as in2 or in3; tmp1 is written first and must
 *               not overlap any input.
 */
 .macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                             tmp0, tmp1, tmp2
    /* Halfword interleave of rows 0/1 and rows 2/3 (high and low parts). */
    xvilvh.h   \tmp1,    \in0,     \in1
    xvilvl.h   \out1,    \in0,     \in1
    xvilvh.h   \tmp0,    \in2,     \in3
    xvilvl.h   \out3,    \in2,     \in3
    /* Word interleave of the intermediates. */
    xvilvh.w   \tmp2,    \out3,    \out1
    xvilvl.w   \out3,    \out3,    \out1
    xvilvl.w   \out2,    \tmp0,    \tmp1
    xvilvh.w   \tmp1,    \tmp0,    \tmp1
    /* Doubleword interleave produces the final outputs. */
    xvilvh.d   \out0,    \out2,    \out3
    xvilvl.d   \out2,    \out2,    \out3
    xvilvh.d   \out1,    \tmp1,    \tmp2
    xvilvl.d   \out3,    \tmp1,    \tmp2
 .endm
 /*
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1 (clobbered)
 * Details     : Same instruction sequence as LSX_TRANSPOSE4x4_W, using the
 *               xv* (LASX) forms; as the example shows, two 4x4 blocks held
 *               side by side are transposed independently.
 *               out1 is written while in2/in3 are still to be read, so it
 *               must not be the same register as in2 or in3.
 * Example     :
 *               1, 2, 3, 4,  1, 2, 3, 4        1,5, 9,13, 1,5, 9,13
 *               5, 6, 7, 8,  5, 6, 7, 8   to   2,6,10,14, 2,6,10,14
 *               9,10,11,12,  9,10,11,12 =====> 3,7,11,15, 3,7,11,15
 *              13,14,15,16, 13,14,15,16        4,8,12,16, 4,8,12,16
 */
 .macro LASX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    /* Word interleave of rows 0/1 (low -> tmp0, high -> out1). */
    xvilvl.w    \tmp0,   \in1,    \in0
    xvilvh.w    \out1,   \in1,    \in0
    /* Word interleave of rows 2/3 (low -> tmp1, high -> out3). */
    xvilvl.w    \tmp1,   \in3,    \in2
    xvilvh.w    \out3,   \in3,    \in2
    /* Doubleword interleave; out3 and out1 are overwritten last because */
    /* they still hold intermediates needed by the preceding lines.      */
    xvilvl.d    \out0,   \tmp1,   \tmp0
    xvilvl.d    \out2,   \out3,   \out1
    xvilvh.d    \out3,   \out3,   \out1
    xvilvh.d    \out1,   \tmp1,   \tmp0
 .endm
 /*
 * Description : Transpose 8x8 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6,
 *                         out7
 *               Scratch - tmp0, tmp1, tmp2, tmp3 (clobbered)
 * Details     : out0..out3 are written before in4..in7 are read, so they
 *               must not be the same registers as in4..in7. The macro relies
 *               on the xmov helper macro for register copies.
 * Example     : LASX_TRANSPOSE8x8_W
 *         in0 : 1,2,3,4,5,6,7,8
 *         in1 : 2,2,3,4,5,6,7,8
 *         in2 : 3,2,3,4,5,6,7,8
 *         in3 : 4,2,3,4,5,6,7,8
 *         in4 : 5,2,3,4,5,6,7,8
 *         in5 : 6,2,3,4,5,6,7,8
 *         in6 : 7,2,3,4,5,6,7,8
 *         in7 : 8,2,3,4,5,6,7,8
 *
 *        out0 : 1,2,3,4,5,6,7,8
 *        out1 : 2,2,2,2,2,2,2,2
 *        out2 : 3,3,3,3,3,3,3,3
 *        out3 : 4,4,4,4,4,4,4,4
 *        out4 : 5,5,5,5,5,5,5,5
 *        out5 : 6,6,6,6,6,6,6,6
 *        out6 : 7,7,7,7,7,7,7,7
 *        out7 : 8,8,8,8,8,8,8,8
 */
 .macro LASX_TRANSPOSE8x8_W in0, in1, in2, in3, in4, in5, in6, in7,\
                           out0, out1, out2, out3, out4, out5, out6, out7,\
                           tmp0, tmp1, tmp2, tmp3
    /* Word-interleave rows 0..3 into out0..out3. */
    xvilvl.w    \tmp0,   \in2,    \in0
    xvilvl.w    \tmp1,   \in3,    \in1
    xvilvh.w    \tmp2,   \in2,    \in0
    xvilvh.w    \tmp3,   \in3,    \in1
    xvilvl.w    \out0,   \tmp1,   \tmp0
    xvilvh.w    \out1,   \tmp1,   \tmp0
    xvilvl.w    \out2,   \tmp3,   \tmp2
    xvilvh.w    \out3,   \tmp3,   \tmp2
    /* Word-interleave rows 4..7 into out4..out7. */
    xvilvl.w    \tmp0,   \in6,    \in4
    xvilvl.w    \tmp1,   \in7,    \in5
    xvilvh.w    \tmp2,   \in6,    \in4
    xvilvh.w    \tmp3,   \in7,    \in5
    xvilvl.w    \out4,   \tmp1,   \tmp0
    xvilvh.w    \out5,   \tmp1,   \tmp0
    xvilvl.w    \out6,   \tmp3,   \tmp2
    xvilvh.w    \out7,   \tmp3,   \tmp2
    /* Keep copies of out0..out3: the permutes below overwrite them but */
    /* their old contents are still needed to finish out4..out7.        */
    xmov        \tmp0,   \out0
    xmov        \tmp1,   \out1
    xmov        \tmp2,   \out2
    xmov        \tmp3,   \out3
    /* Recombine the two groups with xvpermi.q (selectors 0x02 and 0x31). */
    xvpermi.q   \out0,   \out4,   0x02
    xvpermi.q   \out1,   \out5,   0x02
    xvpermi.q   \out2,   \out6,   0x02
    xvpermi.q   \out3,   \out7,   0x02
    xvpermi.q   \out4,   \tmp0,   0x31
    xvpermi.q   \out5,   \tmp1,   0x31
    xvpermi.q   \out6,   \tmp2,   0x31
    xvpermi.q   \out7,   \tmp3,   0x31
 .endm
 /*
 * Description : Transpose 4x4 block with double-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1 (clobbered)
 * Details     : out1 is written while in2/in3 are still to be read, so it
 *               must not be the same register as in2 or in3; tmp0 is
 *               written first and must not overlap any input.
 * Example     : LASX_TRANSPOSE4x4_D
 *         in0 : 1,2,3,4
 *         in1 : 1,2,3,4
 *         in2 : 1,2,3,4
 *         in3 : 1,2,3,4
 *
 *        out0 : 1,1,1,1
 *        out1 : 2,2,2,2
 *        out2 : 3,3,3,3
 *        out3 : 4,4,4,4
 */
 .macro LASX_TRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    /* Doubleword interleave of rows 0/1 and rows 2/3. */
    xvilvl.d    \tmp0,   \in1,    \in0
    xvilvh.d    \out1,   \in1,    \in0
    xvilvh.d    \tmp1,   \in3,    \in2
    xvilvl.d    \out2,   \in3,    \in2
    /* Seed out0 / out3 with copies (OR with itself) before permuting. */
    xvor.v      \out0,   \tmp0,   \tmp0
    xvor.v      \out3,   \tmp1,   \tmp1
    /* Recombine the intermediates with xvpermi.q (selectors 0x02, 0x31). */
    xvpermi.q   \out0,   \out2,   0x02
    xvpermi.q   \out2,   \tmp0,   0x31
    xvpermi.q   \out3,   \out1,   0x31
    xvpermi.q   \out1,   \tmp1,   0x02
 .endm
--- a/common/loongarch/loongson_util.S
+++ b/common/loongarch/loongson_util.S
@@ -0,0 +1,47 @@
 /*****************************************************************************
 * loongson_util.S: loongson utility macros
 *****************************************************************************
 * Copyright (C) 2023-2025 x264 project
 *
 * Authors: Shiyou Yin <yinshiyou-hf@loongson.cn>
 *          Xiwei Gu <guxiwei-hf@loongson.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 /* GLUE pastes its two arguments as written; JOIN goes through GLUE so that */
 /* macro arguments are expanded before being pasted.                       */
 #define GLUE(a, b) a ## b
 #define JOIN(a, b) GLUE(a, b)
 /* Set prefix as needed. */
 /* ASM_REF expands to x264_<BIT_DEPTH>_ (BIT_DEPTH is defined elsewhere). */
 #define  ASM_REF  JOIN(JOIN(x264_, BIT_DEPTH), _)
 /* Strides for assembly code; presumably these mirror the FENC_STRIDE and */
 /* FDEC_STRIDE constants of the C sources - keep them in sync.            */
 #define FENC_STRIDE      16
 #define FDEC_STRIDE      32
 /*
 * function_x264 name, align
 *   Open a global function whose symbol is ASM_REF followed by \name. The
 *   symbol is placed in .text, aligned with ".align \align" (DEFAULT_ALIGN,
 *   defined elsewhere, when the argument is omitted) and typed @function.
 *
 *   Expanding this macro also defines the one-shot macro endfunc_x264, which
 *   closes the function: it emits the return jump, records the symbol size,
 *   and then purges itself so that the next function_x264 can define it
 *   again.
 */
 .macro function_x264 name, align=DEFAULT_ALIGN
 .macro endfunc_x264
    /* Return: jump to the address held in $r1, discarding the link ($r0). */
    jirl    $r0, $r1, 0x0
    .size ASM_REF\name, . - ASM_REF\name
    .purgem endfunc_x264
 .endm
 .text ;
 .align \align ;
 .globl ASM_REF\name ;
 .type  ASM_REF\name, @function ;
 ASM_REF\name: ;
 .endm
--- a/common/loongarch/mc-a.S
+++ b/common/loongarch/mc-a.S
--- a/common/loongarch/mc-c.c
+++ b/common/loongarch/mc-c.c
@@ -0,0 +1,406 @@
 /*****************************************************************************
 * mc-c.c: loongarch motion compensation
 *****************************************************************************
 * Copyright (C) 2023-2025 x264 project
 *
 * Authors: Xiwei Gu <guxiwei-hf@loongson.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "common/common.h"
 #include "mc.h"
 #if !HIGH_BIT_DEPTH
 /*
 * MC_WEIGHT_LSX(func): define the static table mc<func>_wtab_lsx that holds
 * the LSX weighted-prediction kernels x264_mc_weight_w{4,8,16,20}<func>_lsx.
 * get_ref_lsx/mc_luma_lsx index the table with (width >> 2): slots 0-1 use
 * the 4-wide kernel, slot 2 the 8-wide one, slots 3-4 the 16-wide one and
 * slot 5 the 20-wide one.
 */
 #define MC_WEIGHT_LSX(func) \
 static void (* mc##func##_wtab_lsx[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, \
                                         const x264_weight_t *, int ) = \
 { \
    [0] = x264_mc_weight_w4##func##_lsx, \
    [1] = x264_mc_weight_w4##func##_lsx, \
    [2] = x264_mc_weight_w8##func##_lsx, \
    [3] = x264_mc_weight_w16##func##_lsx, \
    [4] = x264_mc_weight_w16##func##_lsx, \
    [5] = x264_mc_weight_w20##func##_lsx, \
 };
 /*
 * MC_WEIGHT(func): LASX counterpart of MC_WEIGHT_LSX. Defines the static
 * table mc<func>_wtab_lasx holding x264_mc_weight_w{4,8,16,20}<func>_lasx,
 * indexed by (width >> 2) in get_ref_lasx/mc_luma_lasx: slots 0-1 use the
 * 4-wide kernel, slot 2 the 8-wide one, slots 3-4 the 16-wide one and slot 5
 * the 20-wide one.
 */
 #define MC_WEIGHT(func) \
 static void (* mc##func##_wtab_lasx[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, \
                                          const x264_weight_t *, int ) = \
 { \
    [0] = x264_mc_weight_w4##func##_lasx, \
    [1] = x264_mc_weight_w4##func##_lasx, \
    [2] = x264_mc_weight_w8##func##_lasx, \
    [3] = x264_mc_weight_w16##func##_lasx, \
    [4] = x264_mc_weight_w16##func##_lasx, \
    [5] = x264_mc_weight_w20##func##_lasx, \
 };
 /*
 * Instantiate the weight tables used by weight_cache_lsx/weight_cache_lasx:
 *   mc_wtab_lsx,  mc_noden_wtab_lsx   (LSX)
 *   mc_wtab_lasx, mc_noden_wtab_lasx  (LASX)
 * The macros end with their own semicolon, so none is written here.
 * No extra preprocessor guard is needed: this code already sits inside the
 * file-wide "#if !HIGH_BIT_DEPTH" section.
 */
 MC_WEIGHT_LSX()
 MC_WEIGHT_LSX(_noden)
 MC_WEIGHT()
 MC_WEIGHT(_noden)
 /*
 * weight_cache_lsx: choose the LSX weight kernel table for w.
 * A denominator below 1 selects the "_noden" kernels, any other value the
 * regular ones. The h argument is not used.
 */
 static void weight_cache_lsx( x264_t *h, x264_weight_t *w )
 {
    w->weightfn = ( w->i_denom < 1 ) ? mc_noden_wtab_lsx : mc_wtab_lsx;
 }
 /*
 * LSX weight kernels installed as pf->weight, pf->offsetadd and
 * pf->offsetsub by x264_mc_init_loongarch. Presumably indexed by
 * (width >> 2) like the other tables in this file - confirm against the
 * generic callers.
 */
 static weight_fn_t mc_weight_wtab_lsx[6] =
 {
    [0] = x264_mc_weight_w4_lsx,
    [1] = x264_mc_weight_w4_lsx,
    [2] = x264_mc_weight_w8_lsx,
    [3] = x264_mc_weight_w16_lsx,
    [4] = x264_mc_weight_w16_lsx,
    [5] = x264_mc_weight_w20_lsx,
 };
 /*
 * LSX two-source averaging kernels, indexed by (width >> 2) in
 * get_ref_lsx/mc_luma_lsx. Slot 0 has no kernel; slot 3 reuses the 16-wide
 * one.
 */
 static void (* const pixel_avg_wtab_lsx[6])( uint8_t *, intptr_t, uint8_t *,
                                              intptr_t, uint8_t *, int ) =
 {
    [0] = NULL,
    [1] = x264_pixel_avg2_w4_lsx,
    [2] = x264_pixel_avg2_w8_lsx,
    [3] = x264_pixel_avg2_w16_lsx,
    [4] = x264_pixel_avg2_w16_lsx,
    [5] = x264_pixel_avg2_w20_lsx,
 };
 /*
 * LSX block-copy kernels, indexed by (width >> 2) in mc_luma_lsx.
 * Only widths 4, 8 and 16 (slots 1, 2 and 4) have a kernel.
 */
 static void (* const mc_copy_wtab_lsx[5])( uint8_t *, intptr_t, uint8_t *,
                                            intptr_t, int ) =
 {
    [0] = NULL,
    [1] = x264_mc_copy_w4_lsx,
    [2] = x264_mc_copy_w8_lsx,
    [3] = NULL,
    [4] = x264_mc_copy_w16_lsx,
 };
 /*
 * weight_cache_lasx: choose the LASX weight kernel table for w.
 * A denominator below 1 selects the "_noden" kernels, any other value the
 * regular ones. The h argument is not used.
 */
 static void weight_cache_lasx( x264_t *h, x264_weight_t *w )
 {
    w->weightfn = ( w->i_denom < 1 ) ? mc_noden_wtab_lasx : mc_wtab_lasx;
 }
 /*
 * LASX weight kernels installed as pf->weight, pf->offsetadd and
 * pf->offsetsub by x264_mc_init_loongarch. Presumably indexed by
 * (width >> 2) like the other tables in this file - confirm against the
 * generic callers.
 */
 static weight_fn_t mc_weight_wtab_lasx[6] =
 {
    [0] = x264_mc_weight_w4_lasx,
    [1] = x264_mc_weight_w4_lasx,
    [2] = x264_mc_weight_w8_lasx,
    [3] = x264_mc_weight_w16_lasx,
    [4] = x264_mc_weight_w16_lasx,
    [5] = x264_mc_weight_w20_lasx,
 };
 /*
 * LASX two-source averaging kernels, indexed by (width >> 2) in
 * get_ref_lasx/mc_luma_lasx. Slot 0 has no kernel; slot 3 reuses the
 * 16-wide one.
 */
 static void (* const pixel_avg_wtab_lasx[6])( uint8_t *, intptr_t, uint8_t *,
                                               intptr_t, uint8_t *, int ) =
 {
    [0] = NULL,
    [1] = x264_pixel_avg2_w4_lasx,
    [2] = x264_pixel_avg2_w8_lasx,
    [3] = x264_pixel_avg2_w16_lasx,
    [4] = x264_pixel_avg2_w16_lasx,
    [5] = x264_pixel_avg2_w20_lasx,
 };
 /*
 * LASX block-copy kernels, indexed by (width >> 2) in mc_luma_lasx.
 * Only widths 4, 8 and 16 (slots 1, 2 and 4) have a kernel.
 */
 static void (* const mc_copy_wtab_lasx[5])( uint8_t *, intptr_t, uint8_t *,
                                             intptr_t, int ) =
 {
    [0] = NULL,
    [1] = x264_mc_copy_w4_lasx,
    [2] = x264_mc_copy_w8_lasx,
    [3] = NULL,
    [4] = x264_mc_copy_w16_lasx,
 };
 /*
 * get_ref_lsx: produce the luma reference block addressed by the
 * quarter-pel vector (m_vx, m_vy).
 *
 * The low two bits of each vector component form the quarter-pel index; the
 * remaining bits give the whole-sample offset into the planes in p_src.
 * x264_hpel_ref0/x264_hpel_ref1 (defined elsewhere) pick which plane(s) to
 * read for that index.
 *
 * Returns p_dst when the block had to be built (average of two planes
 * and/or weighting), otherwise a pointer straight into the source plane, in
 * which case *p_dst_stride is overwritten with i_src_stride.
 */
 static uint8_t *get_ref_lsx( uint8_t *p_dst, intptr_t *p_dst_stride,
                              uint8_t *p_src[4], intptr_t i_src_stride,
                              int32_t m_vx, int32_t m_vy,
                              int32_t i_width, int32_t i_height,
                              const x264_weight_t *pWeight )
 {
    const int32_t frac_x = m_vx & 3;
    const int32_t frac_y = m_vy & 3;
    const int32_t qpel = ( frac_y << 2 ) + frac_x;
    const int32_t tab_idx = i_width >> 2;
    const int32_t base_off = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
    /* A vertical fraction of 3 starts one row further down. */
    uint8_t *ref_a = p_src[x264_hpel_ref0[qpel]] + base_off +
                     ( frac_y == 3 ) * i_src_stride;

    if( !( qpel & 5 ) )
    {
        /* Single-plane position: no averaging required. */
        if( !pWeight->weightfn )
        {
            *p_dst_stride = i_src_stride;
            return ref_a;
        }
        pWeight->weightfn[tab_idx]( p_dst, *p_dst_stride, ref_a, i_src_stride, pWeight, i_height );
        return p_dst;
    }
    else
    {
        /* A horizontal fraction of 3 starts one sample further right. */
        uint8_t *ref_b = p_src[x264_hpel_ref1[qpel]] + base_off + ( frac_x == 3 );

        pixel_avg_wtab_lsx[tab_idx]( p_dst, *p_dst_stride, ref_a, i_src_stride,
                                     ref_b, i_height );
        /* Weighting, when enabled, is applied in place on the averaged block. */
        if( pWeight->weightfn )
            pWeight->weightfn[tab_idx]( p_dst, *p_dst_stride, p_dst, *p_dst_stride, pWeight, i_height );
        return p_dst;
    }
 }
 /*
 * mc_luma_lsx: write the luma prediction for the quarter-pel vector
 * (m_vx, m_vy) into p_dst.
 *
 * The low two bits of each vector component form the quarter-pel index; the
 * remaining bits give the whole-sample offset into the planes in p_src.
 * Depending on the index the block is either the average of two planes or a
 * single plane; the result is then weighted when pWeight->weightfn is set,
 * or plainly copied when neither averaging nor weighting applies.
 */
 static void mc_luma_lsx( uint8_t *p_dst, intptr_t i_dst_stride,
                          uint8_t *p_src[4], intptr_t i_src_stride,
                          int32_t m_vx, int32_t m_vy,
                          int32_t i_width, int32_t i_height,
                          const x264_weight_t *pWeight )
 {
    const int32_t frac_x = m_vx & 3;
    const int32_t frac_y = m_vy & 3;
    const int32_t qpel = ( frac_y << 2 ) + frac_x;
    const int32_t tab_idx = i_width >> 2;
    const int32_t base_off = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
    /* A vertical fraction of 3 starts one row further down. */
    uint8_t *ref_a = p_src[x264_hpel_ref0[qpel]] + base_off +
                     ( frac_y == 3 ) * i_src_stride;

    if( !( qpel & 5 ) )
    {
        /* Single-plane position: weight straight from the source, or copy. */
        if( pWeight->weightfn )
            pWeight->weightfn[tab_idx]( p_dst, i_dst_stride, ref_a, i_src_stride, pWeight, i_height );
        else
            mc_copy_wtab_lsx[tab_idx]( p_dst, i_dst_stride, ref_a, i_src_stride, i_height );
    }
    else
    {
        /* A horizontal fraction of 3 starts one sample further right. */
        uint8_t *ref_b = p_src[x264_hpel_ref1[qpel]] + base_off + ( frac_x == 3 );

        pixel_avg_wtab_lsx[tab_idx]( p_dst, i_dst_stride, ref_a, i_src_stride,
                                     ref_b, i_height );
        /* Weighting, when enabled, is applied in place on the averaged block. */
        if( pWeight->weightfn )
            pWeight->weightfn[tab_idx]( p_dst, i_dst_stride, p_dst, i_dst_stride, pWeight, i_height );
    }
 }
 /*
 * PLANE_INTERLEAVE and PLANE_COPY_YUYV are helper macros defined outside
 * this file. Presumably they generate the static plane_copy_interleave_lsx
 * and plane_copy_deinterleave_yuyv_lsx wrappers that
 * x264_mc_init_loongarch installs - confirm against their definitions.
 */
 PLANE_INTERLEAVE(lsx)
 PLANE_COPY_YUYV(32, lsx)
 /* LSX chroma motion compensation; only declared here, defined elsewhere. */
 #define x264_mc_chroma_lsx x264_template(mc_chroma_lsx)
 void x264_mc_chroma_lsx( uint8_t *p_dst_u, uint8_t *p_dst_v,
                         intptr_t i_dst_stride,
                         uint8_t *p_src, intptr_t i_src_stride,
                         int32_t m_vx, int32_t m_vy,
                         int32_t i_width, int32_t i_height );
 /*
 * get_ref_lasx: produce the luma reference block addressed by the
 * quarter-pel vector (m_vx, m_vy), using the LASX averaging kernels.
 *
 * The low two bits of each vector component form the quarter-pel index; the
 * remaining bits give the whole-sample offset into the planes in p_src.
 * x264_hpel_ref0/x264_hpel_ref1 (defined elsewhere) pick which plane(s) to
 * read for that index.
 *
 * Returns p_dst when the block had to be built (average of two planes
 * and/or weighting), otherwise a pointer straight into the source plane, in
 * which case *p_dst_stride is overwritten with i_src_stride.
 */
 static uint8_t *get_ref_lasx( uint8_t *p_dst, intptr_t *p_dst_stride,
                               uint8_t *p_src[4], intptr_t i_src_stride,
                               int32_t m_vx, int32_t m_vy,
                               int32_t i_width, int32_t i_height,
                               const x264_weight_t *pWeight )
 {
    const int32_t frac_x = m_vx & 3;
    const int32_t frac_y = m_vy & 3;
    const int32_t qpel = ( frac_y << 2 ) + frac_x;
    const int32_t tab_idx = i_width >> 2;
    const int32_t base_off = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
    /* A vertical fraction of 3 starts one row further down. */
    uint8_t *ref_a = p_src[x264_hpel_ref0[qpel]] + base_off +
                     ( frac_y == 3 ) * i_src_stride;

    if( !( qpel & 5 ) )
    {
        /* Single-plane position: no averaging required. */
        if( !pWeight->weightfn )
        {
            *p_dst_stride = i_src_stride;
            return ref_a;
        }
        pWeight->weightfn[tab_idx]( p_dst, *p_dst_stride, ref_a, i_src_stride, pWeight, i_height );
        return p_dst;
    }
    else
    {
        /* A horizontal fraction of 3 starts one sample further right. */
        uint8_t *ref_b = p_src[x264_hpel_ref1[qpel]] + base_off + ( frac_x == 3 );

        pixel_avg_wtab_lasx[tab_idx]( p_dst, *p_dst_stride, ref_a, i_src_stride,
                                      ref_b, i_height );
        /* Weighting, when enabled, is applied in place on the averaged block. */
        if( pWeight->weightfn )
            pWeight->weightfn[tab_idx]( p_dst, *p_dst_stride, p_dst, *p_dst_stride, pWeight, i_height );
        return p_dst;
    }
 }
 /*
 * mc_luma_lasx: write the luma prediction for the quarter-pel vector
 * (m_vx, m_vy) into p_dst, using the LASX kernels.
 *
 * The low two bits of each vector component form the quarter-pel index; the
 * remaining bits give the whole-sample offset into the planes in p_src.
 * Depending on the index the block is either the average of two planes or a
 * single plane; the result is then weighted when pWeight->weightfn is set,
 * or plainly copied when neither averaging nor weighting applies.
 */
 static void mc_luma_lasx( uint8_t *p_dst, intptr_t i_dst_stride,
                           uint8_t *p_src[4], intptr_t i_src_stride,
                           int32_t m_vx, int32_t m_vy,
                           int32_t i_width, int32_t i_height,
                           const x264_weight_t *pWeight )
 {
    const int32_t frac_x = m_vx & 3;
    const int32_t frac_y = m_vy & 3;
    const int32_t qpel = ( frac_y << 2 ) + frac_x;
    const int32_t tab_idx = i_width >> 2;
    const int32_t base_off = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
    /* A vertical fraction of 3 starts one row further down. */
    uint8_t *ref_a = p_src[x264_hpel_ref0[qpel]] + base_off +
                     ( frac_y == 3 ) * i_src_stride;

    if( !( qpel & 5 ) )
    {
        /* Single-plane position: weight straight from the source, or copy. */
        if( pWeight->weightfn )
            pWeight->weightfn[tab_idx]( p_dst, i_dst_stride, ref_a, i_src_stride, pWeight, i_height );
        else
            mc_copy_wtab_lasx[tab_idx]( p_dst, i_dst_stride, ref_a, i_src_stride, i_height );
    }
    else
    {
        /* A horizontal fraction of 3 starts one sample further right. */
        uint8_t *ref_b = p_src[x264_hpel_ref1[qpel]] + base_off + ( frac_x == 3 );

        pixel_avg_wtab_lasx[tab_idx]( p_dst, i_dst_stride, ref_a, i_src_stride,
                                      ref_b, i_height );
        /* Weighting, when enabled, is applied in place on the averaged block. */
        if( pWeight->weightfn )
            pWeight->weightfn[tab_idx]( p_dst, i_dst_stride, p_dst, i_dst_stride, pWeight, i_height );
    }
 }
 /*
 * PLANE_COPY_YUYV is a helper macro defined outside this file. Presumably
 * it generates the static plane_copy_deinterleave_yuyv_lasx wrapper that
 * x264_mc_init_loongarch installs - confirm against its definition.
 */
 PLANE_COPY_YUYV(64, lasx)
 /* LASX chroma motion compensation; only declared here, defined elsewhere. */
 #define x264_mc_chroma_lasx x264_template(mc_chroma_lasx)
 void x264_mc_chroma_lasx( uint8_t *p_dst_u, uint8_t *p_dst_v,
                          intptr_t i_dst_stride,
                          uint8_t *p_src, intptr_t i_src_stride,
                          int32_t m_vx, int32_t m_vy,
                          int32_t i_width, int32_t i_height );
 #endif // !HIGH_BIT_DEPTH
 /*
 * x264_mc_init_loongarch: install the LoongArch SIMD motion-compensation
 * kernels into pf according to the capability bits in cpu.
 *
 * The LSX entries are installed first; when LASX is also reported, the LASX
 * block then replaces the entries it provides, and every entry it does not
 * assign (for instance avg[PIXEL_16x16], memcpy_aligned, the prefetch and
 * chroma interleave helpers) keeps its previous value.
 * Nothing is installed for high bit depth builds.
 */
 void x264_mc_init_loongarch( int32_t cpu, x264_mc_functions_t *pf  )
 {
 #if !HIGH_BIT_DEPTH
    if( cpu & X264_CPU_LSX )
    {
        /* Prediction entry points. */
        pf->get_ref = get_ref_lsx;
        pf->mc_luma = mc_luma_lsx;
        pf->mc_chroma = x264_mc_chroma_lsx;

        /* Block averaging. */
        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_lsx;
        pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_lsx;
        pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_lsx;
        pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_lsx;
        pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_lsx;
        pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_lsx;
        pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_lsx;
        pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_lsx;
        pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_lsx;

        /* Weighted prediction: one table serves all three slots. */
        pf->weight_cache = weight_cache_lsx;
        pf->weight    = mc_weight_wtab_lsx;
        pf->offsetadd = mc_weight_wtab_lsx;
        pf->offsetsub = mc_weight_wtab_lsx;

        /* Block copies. */
        pf->copy[PIXEL_16x16] = x264_mc_copy_w16_lsx;
        pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_lsx;
        pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_lsx;
        pf->copy_16x16_unaligned = x264_mc_copy_w16_lsx;

        /* Chroma and plane (de)interleaving. */
        pf->store_interleave_chroma = x264_store_interleave_chroma_lsx;
        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_lsx;
        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_lsx;
        pf->plane_copy_interleave = plane_copy_interleave_lsx;
        pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_lsx;
        pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_lsx;

        /* Filters, memory helpers and prefetch. */
        pf->hpel_filter = x264_hpel_filter_lsx;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_lsx;
        pf->memcpy_aligned = x264_memcpy_aligned_lsx;
        pf->memzero_aligned = x264_memzero_aligned_lsx;
        pf->prefetch_fenc_420 = x264_prefetch_fenc_420_lsx;
        pf->prefetch_fenc_422 = x264_prefetch_fenc_422_lsx;
        pf->prefetch_ref = x264_prefetch_ref_lsx;
    }
    if( cpu & X264_CPU_LASX )
    {
        /* Prediction entry points. */
        pf->get_ref = get_ref_lasx;
        pf->mc_luma = mc_luma_lasx;
        pf->mc_chroma = x264_mc_chroma_lasx;

        /* Block averaging (avg[PIXEL_16x16] is not assigned here). */
        pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_lasx;
        pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_lasx;
        pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_lasx;
        pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_lasx;
        pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_lasx;
        pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_lasx;
        pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_lasx;
        pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_lasx;

        /* Weighted prediction: one table serves all three slots. */
        pf->weight_cache = weight_cache_lasx;
        pf->weight    = mc_weight_wtab_lasx;
        pf->offsetadd = mc_weight_wtab_lasx;
        pf->offsetsub = mc_weight_wtab_lasx;

        /* Block copies. */
        pf->copy[PIXEL_16x16] = x264_mc_copy_w16_lasx;
        pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_lasx;
        pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_lasx;
        pf->copy_16x16_unaligned = x264_mc_copy_w16_lasx;

        /* Plane deinterleaving. */
        pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_lasx;
        pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_lasx;

        /* Filters and memory helpers. */
        pf->hpel_filter = x264_hpel_filter_lasx;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_lasx;
        pf->memzero_aligned = x264_memzero_aligned_lasx;
    }
 #endif // !HIGH_BIT_DEPTH
 }
--- a/common/loongarch/mc.h
+++ b/common/loongarch/mc.h
@@ -0,0 +1,196 @@
 /*****************************************************************************
 * mc.h: loongarch motion compensation
 *****************************************************************************
 * Copyright (C) 2023-2025 x264 project
 *
 * Authors: Xiwei Gu <guxiwei-hf@loongson.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_LOONGARCH_MC_H
 #define X264_LOONGARCH_MC_H
 /* Entry point of the LoongArch motion-compensation backend: fills *pf with
 * LoongArch routines according to the capability bits in cpu (the
 * implementation tests flags such as X264_CPU_LASX before overriding each
 * group of function pointers). */
 #define x264_mc_init_loongarch x264_template(mc_init_loongarch)
 void x264_mc_init_loongarch( int cpu, x264_mc_functions_t *pf );
 /* LSX averaging kernels, one per block size named in the symbol.
 * Parameters are unnamed: three pointer/intptr_t pairs followed by an int
 * (presumably dst, src1 and src2 with their strides, then a weight -- confirm
 * against the implementation).
 * NOTE(review): the 16-wide prototypes use `pixel` while the narrower ones use
 * uint8_t; confirm the two are meant to be interchangeable here. */
 #define x264_pixel_avg_16x16_lsx x264_template(pixel_avg_16x16_lsx)
 void x264_pixel_avg_16x16_lsx( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_pixel_avg_16x8_lsx x264_template(pixel_avg_16x8_lsx)
 void x264_pixel_avg_16x8_lsx( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_pixel_avg_8x16_lsx x264_template(pixel_avg_8x16_lsx)
 void x264_pixel_avg_8x16_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_8x8_lsx x264_template(pixel_avg_8x8_lsx)
 void x264_pixel_avg_8x8_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_8x4_lsx x264_template(pixel_avg_8x4_lsx)
 void x264_pixel_avg_8x4_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_4x16_lsx x264_template(pixel_avg_4x16_lsx)
 void x264_pixel_avg_4x16_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_4x8_lsx x264_template(pixel_avg_4x8_lsx)
 void x264_pixel_avg_4x8_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_4x4_lsx x264_template(pixel_avg_4x4_lsx)
 void x264_pixel_avg_4x4_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_4x2_lsx x264_template(pixel_avg_4x2_lsx)
 void x264_pixel_avg_4x2_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 /* LSX two-source averaging kernels by width (w4..w20). Parameters are
 * unnamed: pointer, intptr_t, pointer, intptr_t, pointer, int (presumably
 * dst, dst stride, src1, src stride, src2, height -- confirm). */
 #define x264_pixel_avg2_w4_lsx x264_template(pixel_avg2_w4_lsx)
 void x264_pixel_avg2_w4_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 #define x264_pixel_avg2_w8_lsx x264_template(pixel_avg2_w8_lsx)
 void x264_pixel_avg2_w8_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 #define x264_pixel_avg2_w16_lsx x264_template(pixel_avg2_w16_lsx)
 void x264_pixel_avg2_w16_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 #define x264_pixel_avg2_w20_lsx x264_template(pixel_avg2_w20_lsx)
 void x264_pixel_avg2_w20_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 /* LSX weighted-prediction kernels by width (w20, w16, w8, w4), each with a
 * "_noden" twin. All share one signature: two pixel pointer/intptr_t pairs,
 * a const x264_weight_t pointer and an int (presumably dst, src, the weight
 * description and the height -- confirm).
 * NOTE(review): "_noden" presumably means "no denominator"; verify against
 * the assembly before relying on it. */
 #define x264_mc_weight_w20_lsx x264_template(mc_weight_w20_lsx)
 void x264_mc_weight_w20_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
 #define x264_mc_weight_w20_noden_lsx x264_template(mc_weight_w20_noden_lsx)
 void x264_mc_weight_w20_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
 #define x264_mc_weight_w16_lsx x264_template(mc_weight_w16_lsx)
 void x264_mc_weight_w16_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
 #define x264_mc_weight_w16_noden_lsx x264_template(mc_weight_w16_noden_lsx)
 void x264_mc_weight_w16_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
 #define x264_mc_weight_w8_lsx x264_template(mc_weight_w8_lsx)
 void x264_mc_weight_w8_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
 #define x264_mc_weight_w8_noden_lsx x264_template(mc_weight_w8_noden_lsx)
 void x264_mc_weight_w8_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
 #define x264_mc_weight_w4_lsx x264_template(mc_weight_w4_lsx)
 void x264_mc_weight_w4_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
 #define x264_mc_weight_w4_noden_lsx x264_template(mc_weight_w4_noden_lsx)
 void x264_mc_weight_w4_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
 /* LSX block-copy kernels by width (w16, w8, w4). Parameters are unnamed:
 * pointer, intptr_t, pointer, intptr_t, int (presumably dst, dst stride,
 * src, src stride, height -- confirm). */
 #define x264_mc_copy_w16_lsx x264_template(mc_copy_w16_lsx)
 void x264_mc_copy_w16_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_mc_copy_w8_lsx x264_template(mc_copy_w8_lsx)
 void x264_mc_copy_w8_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_mc_copy_w4_lsx x264_template(mc_copy_w4_lsx)
 void x264_mc_copy_w4_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 /* LSX chroma helpers. The store variant takes one destination (dst, i_dst)
 * and two sources (srcu, srcv); the load variants take one destination and
 * one source (src, i_src). All take a height in rows. */
 #define x264_store_interleave_chroma_lsx x264_template(store_interleave_chroma_lsx)
 void x264_store_interleave_chroma_lsx( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
 #define x264_load_deinterleave_chroma_fenc_lsx x264_template(load_deinterleave_chroma_fenc_lsx)
 void x264_load_deinterleave_chroma_fenc_lsx( pixel *dst, pixel *src, intptr_t i_src, int height );
 #define x264_load_deinterleave_chroma_fdec_lsx x264_template(load_deinterleave_chroma_fdec_lsx)
 void x264_load_deinterleave_chroma_fdec_lsx( pixel *dst, pixel *src, intptr_t i_src, int height );
 /* LSX plane-copy kernels. The interleave core takes two source planes
 * (srcu, srcv) and one destination; the deinterleave takes one source and two
 * destinations (dstu, dstv). Every plane has its own stride argument; w and h
 * are presumably the width and height of the copied region -- confirm.
 * x264_plane_copy_deinterleave_lasx used to be declared here as well; that
 * copy was a verbatim duplicate of the declaration in the LASX group further
 * down in this header, so only that one is kept. */
 #define x264_plane_copy_interleave_core_lsx x264_template(plane_copy_interleave_core_lsx)
 void x264_plane_copy_interleave_core_lsx( pixel *dst,  intptr_t i_dst,
                                          pixel *srcu, intptr_t i_srcu,
                                          pixel *srcv, intptr_t i_srcv, int w, int h );
 #define x264_plane_copy_deinterleave_lsx x264_template(plane_copy_deinterleave_lsx)
 void x264_plane_copy_deinterleave_lsx( pixel *dstu, intptr_t i_dstu,
                                       pixel *dstv, intptr_t i_dstv,
                                       pixel *src,  intptr_t i_src, int w, int h );
 /* LSX prefetch helpers. x264_mc_init_loongarch installs them as
 * pf->prefetch_fenc_420, pf->prefetch_fenc_422 and pf->prefetch_ref. */
 #define x264_prefetch_fenc_420_lsx x264_template(prefetch_fenc_420_lsx)
 void x264_prefetch_fenc_420_lsx( uint8_t *pix_y, intptr_t stride_y,
                                 uint8_t *pix_uv, intptr_t stride_uv,
                                 int32_t mb_x );
 #define x264_prefetch_fenc_422_lsx x264_template(prefetch_fenc_422_lsx)
 void x264_prefetch_fenc_422_lsx( uint8_t *pix_y, intptr_t stride_y,
                                 uint8_t *pix_uv, intptr_t stride_uv,
                                 int32_t mb_x );
 #define x264_prefetch_ref_lsx x264_template(prefetch_ref_lsx)
 void x264_prefetch_ref_lsx( uint8_t *pix, intptr_t stride, int32_t parity );
 /* LSX memory helpers, installed as pf->memcpy_aligned and
 * pf->memzero_aligned. n is presumably a byte count -- confirm. */
 #define x264_memcpy_aligned_lsx x264_template(memcpy_aligned_lsx)
 void *x264_memcpy_aligned_lsx( void *dst, const void *src, size_t n );
 #define x264_memzero_aligned_lsx x264_template(memzero_aligned_lsx)
 void x264_memzero_aligned_lsx( void *p_dst, size_t n );
 /* LSX half-pel filter and low-resolution frame initialisation, installed as
 * pf->hpel_filter and pf->frame_init_lowres_core. Parameters are unnamed
 * here; see the generic x264_mc_functions_t hooks for their meaning. */
 #define x264_hpel_filter_lsx x264_template(hpel_filter_lsx)
 void x264_hpel_filter_lsx( pixel *, pixel *, pixel *, pixel *, intptr_t, int, int, int16_t * );
 #define x264_frame_init_lowres_core_lsx x264_template(frame_init_lowres_core_lsx)
 void x264_frame_init_lowres_core_lsx( uint8_t *, uint8_t *, uint8_t *, uint8_t *,
                                      uint8_t *, intptr_t, intptr_t, int, int );
 /* LASX averaging kernels, one per block size named in the symbol. When
 * X264_CPU_LASX is set, x264_mc_init_loongarch installs them into
 * pf->avg[PIXEL_16x8] .. pf->avg[PIXEL_4x2]. No 16x16 LASX variant is
 * declared in this header. */
 #define x264_pixel_avg_16x8_lasx x264_template(pixel_avg_16x8_lasx)
 void x264_pixel_avg_16x8_lasx( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_pixel_avg_8x16_lasx x264_template(pixel_avg_8x16_lasx)
 void x264_pixel_avg_8x16_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_8x8_lasx x264_template(pixel_avg_8x8_lasx)
 void x264_pixel_avg_8x8_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_8x4_lasx x264_template(pixel_avg_8x4_lasx)
 void x264_pixel_avg_8x4_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_4x16_lasx x264_template(pixel_avg_4x16_lasx)
 void x264_pixel_avg_4x16_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_4x8_lasx x264_template(pixel_avg_4x8_lasx)
 void x264_pixel_avg_4x8_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_4x4_lasx x264_template(pixel_avg_4x4_lasx)
 void x264_pixel_avg_4x4_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_pixel_avg_4x2_lasx x264_template(pixel_avg_4x2_lasx)
 void x264_pixel_avg_4x2_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 /* LASX two-source averaging kernels by width (w4..w20). Parameters are
 * unnamed: pointer, intptr_t, pointer, intptr_t, pointer, int (presumably
 * dst, dst stride, src1, src stride, src2, height -- confirm). */
 #define x264_pixel_avg2_w4_lasx x264_template(pixel_avg2_w4_lasx)
 void x264_pixel_avg2_w4_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 #define x264_pixel_avg2_w8_lasx x264_template(pixel_avg2_w8_lasx)
 void x264_pixel_avg2_w8_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 #define x264_pixel_avg2_w16_lasx x264_template(pixel_avg2_w16_lasx)
 void x264_pixel_avg2_w16_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 #define x264_pixel_avg2_w20_lasx x264_template(pixel_avg2_w20_lasx)
 void x264_pixel_avg2_w20_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 /* LASX weighted-prediction kernels by width (w20, w16, w8, w4), each with a
 * "_noden" twin. All share one signature: two pixel pointer/intptr_t pairs,
 * a const x264_weight_t pointer and an int (presumably dst, src, the weight
 * description and the height -- confirm).
 * NOTE(review): "_noden" presumably means "no denominator"; verify against
 * the assembly before relying on it. */
 #define x264_mc_weight_w20_lasx x264_template(mc_weight_w20_lasx)
 void x264_mc_weight_w20_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
 #define x264_mc_weight_w20_noden_lasx x264_template(mc_weight_w20_noden_lasx)
 void x264_mc_weight_w20_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
 #define x264_mc_weight_w16_lasx x264_template(mc_weight_w16_lasx)
 void x264_mc_weight_w16_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
 #define x264_mc_weight_w16_noden_lasx x264_template(mc_weight_w16_noden_lasx)
 void x264_mc_weight_w16_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
 #define x264_mc_weight_w8_lasx x264_template(mc_weight_w8_lasx)
 void x264_mc_weight_w8_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
 #define x264_mc_weight_w8_noden_lasx x264_template(mc_weight_w8_noden_lasx)
 void x264_mc_weight_w8_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
 #define x264_mc_weight_w4_lasx x264_template(mc_weight_w4_lasx)
 void x264_mc_weight_w4_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
 #define x264_mc_weight_w4_noden_lasx x264_template(mc_weight_w4_noden_lasx)
 void x264_mc_weight_w4_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
 /* LASX block-copy kernels by width. When X264_CPU_LASX is set,
 * x264_mc_init_loongarch uses the w16 kernel for both pf->copy_16x16_unaligned
 * and pf->copy[PIXEL_16x16], the w8 kernel for pf->copy[PIXEL_8x8] and the w4
 * kernel for pf->copy[PIXEL_4x4]. */
 #define x264_mc_copy_w16_lasx x264_template(mc_copy_w16_lasx)
 void x264_mc_copy_w16_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_mc_copy_w8_lasx x264_template(mc_copy_w8_lasx)
 void x264_mc_copy_w8_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 #define x264_mc_copy_w4_lasx x264_template(mc_copy_w4_lasx)
 void x264_mc_copy_w4_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 /* LASX plane-copy kernels. The interleave core takes two source planes
 * (srcu, srcv) and one destination; the deinterleave takes one source and two
 * destinations (dstu, dstv) and is installed as pf->plane_copy_deinterleave
 * when X264_CPU_LASX is set. */
 #define x264_plane_copy_interleave_core_lasx x264_template(plane_copy_interleave_core_lasx)
 void x264_plane_copy_interleave_core_lasx( pixel *dst,  intptr_t i_dst,
                                           pixel *srcu, intptr_t i_srcu,
                                           pixel *srcv, intptr_t i_srcv, int w, int h );
 #define x264_plane_copy_deinterleave_lasx x264_template(plane_copy_deinterleave_lasx)
 void x264_plane_copy_deinterleave_lasx( pixel *dstu, intptr_t i_dstu,
                                        pixel *dstv, intptr_t i_dstv,
                                        pixel *src,  intptr_t i_src, int w, int h );
 /* Remaining LASX kernels. When X264_CPU_LASX is set, x264_mc_init_loongarch
 * installs them as pf->memzero_aligned, pf->hpel_filter and
 * pf->frame_init_lowres_core. Their signatures match the LSX counterparts
 * declared earlier in this header. */
 #define x264_memzero_aligned_lasx x264_template(memzero_aligned_lasx)
 void x264_memzero_aligned_lasx( void *p_dst, size_t n );
 #define x264_hpel_filter_lasx x264_template(hpel_filter_lasx)
 void x264_hpel_filter_lasx( pixel *, pixel *, pixel *, pixel *, intptr_t, int, int, int16_t * );
 #define x264_frame_init_lowres_core_lasx x264_template(frame_init_lowres_core_lasx)
 void x264_frame_init_lowres_core_lasx( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *,
                                       intptr_t, intptr_t, int, int );
 #endif
--- a/common/loongarch/pixel-a.S
+++ b/common/loongarch/pixel-a.S
--- a/common/loongarch/pixel-c.c
+++ b/common/loongarch/pixel-c.c
@@ -0,0 +1,259 @@
 /*****************************************************************************
 * pixel-c.c: loongarch pixel metrics
 *****************************************************************************
 * Copyright (C) 2023-2025 x264 project
 *
 * Authors: Hecai Yuan <yuanhecai@loongson.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "common/common.h"
 #include "pixel.h"
 #include "predict.h"
 #if !HIGH_BIT_DEPTH
 /* Wraps the LSX 8x8 hadamard_ac kernel for a single 8x8 block and rescales
 * its packed 64-bit result (presumably two sums packed in one word, as in the
 * generic C implementation -- confirm). */
 uint64_t x264_pixel_hadamard_ac_8x8_lsx( uint8_t *p_pix, intptr_t i_stride )
 {
    const uint64_t u_raw = x264_hadamard_ac_8x8_lsx( p_pix, i_stride );
    /* Bits 34 and up are moved down to start at bit 32 (bits 32-33 are
     * dropped); the low 32 bits are shifted right by one. */
    const uint64_t u_upper = ( u_raw >> 34 ) << 32;
    const uint32_t u_lower = ( uint32_t ) u_raw >> 1;
    return u_upper + u_lower;
 }
 /* 8x16 variant: adds the raw LSX kernel results of the 8x8 block at p_pix
 * and of the block 8 rows further down, then rescales the packed total. */
 uint64_t x264_pixel_hadamard_ac_8x16_lsx( uint8_t *p_pix, intptr_t i_stride )
 {
    uint8_t *p_bottom = p_pix + 8 * i_stride;
    uint64_t u_acc = x264_hadamard_ac_8x8_lsx( p_pix, i_stride );
    u_acc += x264_hadamard_ac_8x8_lsx( p_bottom, i_stride );
    /* Bits 34+ are re-based at bit 32; the low 32 bits are shifted right by
     * one. */
    const uint64_t u_upper = ( u_acc >> 34 ) << 32;
    const uint32_t u_lower = ( uint32_t ) u_acc >> 1;
    return u_upper + u_lower;
 }
 /* 16x8 variant: adds the raw LSX kernel results of the 8x8 block at p_pix
 * and of the block starting 8 bytes further along the row, then rescales the
 * packed total. */
 uint64_t x264_pixel_hadamard_ac_16x8_lsx( uint8_t *p_pix, intptr_t i_stride )
 {
    uint8_t *p_right = p_pix + 8;
    uint64_t u_acc = x264_hadamard_ac_8x8_lsx( p_pix, i_stride );
    u_acc += x264_hadamard_ac_8x8_lsx( p_right, i_stride );
    /* Bits 34+ are re-based at bit 32; the low 32 bits are shifted right by
     * one. */
    const uint64_t u_upper = ( u_acc >> 34 ) << 32;
    const uint32_t u_lower = ( uint32_t ) u_acc >> 1;
    return u_upper + u_lower;
 }
 /* 16x16 variant: accumulates the raw LSX kernel results of the four 8x8
 * quadrants and rescales the packed total. */
 uint64_t x264_pixel_hadamard_ac_16x16_lsx( uint8_t *p_pix, intptr_t i_stride )
 {
    uint64_t u_acc = 0;
    /* Quadrant order: top-left, top-right, bottom-left, bottom-right. */
    for( int i_row = 0; i_row < 2; i_row++ )
        for( int i_col = 0; i_col < 2; i_col++ )
            u_acc += x264_hadamard_ac_8x8_lsx( p_pix + i_row * 8 * i_stride + i_col * 8,
                                               i_stride );
    /* Bits 34+ are re-based at bit 32; the low 32 bits are shifted right by
     * one. */
    return ( ( u_acc >> 34 ) << 32 ) + ( ( uint32_t ) u_acc >> 1 );
 }
 /* Wraps the LASX 8x8 hadamard_ac kernel for a single 8x8 block and rescales
 * its packed 64-bit result. */
 uint64_t x264_pixel_hadamard_ac_8x8_lasx( uint8_t *p_pix, intptr_t i_stride )
 {
    const uint64_t u_raw = x264_hadamard_ac_8x8_lasx( p_pix, i_stride );
    /* Upper part: bits 34+ moved down to start at bit 32 (bits 32-33 are
     * dropped). Lower part: the low 32 bits shifted right by one. */
    const uint64_t u_upper = ( u_raw >> 34 ) << 32;
    const uint32_t u_lower = ( uint32_t ) u_raw >> 1;
    return u_upper + u_lower;
 }
 /* 8x16 variant: adds the raw LASX kernel results of the 8x8 block at p_pix
 * and of the block 8 rows further down, then rescales the packed total
 * (bits 34+ re-based at bit 32, low 32 bits shifted right by one). */
 uint64_t x264_pixel_hadamard_ac_8x16_lasx( uint8_t *p_pix, intptr_t i_stride )
 {
    uint64_t u_sum;
    u_sum = x264_hadamard_ac_8x8_lasx( p_pix, i_stride );
    /* The 8-row offset is computed with a multiplication, as in the LSX
     * variant: left-shifting a negative intptr_t stride would be undefined
     * behaviour, whereas 8 * i_stride is well defined for either sign. */
    u_sum += x264_hadamard_ac_8x8_lasx( p_pix + 8 * i_stride, i_stride );
    return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 );
 }
 void x264_intra_sa8d_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36],
                                 int32_t p_sad_array[3] )
 {
    ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] );
    x264_predict_8x8_v_lsx( pix, p_edge );
    p_sad_array[0] = x264_pixel_sa8d_8x8_lsx( pix, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );
    x264_predict_8x8_h_lsx( pix, p_edge );
    p_sad_array[1] = x264_pixel_sa8d_8x8_lsx( pix, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );
    x264_predict_8x8_dc_lsx( pix, p_edge );
    p_sad_array[2] = x264_pixel_sa8d_8x8_lsx( pix, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );
 }
 void x264_intra_sa8d_x3_8x8_lasx( uint8_t *p_enc, uint8_t p_edge[36],
                                  int32_t p_sad_array[3] )
 {
    ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] );
    x264_predict_8x8_v_lsx( pix, p_edge );
    p_sad_array[0] = x264_pixel_sa8d_8x8_lasx( pix, FDEC_STRIDE,
                                               p_enc, FENC_STRIDE );
    x264_predict_8x8_h_lasx( pix, p_edge );
    p_sad_array[1] = x264_pixel_sa8d_8x8_lasx( pix, FDEC_STRIDE,
                                               p_enc, FENC_STRIDE );
    x264_predict_8x8_dc_lsx( pix, p_edge );
    p_sad_array[2] = x264_pixel_sa8d_8x8_lasx( pix, FDEC_STRIDE,
                                               p_enc, FENC_STRIDE );
 }
 /* Scores three 4x4 intra predictions against p_enc with
 * x264_pixel_satd_4x4_lsx. The predictors are run directly on p_dec, so p_dec
 * is presumably overwritten and left holding the last (DC) prediction.
 * Results are stored in p_sad_array[0..2] in call order: 4x4_v, 4x4_h,
 * 4x4_dc. */
 void x264_intra_satd_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec,
                                 int32_t p_sad_array[3] )
 {
    int32_t *p_cost = p_sad_array;
    x264_predict_4x4_v_lsx( p_dec );
    *p_cost++ = x264_pixel_satd_4x4_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
    x264_predict_4x4_h_lsx( p_dec );
    *p_cost++ = x264_pixel_satd_4x4_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
    x264_predict_4x4_dc_lsx( p_dec );
    *p_cost = x264_pixel_satd_4x4_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
 }
 /* Scores three 16x16 intra predictions against p_enc with
 * x264_pixel_satd_16x16_lsx. The predictors are run directly on p_dec, so
 * p_dec is presumably overwritten and left holding the last (DC) prediction.
 * Results are stored in p_sad_array[0..2] in call order: 16x16_v, 16x16_h,
 * 16x16_dc. */
 void x264_intra_satd_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec,
                                   int32_t p_sad_array[3] )
 {
    int32_t *p_cost = p_sad_array;
    x264_predict_16x16_v_lsx( p_dec );
    *p_cost++ = x264_pixel_satd_16x16_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
    x264_predict_16x16_h_lsx( p_dec );
    *p_cost++ = x264_pixel_satd_16x16_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
    x264_predict_16x16_dc_lsx( p_dec );
    *p_cost = x264_pixel_satd_16x16_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
 }
 /* LASX flavour of the 16x16 satd x3 helper: the three predictions are still
 * produced by the LSX predictors, only the metric
 * (x264_pixel_satd_16x16_lasx) is the LASX routine. The predictors are run
 * directly on p_dec. Results are stored in p_sad_array[0..2] in call order:
 * 16x16_v, 16x16_h, 16x16_dc. */
 void x264_intra_satd_x3_16x16_lasx( uint8_t *p_enc, uint8_t *p_dec,
                                    int32_t p_sad_array[3] )
 {
    int32_t *p_cost = p_sad_array;
    x264_predict_16x16_v_lsx( p_dec );
    *p_cost++ = x264_pixel_satd_16x16_lasx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
    x264_predict_16x16_h_lsx( p_dec );
    *p_cost++ = x264_pixel_satd_16x16_lasx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
    x264_predict_16x16_dc_lsx( p_dec );
    *p_cost = x264_pixel_satd_16x16_lasx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
 }
 /* Scores three 8x8c intra predictions against p_enc with
 * x264_pixel_satd_8x8_lsx. The predictors are run directly on p_dec.
 * Note the result order differs from the 4x4/16x16 helpers in this file:
 * p_sad_array[0..2] follow the call order 8x8c_dc, 8x8c_h, 8x8c_v. */
 void x264_intra_satd_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec,
                                  int32_t p_sad_array[3] )
 {
    int32_t *p_cost = p_sad_array;
    x264_predict_8x8c_dc_lsx( p_dec );
    *p_cost++ = x264_pixel_satd_8x8_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
    x264_predict_8x8c_h_lsx( p_dec );
    *p_cost++ = x264_pixel_satd_8x8_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
    x264_predict_8x8c_v_lsx( p_dec );
    *p_cost = x264_pixel_satd_8x8_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
 }
 /* Scores three 4x4 intra predictions against p_enc with
 * x264_pixel_sad_4x4_lsx. The predictors are run directly on p_dec, so p_dec
 * is presumably overwritten and left holding the last (DC) prediction.
 * Results are stored in p_sad_array[0..2] in call order: 4x4_v, 4x4_h,
 * 4x4_dc. */
 void x264_intra_sad_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec,
                                int32_t p_sad_array[3] )
 {
    int32_t *p_cost = p_sad_array;
    x264_predict_4x4_v_lsx( p_dec );
    *p_cost++ = x264_pixel_sad_4x4_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
    x264_predict_4x4_h_lsx( p_dec );
    *p_cost++ = x264_pixel_sad_4x4_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
    x264_predict_4x4_dc_lsx( p_dec );
    *p_cost = x264_pixel_sad_4x4_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
 }
 /* Scores three 16x16 intra predictions against p_enc with
 * x264_pixel_sad_16x16_lsx. The predictors are run directly on p_dec, so
 * p_dec is presumably overwritten and left holding the last (DC) prediction.
 * Results are stored in p_sad_array[0..2] in call order: 16x16_v, 16x16_h,
 * 16x16_dc. */
 void x264_intra_sad_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec,
                                  int32_t p_sad_array[3] )
 {
    int32_t *p_cost = p_sad_array;
    x264_predict_16x16_v_lsx( p_dec );
    *p_cost++ = x264_pixel_sad_16x16_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
    x264_predict_16x16_h_lsx( p_dec );
    *p_cost++ = x264_pixel_sad_16x16_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
    x264_predict_16x16_dc_lsx( p_dec );
    *p_cost = x264_pixel_sad_16x16_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
 }
 void x264_intra_sad_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36],
                                int32_t p_sad_array[3] )
 {
    ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] );
    x264_predict_8x8_v_lsx( pix, p_edge );
    p_sad_array[0] = x264_pixel_sad_8x8_lsx( pix, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );
    x264_predict_8x8_h_lsx( pix, p_edge );
    p_sad_array[1] = x264_pixel_sad_8x8_lsx( pix, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );
    x264_predict_8x8_dc_lsx( pix, p_edge );
    p_sad_array[2] = x264_pixel_sad_8x8_lsx( pix, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );
 }
 /* Scores three 8x8c intra predictions against p_enc with
 * x264_pixel_sad_8x8_lsx. The predictors are run directly on p_dec.
 * Note the result order differs from the 4x4/16x16 helpers in this file:
 * p_sad_array[0..2] follow the call order 8x8c_dc, 8x8c_h, 8x8c_v. */
 void x264_intra_sad_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec,
                                 int32_t p_sad_array[3] )
 {
    int32_t *p_cost = p_sad_array;
    x264_predict_8x8c_dc_lsx( p_dec );
    *p_cost++ = x264_pixel_sad_8x8_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
    x264_predict_8x8c_h_lsx( p_dec );
    *p_cost++ = x264_pixel_sad_8x8_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
    x264_predict_8x8c_v_lsx( p_dec );
    *p_cost = x264_pixel_sad_8x8_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE );
 }
 #endif
--- a/common/loongarch/pixel.h
+++ b/common/loongarch/pixel.h
@@ -0,0 +1,335 @@
 /*****************************************************************************
 * pixel.h: loongarch pixel metrics
 *****************************************************************************
 * Copyright (C) 2023-2025 x264 project
 *
 * Authors: Lu Wang <wanglu@loongson.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_LOONGARCH_PIXEL_H
 #define X264_LOONGARCH_PIXEL_H
 /* SATD prototypes, one per block size (WxH is part of the name).
 * LSX covers 4x4, 4x8, 4x16, 8x4, 8x8, 8x16, 16x8 and 16x16; LASX covers the
 * same list except 4x4. Every function takes two pixel pointers, each with
 * its own stride, and returns the metric as int32_t.
 * Each prototype is preceded by a #define that routes the name through
 * x264_template(), which is defined outside this header. */
 #define x264_pixel_satd_4x4_lsx x264_template(pixel_satd_4x4_lsx)
 int32_t x264_pixel_satd_4x4_lsx( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_4x8_lsx x264_template(pixel_satd_4x8_lsx)
 int32_t x264_pixel_satd_4x8_lsx( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_4x16_lsx x264_template(pixel_satd_4x16_lsx)
 int32_t x264_pixel_satd_4x16_lsx( uint8_t *p_pix1, intptr_t i_stride,
                                  uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_8x4_lsx x264_template(pixel_satd_8x4_lsx)
 int32_t x264_pixel_satd_8x4_lsx( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_8x8_lsx x264_template(pixel_satd_8x8_lsx)
 int32_t x264_pixel_satd_8x8_lsx( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_8x16_lsx x264_template(pixel_satd_8x16_lsx)
 int32_t x264_pixel_satd_8x16_lsx( uint8_t *p_pix1, intptr_t i_stride,
                                  uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_16x8_lsx x264_template(pixel_satd_16x8_lsx)
 int32_t x264_pixel_satd_16x8_lsx( uint8_t *p_pix1, intptr_t i_stride,
                                  uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_16x16_lsx x264_template(pixel_satd_16x16_lsx)
 int32_t x264_pixel_satd_16x16_lsx( uint8_t *p_pix1, intptr_t i_stride,
                                   uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_4x8_lasx x264_template(pixel_satd_4x8_lasx)
 int32_t x264_pixel_satd_4x8_lasx( uint8_t *p_pix1, intptr_t i_stride,
                                  uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_4x16_lasx x264_template(pixel_satd_4x16_lasx)
 int32_t x264_pixel_satd_4x16_lasx( uint8_t *p_pix1, intptr_t i_stride,
                                   uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_8x4_lasx x264_template(pixel_satd_8x4_lasx)
 int32_t x264_pixel_satd_8x4_lasx( uint8_t *p_pix1, intptr_t i_stride,
                                  uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_8x8_lasx x264_template(pixel_satd_8x8_lasx)
 int32_t x264_pixel_satd_8x8_lasx( uint8_t *p_pix1, intptr_t i_stride,
                                  uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_8x16_lasx x264_template(pixel_satd_8x16_lasx)
 int32_t x264_pixel_satd_8x16_lasx( uint8_t *p_pix1, intptr_t i_stride,
                                   uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_16x8_lasx x264_template(pixel_satd_16x8_lasx)
 int32_t x264_pixel_satd_16x8_lasx( uint8_t *p_pix1, intptr_t i_stride,
                                   uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_16x16_lasx x264_template(pixel_satd_16x16_lasx)
 int32_t x264_pixel_satd_16x16_lasx( uint8_t *p_pix1, intptr_t i_stride,
                                    uint8_t *p_pix2, intptr_t i_stride2 );
 /* SAD x4 prototypes: one source pointer, four reference pointers that share
 * a single stride (i_ref_stride), and a four-entry output array.
 * LSX sizes: 16x16, 16x8, 8x16, 8x8, 8x4, 4x8, 4x4.
 * LASX sizes: 16x16, 16x8, 8x8, 8x4.
 * No stride is passed for p_src; presumably the source uses a fixed stride
 * known to the implementation -- confirm against the definitions. */
 #define x264_pixel_sad_x4_16x16_lsx x264_template(pixel_sad_x4_16x16_lsx)
 void x264_pixel_sad_x4_16x16_lsx( uint8_t *p_src, uint8_t *p_ref0,
                                  uint8_t *p_ref1, uint8_t *p_ref2,
                                  uint8_t *p_ref3, intptr_t i_ref_stride,
                                  int32_t p_sad_array[4] );
 #define x264_pixel_sad_x4_16x8_lsx x264_template(pixel_sad_x4_16x8_lsx)
 void x264_pixel_sad_x4_16x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 uint8_t *p_ref3, intptr_t i_ref_stride,
                                 int32_t p_sad_array[4] );
 #define x264_pixel_sad_x4_8x16_lsx x264_template(pixel_sad_x4_8x16_lsx)
 void x264_pixel_sad_x4_8x16_lsx( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 uint8_t *p_ref3, intptr_t i_ref_stride,
                                 int32_t p_sad_array[4] );
 #define x264_pixel_sad_x4_8x8_lsx x264_template(pixel_sad_x4_8x8_lsx)
 void x264_pixel_sad_x4_8x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] );
 #define x264_pixel_sad_x4_8x4_lsx x264_template(pixel_sad_x4_8x4_lsx)
 void x264_pixel_sad_x4_8x4_lsx( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] );
 #define x264_pixel_sad_x4_4x8_lsx x264_template(pixel_sad_x4_4x8_lsx)
 void x264_pixel_sad_x4_4x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] );
 #define x264_pixel_sad_x4_16x16_lasx x264_template(pixel_sad_x4_16x16_lasx)
 void x264_pixel_sad_x4_16x16_lasx( uint8_t *p_src, uint8_t *p_ref0,
                                   uint8_t *p_ref1, uint8_t *p_ref2,
                                   uint8_t *p_ref3, intptr_t i_ref_stride,
                                   int32_t p_sad_array[4] );
 #define x264_pixel_sad_x4_16x8_lasx x264_template(pixel_sad_x4_16x8_lasx)
 void x264_pixel_sad_x4_16x8_lasx( uint8_t *p_src, uint8_t *p_ref0,
                                  uint8_t *p_ref1, uint8_t *p_ref2,
                                  uint8_t *p_ref3, intptr_t i_ref_stride,
                                  int32_t p_sad_array[4] );
 #define x264_pixel_sad_x4_8x8_lasx x264_template(pixel_sad_x4_8x8_lasx)
 void x264_pixel_sad_x4_8x8_lasx( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 uint8_t *p_ref3, intptr_t i_ref_stride,
                                 int32_t p_sad_array[4] );
 #define x264_pixel_sad_x4_8x4_lasx x264_template(pixel_sad_x4_8x4_lasx)
 void x264_pixel_sad_x4_8x4_lasx( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 uint8_t *p_ref3, intptr_t i_ref_stride,
                                 int32_t p_sad_array[4] );
 #define x264_pixel_sad_x4_4x4_lsx x264_template(pixel_sad_x4_4x4_lsx)
 void x264_pixel_sad_x4_4x4_lsx( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] );
 /* SAD x3 prototypes: same shape as the x4 family but with three reference
 * pointers and a three-entry output array.
 * LSX sizes: 16x16, 16x8, 8x16, 8x8, 8x4, 4x4, 4x8.
 * LASX sizes: 16x16, 16x8. */
 #define x264_pixel_sad_x3_16x16_lsx x264_template(pixel_sad_x3_16x16_lsx)
 void x264_pixel_sad_x3_16x16_lsx( uint8_t *p_src, uint8_t *p_ref0,
                                  uint8_t *p_ref1, uint8_t *p_ref2,
                                  intptr_t i_ref_stride,
                                  int32_t p_sad_array[3] );
 #define x264_pixel_sad_x3_16x8_lsx x264_template(pixel_sad_x3_16x8_lsx)
 void x264_pixel_sad_x3_16x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 intptr_t i_ref_stride,
                                 int32_t p_sad_array[3] );
 #define x264_pixel_sad_x3_8x16_lsx x264_template(pixel_sad_x3_8x16_lsx)
 void x264_pixel_sad_x3_8x16_lsx( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 intptr_t i_ref_stride,
                                 int32_t p_sad_array[3] );
 #define x264_pixel_sad_x3_8x8_lsx x264_template(pixel_sad_x3_8x8_lsx)
 void x264_pixel_sad_x3_8x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] );
 #define x264_pixel_sad_x3_8x4_lsx x264_template(pixel_sad_x3_8x4_lsx)
 void x264_pixel_sad_x3_8x4_lsx( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] );
 #define x264_pixel_sad_x3_4x4_lsx x264_template(pixel_sad_x3_4x4_lsx)
 void x264_pixel_sad_x3_4x4_lsx( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] );
 #define x264_pixel_sad_x3_4x8_lsx x264_template(pixel_sad_x3_4x8_lsx)
 void x264_pixel_sad_x3_4x8_lsx( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] );
 #define x264_pixel_sad_x3_16x16_lasx x264_template(pixel_sad_x3_16x16_lasx)
 void x264_pixel_sad_x3_16x16_lasx( uint8_t *p_src, uint8_t *p_ref0,
                                  uint8_t *p_ref1, uint8_t *p_ref2,
                                  intptr_t i_ref_stride,
                                  int32_t p_sad_array[3] );
 #define x264_pixel_sad_x3_16x8_lasx x264_template(pixel_sad_x3_16x8_lasx)
 void x264_pixel_sad_x3_16x8_lasx( uint8_t *p_src, uint8_t *p_ref0,
                                  uint8_t *p_ref1, uint8_t *p_ref2,
                                  intptr_t i_ref_stride,
                                  int32_t p_sad_array[3] );
 /* Single-reference SAD prototypes: source pointer + stride, reference
 * pointer + stride, result returned as int32_t.
 * LSX sizes: 16x16, 16x8, 8x16, 8x8, 8x4, 4x16, 4x8, 4x4.
 * LASX declares only the 8x4 size. */
 #define x264_pixel_sad_16x16_lsx x264_template(pixel_sad_16x16_lsx)
 int32_t x264_pixel_sad_16x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
                                  uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_sad_16x8_lsx x264_template(pixel_sad_16x8_lsx)
 int32_t x264_pixel_sad_16x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_sad_8x16_lsx x264_template(pixel_sad_8x16_lsx)
 int32_t x264_pixel_sad_8x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_sad_8x8_lsx x264_template(pixel_sad_8x8_lsx)
 int32_t x264_pixel_sad_8x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_sad_8x4_lsx x264_template(pixel_sad_8x4_lsx)
 int32_t x264_pixel_sad_8x4_lsx( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_sad_4x16_lsx x264_template(pixel_sad_4x16_lsx)
 int32_t x264_pixel_sad_4x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_sad_4x8_lsx x264_template(pixel_sad_4x8_lsx)
 int32_t x264_pixel_sad_4x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_sad_4x4_lsx x264_template(pixel_sad_4x4_lsx)
 int32_t x264_pixel_sad_4x4_lsx( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_sad_8x4_lasx x264_template(pixel_sad_8x4_lasx)
 int32_t x264_pixel_sad_8x4_lasx( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride );
 /* hadamard_ac prototypes: one pixel pointer plus stride in, uint64_t out.
 * Both LSX and LASX declare 8x8, 8x16, 16x8 and 16x16 under the pixel_
 * prefix, plus an un-prefixed x264_hadamard_ac_8x8_* entry.
 * NOTE(review): the un-prefixed 8x8 symbol is presumably a building block
 * for the larger sizes and the uint64_t presumably packs more than one sum;
 * neither is visible from this header -- confirm against the definitions. */
 #define x264_hadamard_ac_8x8_lsx x264_template(hadamard_ac_8x8_lsx)
 uint64_t x264_hadamard_ac_8x8_lsx( uint8_t *p_pix, intptr_t i_stride );
 #define x264_pixel_hadamard_ac_8x8_lsx x264_template(pixel_hadamard_ac_8x8_lsx)
 uint64_t x264_pixel_hadamard_ac_8x8_lsx( uint8_t *p_pix, intptr_t i_stride );
 #define x264_pixel_hadamard_ac_8x16_lsx x264_template(pixel_hadamard_ac_8x16_lsx)
 uint64_t x264_pixel_hadamard_ac_8x16_lsx( uint8_t *p_pix, intptr_t i_stride );
 #define x264_pixel_hadamard_ac_16x8_lsx x264_template(pixel_hadamard_ac_16x8_lsx)
 uint64_t x264_pixel_hadamard_ac_16x8_lsx( uint8_t *p_pix, intptr_t i_stride );
 #define x264_pixel_hadamard_ac_16x16_lsx x264_template(pixel_hadamard_ac_16x16_lsx)
 uint64_t x264_pixel_hadamard_ac_16x16_lsx( uint8_t *p_pix, intptr_t i_stride );
 #define x264_hadamard_ac_8x8_lasx x264_template(hadamard_ac_8x8_lasx)
 uint64_t x264_hadamard_ac_8x8_lasx( uint8_t *p_pix, intptr_t i_stride );
 #define x264_pixel_hadamard_ac_8x8_lasx x264_template(pixel_hadamard_ac_8x8_lasx)
 uint64_t x264_pixel_hadamard_ac_8x8_lasx( uint8_t *p_pix, intptr_t i_stride );
 #define x264_pixel_hadamard_ac_8x16_lasx x264_template(pixel_hadamard_ac_8x16_lasx)
 uint64_t x264_pixel_hadamard_ac_8x16_lasx( uint8_t *p_pix, intptr_t i_stride );
 #define x264_pixel_hadamard_ac_16x8_lasx x264_template(pixel_hadamard_ac_16x8_lasx)
 uint64_t x264_pixel_hadamard_ac_16x8_lasx( uint8_t *p_pix, intptr_t i_stride );
 #define x264_pixel_hadamard_ac_16x16_lasx x264_template(pixel_hadamard_ac_16x16_lasx)
 uint64_t x264_pixel_hadamard_ac_16x16_lasx( uint8_t *p_pix, intptr_t i_stride );
 /* intra_satd_x3 prototypes: take an encode-side and a decode-side pixel
 * pointer and fill a three-entry int32_t array (the parameter keeps the name
 * p_sad_array even though these are the satd variants).
 * LSX sizes: 16x16, 8x8c, 4x4. LASX declares only 16x16. */
 #define x264_intra_satd_x3_16x16_lsx x264_template(intra_satd_x3_16x16_lsx)
 void x264_intra_satd_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec,
                                   int32_t p_sad_array[3] );
 #define x264_intra_satd_x3_8x8c_lsx x264_template(intra_satd_x3_8x8c_lsx)
 void x264_intra_satd_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec,
                                  int32_t p_sad_array[3] );
 #define x264_intra_satd_x3_4x4_lsx x264_template(intra_satd_x3_4x4_lsx)
 void x264_intra_satd_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec,
                                 int32_t p_sad_array[3] );
 #define x264_intra_satd_x3_16x16_lasx x264_template(intra_satd_x3_16x16_lasx)
 void x264_intra_satd_x3_16x16_lasx( uint8_t *p_enc, uint8_t *p_dec,
                                    int32_t p_sad_array[3] );
 /* SSD prototypes: source pointer + stride, reference pointer + stride,
 * result returned as int32_t.
 * LSX sizes: 16x16, 16x8, 8x16, 8x8, 8x4, 4x16, 4x8, 4x4.
 * LASX sizes: 16x16, 16x8, 8x16, 8x8. */
 #define x264_pixel_ssd_16x16_lsx x264_template(pixel_ssd_16x16_lsx)
 int32_t x264_pixel_ssd_16x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
                                  uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_ssd_16x8_lsx x264_template(pixel_ssd_16x8_lsx)
 int32_t x264_pixel_ssd_16x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_ssd_8x16_lsx x264_template(pixel_ssd_8x16_lsx)
 int32_t x264_pixel_ssd_8x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_ssd_8x8_lsx x264_template(pixel_ssd_8x8_lsx)
 int32_t x264_pixel_ssd_8x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_ssd_8x4_lsx x264_template(pixel_ssd_8x4_lsx)
 int32_t x264_pixel_ssd_8x4_lsx( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_ssd_4x16_lsx x264_template(pixel_ssd_4x16_lsx)
 int32_t x264_pixel_ssd_4x16_lsx( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_ssd_4x8_lsx x264_template(pixel_ssd_4x8_lsx)
 int32_t x264_pixel_ssd_4x8_lsx( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_ssd_4x4_lsx x264_template(pixel_ssd_4x4_lsx)
 int32_t x264_pixel_ssd_4x4_lsx( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_ssd_16x16_lasx x264_template(pixel_ssd_16x16_lasx)
 int32_t x264_pixel_ssd_16x16_lasx( uint8_t *p_src, intptr_t i_src_stride,
                                   uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_ssd_16x8_lasx x264_template(pixel_ssd_16x8_lasx)
 int32_t x264_pixel_ssd_16x8_lasx( uint8_t *p_src, intptr_t i_src_stride,
                                  uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_ssd_8x16_lasx x264_template(pixel_ssd_8x16_lasx)
 int32_t x264_pixel_ssd_8x16_lasx( uint8_t *p_src, intptr_t i_src_stride,
                                  uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_ssd_8x8_lasx x264_template(pixel_ssd_8x8_lasx)
 int32_t x264_pixel_ssd_8x8_lasx( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride );
 /* Variance prototypes.
 * var2 (8x16, 8x8; LSX and LASX): two pixel pointers with no stride
 * argument, a two-entry ssd[] array, int32_t return value.
 * var (16x16, 8x16, 8x8; LSX only): one pixel pointer plus stride, uint64_t
 * return value.
 * NOTE(review): var2 presumably relies on fixed strides and writes ssd[];
 * the uint64_t from var presumably packs two quantities -- confirm against
 * the definitions, which are outside this header. */
 #define x264_pixel_var2_8x16_lsx x264_template(pixel_var2_8x16_lsx)
 int32_t x264_pixel_var2_8x16_lsx( uint8_t *p_pix1, uint8_t *p_pix2,
                                   int32_t ssd[2] );
 #define x264_pixel_var2_8x8_lsx x264_template(pixel_var2_8x8_lsx)
 int32_t x264_pixel_var2_8x8_lsx( uint8_t *p_pix1, uint8_t *p_pix2,
                                 int32_t ssd[2] );
 #define x264_pixel_var_16x16_lsx x264_template(pixel_var_16x16_lsx)
 uint64_t x264_pixel_var_16x16_lsx( uint8_t *p_pix, intptr_t i_stride );
 #define x264_pixel_var_8x16_lsx x264_template(pixel_var_8x16_lsx)
 uint64_t x264_pixel_var_8x16_lsx( uint8_t *p_pix, intptr_t i_stride );
 #define x264_pixel_var_8x8_lsx x264_template(pixel_var_8x8_lsx)
 uint64_t x264_pixel_var_8x8_lsx( uint8_t *p_pix, intptr_t i_stride );
 #define x264_pixel_var2_8x16_lasx x264_template(pixel_var2_8x16_lasx)
 int32_t x264_pixel_var2_8x16_lasx( uint8_t *p_pix1, uint8_t *p_pix2,
                                   int32_t ssd[2] );
 #define x264_pixel_var2_8x8_lasx x264_template(pixel_var2_8x8_lasx)
 int32_t x264_pixel_var2_8x8_lasx( uint8_t *p_pix1, uint8_t *p_pix2,
                                  int32_t ssd[2] );
 /* sa8d prototypes.
 * pixel_sa8d (8x8, 16x16; LSX and LASX): two pixel pointers, each with its
 * own stride, int32_t return value.
 * intra_sa8d_x3_8x8 (LSX and LASX): encode-side pixel pointer, a 36-entry
 * edge array and a three-entry output array. */
 #define x264_pixel_sa8d_8x8_lsx x264_template(pixel_sa8d_8x8_lsx)
 int32_t x264_pixel_sa8d_8x8_lsx( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_sa8d_16x16_lsx x264_template(pixel_sa8d_16x16_lsx)
 int32_t x264_pixel_sa8d_16x16_lsx( uint8_t *p_pix1, intptr_t i_stride,
                                   uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_intra_sa8d_x3_8x8_lsx x264_template(intra_sa8d_x3_8x8_lsx)
 void x264_intra_sa8d_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36],
                                 int32_t p_sad_array[3] );
 #define x264_intra_sa8d_x3_8x8_lasx x264_template(intra_sa8d_x3_8x8_lasx)
 void x264_intra_sa8d_x3_8x8_lasx( uint8_t *p_enc, uint8_t p_edge[36],
                                  int32_t p_sad_array[3] );
 #define x264_pixel_sa8d_8x8_lasx x264_template(pixel_sa8d_8x8_lasx)
 int32_t x264_pixel_sa8d_8x8_lasx( uint8_t *p_pix1, intptr_t i_stride,
                                  uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_sa8d_16x16_lasx x264_template(pixel_sa8d_16x16_lasx)
 int32_t x264_pixel_sa8d_16x16_lasx( uint8_t *p_pix1, intptr_t i_stride,
                                    uint8_t *p_pix2, intptr_t i_stride2 );
 /* intra_sad_x3 prototypes (LSX only): each fills a three-entry int32_t
 * array. The 16x16, 8x8c and 4x4 variants take an encode-side and a
 * decode-side pixel pointer; the 8x8 variant takes a 36-entry edge array in
 * place of the decode-side pointer. */
 #define x264_intra_sad_x3_16x16_lsx x264_template(intra_sad_x3_16x16_lsx)
 void x264_intra_sad_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec,
                                  int32_t p_sad_array[3] );
 #define x264_intra_sad_x3_8x8_lsx x264_template(intra_sad_x3_8x8_lsx)
 void x264_intra_sad_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36],
                                int32_t p_sad_array[3] );
 #define x264_intra_sad_x3_8x8c_lsx x264_template(intra_sad_x3_8x8c_lsx)
 void x264_intra_sad_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec,
                                 int32_t p_sad_array[3] );
 #define x264_intra_sad_x3_4x4_lsx x264_template(intra_sad_x3_4x4_lsx)
 void x264_intra_sad_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec,
                                int32_t p_sad_array[3] );
 #endif
--- a/common/loongarch/predict-a.S
+++ b/common/loongarch/predict-a.S
--- a/common/loongarch/predict-c.c
+++ b/common/loongarch/predict-c.c
@@ -0,0 +1,106 @@
 /*****************************************************************************
 * predict-c.c: loongarch intra prediction
 *****************************************************************************
 * Copyright (C) 2023-2025 x264 project
 *
 * Authors: Xiwei Gu <guxiwei-hf@loongson.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "common/common.h"
 #include "predict.h"
 /* Fill the 16x16 intra-prediction table with LoongArch routines.
 *   cpu  bitmask tested against X264_CPU_LSX / X264_CPU_LASX
 *   pf   table indexed by the I_PRED_16x16_* constants; only the entries
 *        assigned below are modified
 * When HIGH_BIT_DEPTH is set the body is compiled out and pf is untouched. */
 void x264_predict_16x16_init_loongarch( int cpu, x264_predict_t pf[7] )
 {
 #if !HIGH_BIT_DEPTH
    if( cpu & X264_CPU_LSX )
    {
        pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_lsx;
        pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_lsx;
        pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_lsx;
        pf[I_PRED_16x16_DC_128]  = x264_predict_16x16_dc_128_lsx;
        pf[I_PRED_16x16_V]       = x264_predict_16x16_v_lsx;
        pf[I_PRED_16x16_H]       = x264_predict_16x16_h_lsx;
        pf[I_PRED_16x16_P]       = x264_predict_16x16_p_lsx;
    }
    /* Tested after LSX so the LASX plane predictor wins when both bits are set. */
    if( cpu & X264_CPU_LASX )
        pf[I_PRED_16x16_P] = x264_predict_16x16_p_lasx;
 #endif
 }
 /* Fill the 8x8 chroma intra-prediction table with the LSX routines.
 *   cpu  bitmask tested against X264_CPU_LSX
 *   pf   table indexed by the I_PRED_CHROMA_* constants
 * Nothing is written when the LSX bit is clear or HIGH_BIT_DEPTH is set. */
 void x264_predict_8x8c_init_loongarch( int cpu, x264_predict_t pf[7] )
 {
 #if !HIGH_BIT_DEPTH
    /* Without LSX there is nothing to install; leave the table as passed in. */
    if( !( cpu & X264_CPU_LSX ) )
        return;
    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_lsx;
    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_lsx;
    pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_lsx;
    pf[I_PRED_CHROMA_DC_128]  = x264_predict_8x8c_dc_128_lsx;
    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_lsx;
    pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_lsx;
    pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_lsx;
 #endif
 }
 /* Fill the 8x8 luma intra-prediction table with LoongArch routines.
 *   cpu             bitmask tested against X264_CPU_LSX / X264_CPU_LASX
 *   pf              table indexed by the I_PRED_8x8_* constants; only the
 *                   entries assigned below are modified
 *   predict_filter  accepted for signature compatibility; never read or
 *                   written by this function
 * When HIGH_BIT_DEPTH is set the body is compiled out and pf is untouched. */
 void x264_predict_8x8_init_loongarch( int cpu, x264_predict8x8_t pf[12],
                                      x264_predict_8x8_filter_t *predict_filter )
 {
 #if !HIGH_BIT_DEPTH
    if( cpu & X264_CPU_LSX )
    {
        pf[I_PRED_8x8_V]       = x264_predict_8x8_v_lsx;
        pf[I_PRED_8x8_H]       = x264_predict_8x8_h_lsx;
        pf[I_PRED_8x8_DC]      = x264_predict_8x8_dc_lsx;
        pf[I_PRED_8x8_DC_TOP]  = x264_predict_8x8_dc_top_lsx;
        pf[I_PRED_8x8_DC_LEFT] = x264_predict_8x8_dc_left_lsx;
        pf[I_PRED_8x8_DC_128]  = x264_predict_8x8_dc_128_lsx;
        pf[I_PRED_8x8_DDL]     = x264_predict_8x8_ddl_lsx;
        pf[I_PRED_8x8_DDR]     = x264_predict_8x8_ddr_lsx;
        pf[I_PRED_8x8_VL]      = x264_predict_8x8_vl_lsx;
        pf[I_PRED_8x8_VR]      = x264_predict_8x8_vr_lsx;
    }
    /* Tested after LSX so these five entries end up LASX when both bits are set. */
    if( cpu & X264_CPU_LASX )
    {
        pf[I_PRED_8x8_H]       = x264_predict_8x8_h_lasx;
        pf[I_PRED_8x8_DDL]     = x264_predict_8x8_ddl_lasx;
        pf[I_PRED_8x8_DDR]     = x264_predict_8x8_ddr_lasx;
        pf[I_PRED_8x8_VL]      = x264_predict_8x8_vl_lasx;
        pf[I_PRED_8x8_VR]      = x264_predict_8x8_vr_lasx;
    }
 #endif
 }
 /* Fill the 4x4 intra-prediction table with the LSX routines.
 *   cpu  bitmask tested against X264_CPU_LSX
 *   pf   table indexed by the I_PRED_4x4_* constants; only the seven entries
 *        assigned below are modified
 * Nothing is written when the LSX bit is clear or HIGH_BIT_DEPTH is set. */
 void x264_predict_4x4_init_loongarch( int cpu, x264_predict_t pf[12] )
 {
 #if !HIGH_BIT_DEPTH
    /* Without LSX there is nothing to install; leave the table as passed in. */
    if( !( cpu & X264_CPU_LSX ) )
        return;
    pf[I_PRED_4x4_DC]      = x264_predict_4x4_dc_lsx;
    pf[I_PRED_4x4_DC_TOP]  = x264_predict_4x4_dc_top_lsx;
    pf[I_PRED_4x4_DC_LEFT] = x264_predict_4x4_dc_left_lsx;
    pf[I_PRED_4x4_DC_128]  = x264_predict_4x4_dc_128_lsx;
    pf[I_PRED_4x4_V]       = x264_predict_4x4_v_lsx;
    pf[I_PRED_4x4_H]       = x264_predict_4x4_h_lsx;
    pf[I_PRED_4x4_DDL]     = x264_predict_4x4_ddl_lsx;
 #endif
 }
--- a/common/loongarch/predict.h
+++ b/common/loongarch/predict.h
@@ -0,0 +1,150 @@
 /*****************************************************************************
 * predict.h: loongarch intra prediction
 *****************************************************************************
 * Copyright (C) 2023-2025 x264 project
 *
 * Authors: Xiwei Gu <guxiwei-hf@loongson.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_LOONGARCH_PREDICT_H
 #define X264_LOONGARCH_PREDICT_H
 /* 8x8 chroma predictor prototypes (LSX only): p, v, h, dc, dc_128, dc_top,
 * dc_left. Each takes a single pixel pointer and returns nothing.
 * NOTE(review): the p/v/h prototypes are declared with uint8_t* while the dc
 * variants (and the rest of this header) use pixel*. The two only agree if
 * pixel is uint8_t in the builds that use these symbols -- confirm. */
 #define x264_predict_8x8c_p_lsx x264_template(predict_8x8c_p_lsx)
 void x264_predict_8x8c_p_lsx(uint8_t *p_src);
 #define x264_predict_8x8c_v_lsx x264_template(predict_8x8c_v_lsx)
 void x264_predict_8x8c_v_lsx(uint8_t *p_src);
 #define x264_predict_8x8c_h_lsx x264_template(predict_8x8c_h_lsx)
 void x264_predict_8x8c_h_lsx(uint8_t *p_src);
 #define x264_predict_8x8c_dc_lsx x264_template(predict_8x8c_dc_lsx)
 void x264_predict_8x8c_dc_lsx(pixel *src);
 #define x264_predict_8x8c_dc_128_lsx x264_template(predict_8x8c_dc_128_lsx)
 void x264_predict_8x8c_dc_128_lsx(pixel *src);
 #define x264_predict_8x8c_dc_top_lsx x264_template(predict_8x8c_dc_top_lsx)
 void x264_predict_8x8c_dc_top_lsx(pixel *src);
 #define x264_predict_8x8c_dc_left_lsx x264_template(predict_8x8c_dc_left_lsx)
 void x264_predict_8x8c_dc_left_lsx(pixel *src);
 /* 16x16 predictor prototypes. LSX declares dc, dc_left, dc_top, dc_128, h,
 * v and p; LASX declares only p. Each takes a single pixel pointer and
 * returns nothing. */
 #define x264_predict_16x16_dc_lsx x264_template(predict_16x16_dc_lsx)
 void x264_predict_16x16_dc_lsx( pixel *src );
 #define x264_predict_16x16_dc_left_lsx x264_template(predict_16x16_dc_left_lsx)
 void x264_predict_16x16_dc_left_lsx( pixel *src );
 #define x264_predict_16x16_dc_top_lsx x264_template(predict_16x16_dc_top_lsx)
 void x264_predict_16x16_dc_top_lsx( pixel *src );
 #define x264_predict_16x16_dc_128_lsx x264_template(predict_16x16_dc_128_lsx)
 void x264_predict_16x16_dc_128_lsx( pixel *src );
 #define x264_predict_16x16_h_lsx x264_template(predict_16x16_h_lsx)
 void x264_predict_16x16_h_lsx( pixel *src );
 #define x264_predict_16x16_v_lsx x264_template(predict_16x16_v_lsx)
 void x264_predict_16x16_v_lsx( pixel *src );
 #define x264_predict_16x16_p_lasx x264_template(predict_16x16_p_lasx)
 void x264_predict_16x16_p_lasx( pixel *src );
 #define x264_predict_16x16_p_lsx x264_template(predict_16x16_p_lsx)
 void x264_predict_16x16_p_lsx( pixel *src );
 /* 8x8 luma predictor prototypes. Unlike the other block sizes these take a
 * second argument, a 36-entry edge array, alongside the pixel pointer.
 * LSX declares v, h, dc, dc_left, dc_top, dc_128, ddl, ddr, vr and vl;
 * LASX declares h, ddl, ddr, vr and vl. */
 #define x264_predict_8x8_v_lsx x264_template(predict_8x8_v_lsx)
 void x264_predict_8x8_v_lsx( pixel *src, pixel edge[36] );
 #define x264_predict_8x8_h_lasx x264_template(predict_8x8_h_lasx)
 void x264_predict_8x8_h_lasx( pixel *src, pixel edge[36] );
 #define x264_predict_8x8_h_lsx x264_template(predict_8x8_h_lsx)
 void x264_predict_8x8_h_lsx( pixel *src, pixel edge[36] );
 #define x264_predict_8x8_dc_lsx x264_template(predict_8x8_dc_lsx)
 void x264_predict_8x8_dc_lsx( pixel *src, pixel edge[36] );
 #define x264_predict_8x8_dc_left_lsx x264_template(predict_8x8_dc_left_lsx)
 void x264_predict_8x8_dc_left_lsx( pixel *src, pixel edge[36] );
 #define x264_predict_8x8_dc_top_lsx x264_template(predict_8x8_dc_top_lsx)
 void x264_predict_8x8_dc_top_lsx( pixel *src, pixel edge[36] );
 #define x264_predict_8x8_dc_128_lsx x264_template(predict_8x8_dc_128_lsx)
 void x264_predict_8x8_dc_128_lsx( pixel *src, pixel edge[36] );
 #define x264_predict_8x8_ddl_lasx x264_template(predict_8x8_ddl_lasx)
 void x264_predict_8x8_ddl_lasx( pixel *src, pixel edge[36] );
 #define x264_predict_8x8_ddl_lsx x264_template(predict_8x8_ddl_lsx)
 void x264_predict_8x8_ddl_lsx( pixel *src, pixel edge[36] );
 #define x264_predict_8x8_ddr_lasx x264_template(predict_8x8_ddr_lasx)
 void x264_predict_8x8_ddr_lasx( pixel *src, pixel edge[36] );
 #define x264_predict_8x8_ddr_lsx x264_template(predict_8x8_ddr_lsx)
 void x264_predict_8x8_ddr_lsx( pixel *src, pixel edge[36] );
 #define x264_predict_8x8_vr_lasx x264_template(predict_8x8_vr_lasx)
 void x264_predict_8x8_vr_lasx( pixel *src, pixel edge[36] );
 #define x264_predict_8x8_vr_lsx x264_template(predict_8x8_vr_lsx)
 void x264_predict_8x8_vr_lsx( pixel *src, pixel edge[36] );
 #define x264_predict_8x8_vl_lasx x264_template(predict_8x8_vl_lasx)
 void x264_predict_8x8_vl_lasx( pixel *src, pixel edge[36] );
 #define x264_predict_8x8_vl_lsx x264_template(predict_8x8_vl_lsx)
 void x264_predict_8x8_vl_lsx( pixel *src, pixel edge[36] );
 /* 4x4 predictor prototypes (LSX only): v, h, dc, ddl, dc_top, dc_left and
 * dc_128. Each takes a single pixel pointer and returns nothing. */
 #define x264_predict_4x4_v_lsx x264_template(predict_4x4_v_lsx)
 void x264_predict_4x4_v_lsx( pixel *p_src );
 #define x264_predict_4x4_h_lsx x264_template(predict_4x4_h_lsx)
 void x264_predict_4x4_h_lsx( pixel *p_src );
 #define x264_predict_4x4_dc_lsx x264_template(predict_4x4_dc_lsx)
 void x264_predict_4x4_dc_lsx( pixel *p_src );
 #define x264_predict_4x4_ddl_lsx x264_template(predict_4x4_ddl_lsx)
 void x264_predict_4x4_ddl_lsx( pixel *p_src );
 #define x264_predict_4x4_dc_top_lsx x264_template(predict_4x4_dc_top_lsx)
 void x264_predict_4x4_dc_top_lsx( pixel *p_src );
 #define x264_predict_4x4_dc_left_lsx x264_template(predict_4x4_dc_left_lsx)
 void x264_predict_4x4_dc_left_lsx( pixel *p_src );
 #define x264_predict_4x4_dc_128_lsx x264_template(predict_4x4_dc_128_lsx)
 void x264_predict_4x4_dc_128_lsx( pixel *p_src );
 /* Table-initialisation entry points, one per predictor family (4x4, 8x8,
 * 8x8c, 16x16). Each takes an int cpu argument and the function-pointer
 * table to fill; the 8x8 variant additionally takes a pointer to an
 * x264_predict_8x8_filter_t. */
 #define x264_predict_4x4_init_loongarch x264_template(predict_4x4_init_loongarch)
 void x264_predict_4x4_init_loongarch( int cpu, x264_predict_t pf[12] );
 #define x264_predict_8x8_init_loongarch x264_template(predict_8x8_init_loongarch)
 void x264_predict_8x8_init_loongarch( int cpu, x264_predict8x8_t pf[12],
                                      x264_predict_8x8_filter_t *predict_filter );
 #define x264_predict_8x8c_init_loongarch x264_template(predict_8x8c_init_loongarch)
 void x264_predict_8x8c_init_loongarch( int cpu, x264_predict_t pf[7] );
 #define x264_predict_16x16_init_loongarch x264_template(predict_16x16_init_loongarch)
 void x264_predict_16x16_init_loongarch( int cpu, x264_predict_t pf[7] );
 #endif
--- a/common/loongarch/quant-a.S
+++ b/common/loongarch/quant-a.S
--- a/common/loongarch/quant.h
+++ b/common/loongarch/quant.h
@@ -0,0 +1,96 @@
 /*****************************************************************************
 * quant.h: loongarch quantization and level-run
 *****************************************************************************
 * Copyright (C) 2023-2025 x264 project
 *
 * Authors: Shiyou Yin <yinshiyou-hf@loongson.cn>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_LOONGARCH_QUANT_H
 #define X264_LOONGARCH_QUANT_H
 /* LSX-suffixed quantization entry points. Each name is #defined through
 * x264_template() right before its prototype so the prototype declares the
 * templated symbol. */
 /* coeff_last*: take a coefficient array and return an int32_t.
 * NOTE(review): by name these return the index of the last non-zero
 * coefficient among the first 64/16/15/8/4 entries -- confirm in quant-a.S. */
 #define x264_coeff_last64_lsx x264_template(coeff_last64_lsx)
 int32_t x264_coeff_last64_lsx( int16_t *p_src );
 #define x264_coeff_last16_lsx x264_template(coeff_last16_lsx)
 int32_t x264_coeff_last16_lsx( int16_t *p_src );
 #define x264_coeff_last15_lsx x264_template(coeff_last15_lsx)
 int32_t x264_coeff_last15_lsx( int16_t *p_src );
 #define x264_coeff_last8_lsx x264_template(coeff_last8_lsx)
 int32_t x264_coeff_last8_lsx( int16_t *p_src );
 #define x264_coeff_last4_lsx x264_template(coeff_last4_lsx)
 int32_t x264_coeff_last4_lsx( int16_t *p_src );
 /* quant_*: operate on a coefficient block with per-coefficient mf/bias arrays
 * (or scalar mf/bias for the DC variants) and return an int32_t. */
 #define x264_quant_4x4_lsx x264_template(quant_4x4_lsx)
 int32_t x264_quant_4x4_lsx( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias );
 #define x264_quant_4x4x4_lsx x264_template(quant_4x4x4_lsx)
 int32_t x264_quant_4x4x4_lsx( int16_t p_dct[4][16],
                                uint16_t pu_mf[16], uint16_t pu_bias[16] );
 #define x264_quant_8x8_lsx x264_template(quant_8x8_lsx)
 int32_t x264_quant_8x8_lsx( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias );
 #define x264_quant_4x4_dc_lsx x264_template(quant_4x4_dc_lsx)
 int32_t x264_quant_4x4_dc_lsx( dctcoef dct[16], int32_t mf, int32_t bias );
 #define x264_quant_2x2_dc_lsx x264_template(quant_2x2_dc_lsx)
 int32_t x264_quant_2x2_dc_lsx( dctcoef dct[4], int32_t mf, int32_t bias );
 /* dequant_*: take the coefficient block, a [6][N] dequant table and a qp. */
 #define x264_dequant_4x4_lsx x264_template(dequant_4x4_lsx)
 void x264_dequant_4x4_lsx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
 #define x264_dequant_8x8_lsx x264_template(dequant_8x8_lsx)
 void x264_dequant_8x8_lsx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
 #define x264_dequant_4x4_dc_lsx x264_template(dequant_4x4_dc_lsx)
 void x264_dequant_4x4_dc_lsx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
 /* decimate_score*: take a coefficient array and return an int score. */
 #define x264_decimate_score15_lsx x264_template(decimate_score15_lsx)
 int x264_decimate_score15_lsx( dctcoef *dct );
 #define x264_decimate_score16_lsx x264_template(decimate_score16_lsx)
 int x264_decimate_score16_lsx( dctcoef *dct );
 #define x264_decimate_score64_lsx x264_template(decimate_score64_lsx)
 int x264_decimate_score64_lsx( dctcoef *dct );
 /* LASX-suffixed counterparts of a subset of the LSX routines; signatures
 * match the corresponding _lsx prototypes declared earlier in this header. */
 #define x264_coeff_last64_lasx x264_template(coeff_last64_lasx)
 int32_t x264_coeff_last64_lasx( int16_t *p_src );
 #define x264_coeff_last16_lasx x264_template(coeff_last16_lasx)
 int32_t x264_coeff_last16_lasx( int16_t *p_src );
 #define x264_coeff_last15_lasx x264_template(coeff_last15_lasx)
 int32_t x264_coeff_last15_lasx( int16_t *p_src );
 #define x264_quant_4x4x4_lasx x264_template(quant_4x4x4_lasx)
 int32_t x264_quant_4x4x4_lasx( int16_t p_dct[4][16],
                                uint16_t pu_mf[16], uint16_t pu_bias[16] );
 #define x264_dequant_4x4_lasx x264_template(dequant_4x4_lasx)
 void x264_dequant_4x4_lasx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
 #define x264_dequant_8x8_lasx x264_template(dequant_8x8_lasx)
 void x264_dequant_8x8_lasx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
 #define x264_dequant_4x4_dc_lasx x264_template(dequant_4x4_dc_lasx)
 void x264_dequant_4x4_dc_lasx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
 /* coeff_level_run*: take a coefficient array and an x264_run_level_t output
 * structure and return an int.
 * NOTE(review): the meaning of the return value is not visible here. */
 #define x264_coeff_level_run16_lasx x264_template(coeff_level_run16_lasx)
 int x264_coeff_level_run16_lasx( dctcoef *, x264_run_level_t * );
 #define x264_coeff_level_run15_lasx x264_template(coeff_level_run15_lasx)
 int x264_coeff_level_run15_lasx( dctcoef *, x264_run_level_t * );
 #define x264_coeff_level_run16_lsx x264_template(coeff_level_run16_lsx)
 int x264_coeff_level_run16_lsx( dctcoef *, x264_run_level_t * );
 #define x264_coeff_level_run15_lsx x264_template(coeff_level_run15_lsx)
 int x264_coeff_level_run15_lsx( dctcoef *, x264_run_level_t * );
 #define x264_coeff_level_run8_lsx x264_template(coeff_level_run8_lsx)
 int x264_coeff_level_run8_lsx( dctcoef *, x264_run_level_t * );
 #endif/* X264_LOONGARCH_QUANT_H */
--- a/common/loongarch/sad-a.S
+++ b/common/loongarch/sad-a.S
--- a/common/macroblock.c
+++ b/common/macroblock.c
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -0,0 +1,463 @@
 /*****************************************************************************
 * macroblock.h: macroblock common functions
 *****************************************************************************
 * Copyright (C) 2005-2025 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_MACROBLOCK_H
 #define X264_MACROBLOCK_H
 /* Bit flags naming macroblock neighbour positions. ALL_NEIGHBORS is the union
 * of the four neighbour bits and deliberately excludes MB_PRIVATE. */
 enum macroblock_position_e
 {
     MB_LEFT     = 1 << 0,
     MB_TOP      = 1 << 1,
     MB_TOPRIGHT = 1 << 2,
     MB_TOPLEFT  = 1 << 3,
     MB_PRIVATE  = 1 << 4,
     ALL_NEIGHBORS = MB_LEFT | MB_TOP | MB_TOPRIGHT | MB_TOPLEFT,
 };
 /* Neighbour flags (macroblock_position_e bits) for each of the 12 intra 4x4
 * prediction modes, in the mode order given by the per-entry comments.
 * NOTE(review): presumably the neighbours each mode needs to be available --
 * confirm against the users of this table. */
 static const uint8_t x264_pred_i4x4_neighbors[12] =
 {
     MB_TOP,                         // I_PRED_4x4_V
     MB_LEFT,                        // I_PRED_4x4_H
     MB_LEFT | MB_TOP,               // I_PRED_4x4_DC
     MB_TOP  | MB_TOPRIGHT,          // I_PRED_4x4_DDL
     MB_LEFT | MB_TOPLEFT | MB_TOP,  // I_PRED_4x4_DDR
     MB_LEFT | MB_TOPLEFT | MB_TOP,  // I_PRED_4x4_VR
     MB_LEFT | MB_TOPLEFT | MB_TOP,  // I_PRED_4x4_HD
     MB_TOP  | MB_TOPRIGHT,          // I_PRED_4x4_VL
     MB_LEFT,                        // I_PRED_4x4_HU
     MB_LEFT,                        // I_PRED_4x4_DC_LEFT
     MB_TOP,                         // I_PRED_4x4_DC_TOP
     0                               // I_PRED_4x4_DC_128
 };
 /* XXX mb_type isn't the one written in the bitstream -> only internal usage */
 /* Classification predicates over enum mb_class_e values. IS_INTRA and IS_SKIP
 * expand their argument more than once, so pass side-effect-free expressions. */
 #define IS_INTRA(type) ( (type) == I_4x4 || (type) == I_8x8 || (type) == I_16x16 || (type) == I_PCM )
 #define IS_SKIP(type)  ( (type) == P_SKIP || (type) == B_SKIP )
 #define IS_DIRECT(type)  ( (type) == B_DIRECT )
 /* Internal macroblock types. Values are consecutive from 0 and are used as
 * indices into the X264_MBTYPE_MAX-sized tables in this header, so the order
 * of the enumerators must not change. */
 enum mb_class_e
 {
     I_4x4 = 0,
     I_8x8,              /*  1 */
     I_16x16,            /*  2 */
     I_PCM,              /*  3 */

     P_L0,               /*  4 */
     P_8x8,              /*  5 */
     P_SKIP,             /*  6 */

     B_DIRECT,           /*  7 */
     B_L0_L0,            /*  8 */
     B_L0_L1,            /*  9 */
     B_L0_BI,            /* 10 */
     B_L1_L0,            /* 11 */
     B_L1_L1,            /* 12 */
     B_L1_BI,            /* 13 */
     B_BI_L0,            /* 14 */
     B_BI_L1,            /* 15 */
     B_BI_BI,            /* 16 */
     B_8x8,              /* 17 */
     B_SKIP,             /* 18 */

     X264_MBTYPE_MAX     /* 19: number of types */
 };
 /* Maps each mb_class_e value to a replacement type: the identity for every
 * entry except I_8x8 (index 1), which maps to I_4x4.
 * NOTE(review): where this substitution is applied is not visible here. */
 static const uint8_t x264_mb_type_fix[X264_MBTYPE_MAX] =
 {
     I_4x4, I_4x4, I_16x16, I_PCM,
     P_L0, P_8x8, P_SKIP,
     B_DIRECT, B_L0_L0, B_L0_L1, B_L0_BI, B_L1_L0, B_L1_L1,
     B_L1_BI, B_BI_L0, B_BI_L1, B_BI_BI, B_8x8, B_SKIP
 };
 /* Per-macroblock-type 2x2 flag table, indexed first by mb_class_e.
 * NOTE(review): judging by the B_Lx_Ly rows (e.g. B_L0_L1 = {{1,0},{0,1}})
 * the inner indices look like [list][partition], with 1 meaning "this list is
 * used by this partition" -- confirm against the callers. Types without a
 * fixed per-partition list assignment (intra, P_8x8, B_DIRECT, B_8x8, B_SKIP)
 * are all zero. */
 static const uint8_t x264_mb_type_list_table[X264_MBTYPE_MAX][2][2] =
 {
     {{0,0},{0,0}}, {{0,0},{0,0}}, {{0,0},{0,0}}, {{0,0},{0,0}}, /* INTRA */
     {{1,1},{0,0}},                                              /* P_L0 */
     {{0,0},{0,0}},                                              /* P_8x8 */
     {{1,1},{0,0}},                                              /* P_SKIP */
     {{0,0},{0,0}},                                              /* B_DIRECT */
     {{1,1},{0,0}}, {{1,0},{0,1}}, {{1,1},{0,1}},                /* B_L0_* */
     {{0,1},{1,0}}, {{0,0},{1,1}}, {{0,1},{1,1}},                /* B_L1_* */
     {{1,1},{1,0}}, {{1,0},{1,1}}, {{1,1},{1,1}},                /* B_BI_* */
     {{0,0},{0,0}},                                              /* B_8x8 */
     {{0,0},{0,0}}                                               /* B_SKIP */
 };
 /* Shape tests for the sub-partition values of enum mb_partition_e: true when
 * `type` is the L0, L1 or BI variant of the named size (IS_SUB8x8 also accepts
 * D_DIRECT_8x8). The parameter is parenthesized at every use so an expression
 * argument (e.g. `a ? b : c` or `x & mask`) is compared as a whole instead of
 * binding to `==` piecewise. The argument is still expanded several times, so
 * it must be free of side effects. */
 #define IS_SUB4x4(type) ( ((type) == D_L0_4x4)||((type) == D_L1_4x4)||((type) == D_BI_4x4) )
 #define IS_SUB4x8(type) ( ((type) == D_L0_4x8)||((type) == D_L1_4x8)||((type) == D_BI_4x8) )
 #define IS_SUB8x4(type) ( ((type) == D_L0_8x4)||((type) == D_L1_8x4)||((type) == D_BI_8x4) )
 #define IS_SUB8x8(type) ( ((type) == D_L0_8x8)||((type) == D_L1_8x8)||((type) == D_BI_8x8)||((type) == D_DIRECT_8x8) )
 /* Partition and sub-partition types. Values are consecutive from 0 and index
 * the 17-entry partition tables in this header, so the enumerator order must
 * not change. */
 enum mb_partition_e
 {
     /* sub partition type for P_8x8 and B_8x8 */
     D_L0_4x4 = 0,
     D_L0_8x4,           /*  1 */
     D_L0_4x8,           /*  2 */
     D_L0_8x8,           /*  3 */

     /* sub partition type for B_8x8 only */
     D_L1_4x4,           /*  4 */
     D_L1_8x4,           /*  5 */
     D_L1_4x8,           /*  6 */
     D_L1_8x8,           /*  7 */

     D_BI_4x4,           /*  8 */
     D_BI_8x4,           /*  9 */
     D_BI_4x8,           /* 10 */
     D_BI_8x8,           /* 11 */
     D_DIRECT_8x8,       /* 12 */

     /* partition */
     D_8x8,              /* 13 */
     D_16x8,             /* 14 */
     D_8x16,             /* 15 */
     D_16x16,            /* 16 */
     X264_PARTTYPE_MAX,  /* 17: number of types */
 };
 /* Per-list usage flags indexed [list][mb_partition_e]: row 0 is set for the
 * D_L0_* and D_BI_* sub-partitions, row 1 for D_L1_* and D_BI_*. D_DIRECT_8x8
 * and the whole-macroblock partition types are 0 in both rows. */
 static const uint8_t x264_mb_partition_listX_table[2][17] =
 {{
     1, 1, 1, 1, /* D_L0_* */
     0, 0, 0, 0, /* D_L1_* */
     1, 1, 1, 1, /* D_BI_* */
     0,          /* D_DIRECT_8x8 */
     0, 0, 0, 0  /* 8x8 .. 16x16 */
 },
 {
     0, 0, 0, 0, /* D_L0_* */
     1, 1, 1, 1, /* D_L1_* */
     1, 1, 1, 1, /* D_BI_* */
     0,          /* D_DIRECT_8x8 */
     0, 0, 0, 0  /* 8x8 .. 16x16 */
 }};
 /* Number of blocks for each mb_partition_e value: for the sub-partition types
 * 4x4 -> 4, 8x4 and 4x8 -> 2, 8x8 -> 1 (blocks per 8x8); for the macroblock
 * partition types 8x8 -> 4, 16x8 and 8x16 -> 2, 16x16 -> 1. */
 static const uint8_t x264_mb_partition_count_table[17] =
 {
     /* sub L0 */
     4, 2, 2, 1,
     /* sub L1 */
     4, 2, 2, 1,
     /* sub BI */
     4, 2, 2, 1,
     /* Direct */
     1,
     /* Partition */
     4, 2, 2, 1
 };
 /* Maps each mb_partition_e value to the PIXEL_* size constant of one block of
 * that partition type; the prediction list (L0/L1/BI) does not affect it. */
 static const uint8_t x264_mb_partition_pixel_table[17] =
 {
     PIXEL_4x4, PIXEL_8x4,  PIXEL_4x8,  PIXEL_8x8,   /* D_L0_* */
     PIXEL_4x4, PIXEL_8x4,  PIXEL_4x8,  PIXEL_8x8,   /* D_L1_* */
     PIXEL_4x4, PIXEL_8x4,  PIXEL_4x8,  PIXEL_8x8,   /* D_BI_* */
     PIXEL_8x8,                                      /* D_DIRECT_8x8 */
     PIXEL_8x8, PIXEL_16x8, PIXEL_8x16, PIXEL_16x16, /* 8x8 .. 16x16 */
 };
 /* zigzags are transposed with respect to the tables in the standard */
 /* 4x4 scan orders: entry k is the coefficient index visited at scan position
 * k. Row 0 is the frame scan, row 1 the field scan. */
 static const uint8_t x264_zigzag_scan4[2][16] =
 {{ // frame
     0,  4,  1,  2,  5,  8, 12,  9,  6,  3,  7, 10, 13, 14, 11, 15
 },
 {  // field
     0,  1,  4,  2,  3,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
 }};
 /* 8x8 scan orders: entry k is the coefficient index visited at scan position
 * k; each row is a permutation of 0..63.
 * NOTE(review): rows are presumably [0] = frame, [1] = field, matching
 * x264_zigzag_scan4 -- the rows are not labelled here. */
 static const uint8_t x264_zigzag_scan8[2][64] =
 {{
     0,  8,  1,  2,  9, 16, 24, 17, 10,  3,  4, 11, 18, 25, 32, 40,
    33, 26, 19, 12,  5,  6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35,
    28, 21, 14,  7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30,
    23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63
 },
 {
     0,  1,  2,  8,  9,  3,  4, 10, 16, 11,  5,  6,  7, 12, 17, 24,
    18, 13, 14, 15, 19, 25, 32, 26, 20, 21, 22, 23, 27, 33, 40, 34,
    28, 29, 30, 31, 35, 41, 48, 42, 36, 37, 38, 39, 43, 49, 50, 44,
    45, 46, 47, 51, 56, 57, 52, 53, 54, 55, 58, 59, 60, 61, 62, 63
 }};
 /* Column (x) of each of the 16 4x4 blocks, in units of 4x4 blocks, indexed by
 * block number. Block numbers walk each 8x8 quadrant before the next one. */
 static const uint8_t block_idx_x[16] =
 {
     0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
 };
 /* Row (y) of each of the 16 4x4 blocks, same indexing as block_idx_x. */
 static const uint8_t block_idx_y[16] =
 {
     0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
 };
 /* Inverse of block_idx_x/block_idx_y: block_idx_xy[x][y] is the block number
 * located at column x, row y. */
 static const uint8_t block_idx_xy[4][4] =
 {
     { 0, 2, 8,  10 },
     { 1, 3, 9,  11 },
     { 4, 6, 12, 14 },
     { 5, 7, 13, 15 }
 };
 /* For block number i: block_idx_x[i] + 4*block_idx_y[i], i.e. the raster
 * position of the block. */
 static const uint8_t block_idx_xy_1d[16] =
 {
     0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
 };
 /* For block number i: block_idx_y[i] + 4*block_idx_x[i], i.e. the transposed
 * raster position of the block. */
 static const uint8_t block_idx_yx_1d[16] =
 {
     0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15
 };
 /* Pixel offset of each 4x4 block inside a buffer with row stride FENC_STRIDE:
 * entry i is 4*block_idx_x[i] + 4*block_idx_y[i]*FENC_STRIDE. Stored as
 * uint8_t, so every entry must fit in 8 bits for the configured stride. */
 static const uint8_t block_idx_xy_fenc[16] =
 {
     0*4 + 0*4*FENC_STRIDE, 1*4 + 0*4*FENC_STRIDE,
     0*4 + 1*4*FENC_STRIDE, 1*4 + 1*4*FENC_STRIDE,
     2*4 + 0*4*FENC_STRIDE, 3*4 + 0*4*FENC_STRIDE,
     2*4 + 1*4*FENC_STRIDE, 3*4 + 1*4*FENC_STRIDE,
     0*4 + 2*4*FENC_STRIDE, 1*4 + 2*4*FENC_STRIDE,
     0*4 + 3*4*FENC_STRIDE, 1*4 + 3*4*FENC_STRIDE,
     2*4 + 2*4*FENC_STRIDE, 3*4 + 2*4*FENC_STRIDE,
     2*4 + 3*4*FENC_STRIDE, 3*4 + 3*4*FENC_STRIDE
 };
 /* Same layout as block_idx_xy_fenc but for a buffer with row stride
 * FDEC_STRIDE; stored as uint16_t. */
 static const uint16_t block_idx_xy_fdec[16] =
 {
     0*4 + 0*4*FDEC_STRIDE, 1*4 + 0*4*FDEC_STRIDE,
     0*4 + 1*4*FDEC_STRIDE, 1*4 + 1*4*FDEC_STRIDE,
     2*4 + 0*4*FDEC_STRIDE, 3*4 + 0*4*FDEC_STRIDE,
     2*4 + 1*4*FDEC_STRIDE, 3*4 + 1*4*FDEC_STRIDE,
     0*4 + 2*4*FDEC_STRIDE, 1*4 + 2*4*FDEC_STRIDE,
     0*4 + 3*4*FDEC_STRIDE, 1*4 + 3*4*FDEC_STRIDE,
     2*4 + 2*4*FDEC_STRIDE, 3*4 + 2*4*FDEC_STRIDE,
     2*4 + 3*4*FDEC_STRIDE, 3*4 + 3*4*FDEC_STRIDE
 };
 /* QP() shifts a table value by QP_BD_OFFSET; it is local to this table and
 * #undef'd right after it. */
 #define QP(qP) ( (qP)+QP_BD_OFFSET )
 /* Chroma QP lookup table of QP_MAX+1+12*2 entries. The 12 leading zero
 * entries and the 12 trailing QP(39) entries pad the table on both sides (the
 * 12*2 in the size); the extra negative rows exist only for BIT_DEPTH > 8.
 * Entries QP(0)..QP(29) are the identity, after which the mapping grows more
 * slowly and saturates at QP(39).
 * NOTE(review): the padding presumably lets callers index with qp plus an
 * offset in [-12,12] without clamping -- confirm at the use sites. */
 static const uint8_t i_chroma_qp_table[QP_MAX+1+12*2] =
 {
          0,      0,      0,      0,      0,      0,
          0,      0,      0,      0,      0,      0,
 #if BIT_DEPTH > 9
    QP(-12),QP(-11),QP(-10), QP(-9), QP(-8), QP(-7),
 #endif
 #if BIT_DEPTH > 8
     QP(-6), QP(-5), QP(-4), QP(-3), QP(-2), QP(-1),
 #endif
      QP(0),  QP(1),  QP(2),  QP(3),  QP(4),  QP(5),
      QP(6),  QP(7),  QP(8),  QP(9), QP(10), QP(11),
     QP(12), QP(13), QP(14), QP(15), QP(16), QP(17),
     QP(18), QP(19), QP(20), QP(21), QP(22), QP(23),
     QP(24), QP(25), QP(26), QP(27), QP(28), QP(29),
     QP(29), QP(30), QP(31), QP(32), QP(32), QP(33),
     QP(34), QP(34), QP(35), QP(35), QP(36), QP(36),
     QP(37), QP(37), QP(37), QP(38), QP(38), QP(38),
     QP(39), QP(39), QP(39), QP(39),
     QP(39), QP(39), QP(39), QP(39), QP(39), QP(39),
     QP(39), QP(39), QP(39), QP(39), QP(39), QP(39),
 };
 #undef QP
 /* Block categories. Values are consecutive from 0; the first six are also the
 * row indices of ctx_cat_plane, so the enumerator order must not change. */
 enum cabac_ctx_block_cat_e
 {
     DCT_LUMA_DC = 0,
     DCT_LUMA_AC,        /*  1 */
     DCT_LUMA_4x4,       /*  2 */
     DCT_CHROMA_DC,      /*  3 */
     DCT_CHROMA_AC,      /*  4 */
     DCT_LUMA_8x8,       /*  5 */

     DCT_CHROMAU_DC,     /*  6 */
     DCT_CHROMAU_AC,     /*  7 */
     DCT_CHROMAU_4x4,    /*  8 */
     DCT_CHROMAU_8x8,    /*  9 */

     DCT_CHROMAV_DC,     /* 10 */
     DCT_CHROMAV_AC,     /* 11 */
     DCT_CHROMAV_4x4,    /* 12 */
     DCT_CHROMAV_8x8,    /* 13 */
 };
 /* Maps a luma block category (row, DCT_LUMA_DC..DCT_LUMA_8x8) and a plane
 * index (column) to the category for that plane: column 0 is the luma
 * category itself, columns 1 and 2 the CHROMAU and CHROMAV equivalents.
 * Rows 3 and 4 (DCT_CHROMA_DC / DCT_CHROMA_AC) have no per-plane variant and
 * are zero-filled. */
 static const uint8_t ctx_cat_plane[6][3] =
 {
     { DCT_LUMA_DC,  DCT_CHROMAU_DC,  DCT_CHROMAV_DC},
     { DCT_LUMA_AC,  DCT_CHROMAU_AC,  DCT_CHROMAV_AC},
     {DCT_LUMA_4x4, DCT_CHROMAU_4x4, DCT_CHROMAV_4x4},
     {0},
     {0},
     {DCT_LUMA_8x8, DCT_CHROMAU_8x8, DCT_CHROMAV_8x8}
 };
 /* Each prototype below is preceded by a #define routing its name through
 * x264_template(), so the prototype declares the templated symbol. */
 /* Per-frame allocation: is allocated per-thread only in frame-threads mode. */
 #define x264_macroblock_cache_allocate x264_template(macroblock_cache_allocate)
 int  x264_macroblock_cache_allocate( x264_t *h );
 #define x264_macroblock_cache_free x264_template(macroblock_cache_free)
 void x264_macroblock_cache_free( x264_t *h );
 /* Per-thread allocation: is allocated per-thread even in sliced-threads mode. */
 #define x264_macroblock_thread_allocate x264_template(macroblock_thread_allocate)
 int  x264_macroblock_thread_allocate( x264_t *h, int b_lookahead );
 #define x264_macroblock_thread_free x264_template(macroblock_thread_free)
 void x264_macroblock_thread_free( x264_t *h, int b_lookahead );
 /* NOTE(review): by name, per-slice and per-thread initialisation of the
 * macroblock state in h -- confirm in macroblock.c. */
 #define x264_macroblock_slice_init x264_template(macroblock_slice_init)
 void x264_macroblock_slice_init( x264_t *h );
 #define x264_macroblock_thread_init x264_template(macroblock_thread_init)
 void x264_macroblock_thread_init( x264_t *h );
 /* Macroblock cache loaders for the macroblock at (mb_x, mb_y), in a
 * progressive and an interlaced variant (per their names).
 * Each #define must come before the prototype of the same name: only then is
 * the prototype expanded by x264_template() and the templated symbol that
 * callers reference actually declared. */
 #define x264_macroblock_cache_load_progressive x264_template(macroblock_cache_load_progressive)
 void x264_macroblock_cache_load_progressive( x264_t *h, int mb_x, int mb_y );
 #define x264_macroblock_cache_load_interlaced x264_template(macroblock_cache_load_interlaced)
 void x264_macroblock_cache_load_interlaced( x264_t *h, int mb_x, int mb_y );
 /* Further macroblock-level entry points; each name is routed through
 * x264_template() before its prototype.
 * NOTE(review): behaviour is implemented in macroblock.c and is not visible
 * here; the names suggest deblock-strength computation, cache write-back,
 * bipred table setup, source-frame prefetch and an 8-pixel column copy. */
 #define x264_macroblock_deblock_strength x264_template(macroblock_deblock_strength)
 void x264_macroblock_deblock_strength( x264_t *h );
 #define x264_macroblock_cache_save x264_template(macroblock_cache_save)
 void x264_macroblock_cache_save( x264_t *h );
 #define x264_macroblock_bipred_init x264_template(macroblock_bipred_init)
 void x264_macroblock_bipred_init( x264_t *h );
 #define x264_prefetch_fenc x264_template(prefetch_fenc)
 void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y );
 #define x264_copy_column8 x264_template(copy_column8)
 void x264_copy_column8( pixel *dst, pixel *src );
 /* x264_mb_predict_mv_16x16:
 *      set mvp with predicted mv for D_16x16 block
 *      h->mb. need only valid values from other blocks */
 #define x264_mb_predict_mv_16x16 x264_template(mb_predict_mv_16x16)
 void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2] );
 /* x264_mb_predict_mv_pskip:
 *      set mvp with predicted mv for P_SKIP
 *      h->mb. need only valid values from other blocks */
 #define x264_mb_predict_mv_pskip x264_template(mb_predict_mv_pskip)
 void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] );
 /* x264_mb_predict_mv:
 *      set mvp with predicted mv for all blocks except SKIP and DIRECT
 *      h->mb. need valid ref/partition/sub of current block to be valid
 *      and valid mv/ref from other blocks. */
 #define x264_mb_predict_mv x264_template(mb_predict_mv)
 void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] );
 /* x264_mb_predict_mv_direct16x16:
 *      set h->mb.cache.mv and h->mb.cache.ref for B_SKIP or B_DIRECT
 *      h->mb. need only valid values from other blocks.
 *      return 1 on success, 0 on failure.
 *      if b_changed != NULL, set it to whether refs or mvs differ from
 *      before this function call. */
 #define x264_mb_predict_mv_direct16x16 x264_template(mb_predict_mv_direct16x16)
 int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed );
 /* x264_mb_predict_mv_ref16x16:
 *      set mvc with D_16x16 prediction.
 *      uses all neighbors, even those that didn't end up using this ref.
 *      h->mb. need only valid values from other blocks */
 #define x264_mb_predict_mv_ref16x16 x264_template(mb_predict_mv_ref16x16)
 void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t (*mvc)[2], int *i_mvc );
 /* x264_mb_mc / x264_mb_mc_8x8:
 *      NOTE(review): by name, motion compensation for the whole current
 *      macroblock and for the single 8x8 block i8 -- confirm in macroblock.c. */
 #define x264_mb_mc x264_template(mb_mc)
 void x264_mb_mc( x264_t *h );
 #define x264_mb_mc_8x8 x264_template(mb_mc_8x8)
 void x264_mb_mc_8x8( x264_t *h, int i8 );
 /* Pack two values into one 32-bit word as 16-bit halves so that `a` comes
 * first in memory order: `a` is the low half on little-endian builds and the
 * high half when WORDS_BIGENDIAN is set. The halves are added without
 * masking, so each argument must already fit in 16 bits. */
 static ALWAYS_INLINE uint32_t pack16to32( uint32_t a, uint32_t b )
 {
 #if WORDS_BIGENDIAN
    const uint32_t lo = b, hi = a;
 #else
    const uint32_t lo = a, hi = b;
 #endif
    return lo + (hi<<16);
 }
 /* Pack two values as adjacent bytes so that `a` comes first in memory order:
 * `a` is the low byte on little-endian builds and the second byte when
 * WORDS_BIGENDIAN is set. No masking is done, so each argument must already
 * fit in 8 bits. */
 static ALWAYS_INLINE uint32_t pack8to16( uint32_t a, uint32_t b )
 {
 #if WORDS_BIGENDIAN
    const uint32_t lo = b, hi = a;
 #else
    const uint32_t lo = a, hi = b;
 #endif
    return lo + (hi<<8);
 }
 /* Pack four values as the four bytes of a 32-bit word so that a, b, c, d
 * appear in that order in memory: `a` is the lowest byte on little-endian
 * builds and the highest when WORDS_BIGENDIAN is set. No masking is done, so
 * each argument must already fit in 8 bits. */
 static ALWAYS_INLINE uint32_t pack8to32( uint32_t a, uint32_t b, uint32_t c, uint32_t d )
 {
 #if WORDS_BIGENDIAN
    const uint32_t b0 = d, b1 = c, b2 = b, b3 = a;
 #else
    const uint32_t b0 = a, b1 = b, b2 = c, b3 = d;
 #endif
    return b0 + (b1<<8) + (b2<<16) + (b3<<24);
 }
 /* Like pack16to32() but for signed inputs: the value placed in the low half
 * is masked to 16 bits, and the one placed in the high half is converted to
 * uint32_t before shifting so its upper bits fall off the top of the word.
 * `a` comes first in memory order on either endianness. */
 static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
 {
 #if WORDS_BIGENDIAN
    const int lo = b, hi = a;
 #else
    const int lo = a, hi = b;
 #endif
    return (lo&0xFFFF) + ((uint32_t)hi<<16);
 }
 /* Pack two 32-bit values into one 64-bit word so that `a` comes first in
 * memory order: `a` is the low half on little-endian builds and the high half
 * when WORDS_BIGENDIAN is set. The high half is widened to 64 bits before the
 * shift. */
 static ALWAYS_INLINE uint64_t pack32to64( uint32_t a, uint32_t b )
 {
 #if WORDS_BIGENDIAN
    const uint32_t lo = b, hi = a;
 #else
    const uint32_t lo = a, hi = b;
 #endif
    return lo + ((uint64_t)hi<<32);
 }
 /* Pixel-packing aliases: pack_pixel_1to2 joins two pixels and pack_pixel_2to4
 * joins two such pairs. With HIGH_BIT_DEPTH they resolve to the 16->32 and
 * 32->64 packers, otherwise to the 8->16 and 16->32 packers. */
 #if HIGH_BIT_DEPTH
 #   define pack_pixel_1to2 pack16to32
 #   define pack_pixel_2to4 pack32to64
 #else
 #   define pack_pixel_1to2 pack8to16
 #   define pack_pixel_2to4 pack16to32
 #endif
 /* Predict the intra 4x4 mode of block idx from two cached neighbour modes.
 * The cached modes at scan8 offsets -1 and -8 (presumably the left and top
 * neighbours in the cache layout) are each passed through
 * x264_mb_pred_mode4x4_fix() and the smaller one is taken; a negative result
 * falls back to I_PRED_4x4_DC. */
 static ALWAYS_INLINE int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
 {
     const int pos    = x264_scan8[idx];
     const int mode_a = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[pos - 1] );
     const int mode_b = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[pos - 8] );
     const int m      = X264_MIN( mode_a, mode_b );

     return m < 0 ? I_PRED_4x4_DC : m;
 }
 /* Predict the non-zero coefficient count of block idx from the cached counts
 * at scan8 offsets -1 and -8 (presumably the left and top neighbours).
 * When the sum is below 0x80 the rounded-up average is used; otherwise the
 * sum is kept. Either way only the low 7 bits are returned.
 * NOTE(review): 0x80 presumably marks an unavailable neighbour -- confirm
 * where the cache is filled. */
 static ALWAYS_INLINE int x264_mb_predict_non_zero_code( x264_t *h, int idx )
 {
     const int pos = x264_scan8[idx];
     const int sum = h->mb.cache.non_zero_count[pos - 1]
                   + h->mb.cache.non_zero_count[pos - 8];
     const int pred = sum < 0x80 ? ( sum + 1 ) >> 1 : sum;

     return pred & 0x7f;
 }
 /* intra and skip are disallowed, p8x8 is conditional. */
 /* Indexed by mb_class_e: 0 for the four intra types, P_SKIP and B_SKIP;
 * 2 for P_8x8 (which x264_mb_transform_8x8_allowed() handles separately
 * without reading this entry); 1 for every other type. */
 static const uint8_t x264_transform_allowed[X264_MBTYPE_MAX] =
 {
     0,0,0,0,1,2,0,1,1,1,1,1,1,1,1,1,1,1,0
 };
 /* x264_mb_transform_8x8_allowed:
 *      returns 0 when the PPS has 8x8 transform disabled.
 *      for P_8x8, returns whether all four sub-partition bytes (read as one
 *      32-bit word) equal D_L0_8x8, i.e. no partition is smaller than 8x8.
 *      for every other type, returns the x264_transform_allowed entry.
 *      judges by partition type only; doesn't check for cbp */
 static ALWAYS_INLINE int x264_mb_transform_8x8_allowed( x264_t *h )
 {
     if( !h->pps->b_transform_8x8_mode )
         return 0;

     if( h->mb.i_type == P_8x8 )
         return M32( h->mb.i_sub_partition ) == D_L0_8x8*0x01010101;

     return x264_transform_allowed[h->mb.i_type];
 }
 #endif
--- a/common/mc.c
+++ b/common/mc.c
@@ -0,0 +1,784 @@
 /*****************************************************************************
 * mc.c: motion compensation
 *****************************************************************************
 * Copyright (C) 2003-2025 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "common.h"
 #if HAVE_MMX
 #include "x86/mc.h"
 #endif
 #if HAVE_ALTIVEC
 #include "ppc/mc.h"
 #endif
 #if HAVE_ARMV6
 #include "arm/mc.h"
 #endif
 #if HAVE_AARCH64
 #include "aarch64/mc.h"
 #endif
 #if HAVE_MSA
 #include "mips/mc.h"
 #endif
 #if HAVE_LSX
 #   include "loongarch/mc.h"
 #endif
 /* Rounded average of two pixel rectangles: dst = (src1 + src2 + 1) >> 1 for
 * an i_width x i_height area. Strides are in pixels and advance each buffer
 * by one row. */
 static inline void pixel_avg( pixel *dst,  intptr_t i_dst_stride,
                               pixel *src1, intptr_t i_src1_stride,
                               pixel *src2, intptr_t i_src2_stride, int i_width, int i_height )
 {
     for( int row = 0; row < i_height; row++ )
     {
         for( int col = 0; col < i_width; col++ )
         {
             const int sum = src1[col] + src2[col] + 1;
             dst[col] = sum >> 1;
         }
         src2 += i_src2_stride;
         src1 += i_src1_stride;
         dst  += i_dst_stride;
     }
 }
 /* Rounded average of two width x height pixel rectangles:
 * dst = (src1 + src2 + 1) >> 1. Strides are in pixels. */
 static inline void pixel_avg_wxh( pixel *dst,  intptr_t i_dst,
                                   pixel *src1, intptr_t i_src1,
                                   pixel *src2, intptr_t i_src2, int width, int height )
 {
     for( int row = 0; row < height; row++, dst += i_dst, src1 += i_src1, src2 += i_src2 )
         for( int col = 0; col < width; col++ )
         {
             const int sum = src1[col] + src2[col] + 1;
             dst[col] = sum >> 1;
         }
 }
 /* Weighted average of two width x height pixel rectangles.
 * Implicit weighted bipred only: assumes log2_denom = 5, offset = 0 and
 * weight1 + weight2 = 64, so the second weight is derived from the first.
 * The sum is rounded (1<<5 added before the shift by 6) and clipped with
 * x264_clip_pixel(). */
 static inline void pixel_avg_weight_wxh( pixel *dst,  intptr_t i_dst,
                                          pixel *src1, intptr_t i_src1,
                                          pixel *src2, intptr_t i_src2, int width, int height, int i_weight1 )
 {
     const int i_weight2 = 64 - i_weight1;

     for( int row = 0; row < height; row++ )
     {
         for( int col = 0; col < width; col++ )
         {
             const int acc = src1[col]*i_weight1 + src2[col]*i_weight2 + (1<<5);
             dst[col] = x264_clip_pixel( acc >> 6 );
         }
         dst  += i_dst;
         src1 += i_src1;
         src2 += i_src2;
     }
 }
 #undef op_scale2
 /* Generates a fixed-size averaging function `name` for a width x height
 * block. pix1 is the destination, pix2 and pix3 the two sources. A weight of
 * 32 (equal weights out of 64) takes the plain rounded-average path; any
 * other weight goes through the weighted path with `weight` applied to pix2. */
 #define PIXEL_AVG_C( name, width, height ) \
 static void name( pixel *pix1, intptr_t i_stride_pix1, \
                   pixel *pix2, intptr_t i_stride_pix2, \
                   pixel *pix3, intptr_t i_stride_pix3, int weight ) \
 { \
     if( weight == 32 ) \
         pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
     else \
         pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, weight ); \
 }
 /* One instantiation per supported block size. */
 PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
 PIXEL_AVG_C( pixel_avg_16x8,  16, 8 )
 PIXEL_AVG_C( pixel_avg_8x16,  8, 16 )
 PIXEL_AVG_C( pixel_avg_8x8,   8, 8 )
 PIXEL_AVG_C( pixel_avg_8x4,   8, 4 )
 PIXEL_AVG_C( pixel_avg_4x16,  4, 16 )
 PIXEL_AVG_C( pixel_avg_4x8,   4, 8 )
 PIXEL_AVG_C( pixel_avg_4x4,   4, 4 )
 PIXEL_AVG_C( pixel_avg_4x2,   4, 2 )
 PIXEL_AVG_C( pixel_avg_2x8,   2, 8 )
 PIXEL_AVG_C( pixel_avg_2x4,   2, 4 )
 PIXEL_AVG_C( pixel_avg_2x2,   2, 2 )
 /* C implementation of the weight-cache hook: it only stores the current
 * weight function table (h->mc.weight) in w->weightfn. */
 static void weight_cache( x264_t *h, x264_weight_t *w )
 {
     w->weightfn = h->mc.weight;
 }
 /* Per-pixel weighting helpers. Both rely on locals named dst, src, scale,
 * denom and offset being in scope at the point of use.
 * opscale: scale, round to nearest at `denom` bits (requires denom >= 1 for
 * the 1<<(denom-1) term), shift, add the offset and clip.
 * opscale_noden: same without the rounding/shift, for denom == 0. */
 #define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * scale + (1<<(denom - 1))) >> denom) + offset )
 #define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * scale + offset )
 /* Apply explicit weighted prediction to an i_width x i_height rectangle:
 * dst = clip( round( src * scale / 2^denom ) + offset ).
 * The offset from `weight` is scaled up by 1 << (BIT_DEPTH-8). When denom is
 * 0 the rounding/shift step is skipped. dst may be the same buffer as src,
 * since each pixel is read before it is written. */
 static void mc_weight( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
                        const x264_weight_t *weight, int i_width, int i_height )
 {
     /* opscale()/opscale_noden() pick these three locals up by name. */
     const int denom  = weight->i_denom;
     const int scale  = weight->i_scale;
     const int offset = weight->i_offset * (1 << (BIT_DEPTH-8));

     for( int row = 0; row < i_height; row++ )
     {
         if( denom >= 1 )
         {
             for( int col = 0; col < i_width; col++ )
                 opscale( col );
         }
         else
         {
             for( int col = 0; col < i_width; col++ )
                 opscale_noden( col );
         }
         dst += i_dst_stride;
         src += i_src_stride;
     }
 }
 /* Generates a fixed-width wrapper `name` around mc_weight(); only the height
 * remains a runtime parameter. */
 #define MC_WEIGHT_C( name, width ) \
 static void name( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, const x264_weight_t *weight, int height ) \
 { \
     mc_weight( dst, i_dst_stride, src, i_src_stride, weight, width, height );\
 }
 /* One wrapper per supported width. */
 MC_WEIGHT_C( mc_weight_w20, 20 )
 MC_WEIGHT_C( mc_weight_w16, 16 )
 MC_WEIGHT_C( mc_weight_w12, 12 )
 MC_WEIGHT_C( mc_weight_w8,   8 )
 MC_WEIGHT_C( mc_weight_w4,   4 )
 MC_WEIGHT_C( mc_weight_w2,   2 )
 /* Table of the C weight functions ordered by increasing width
 * (2, 4, 8, 12, 16, 20).
 * NOTE(review): callers presumably index it by a width-derived value --
 * confirm the index mapping at the use sites. */
 static weight_fn_t mc_weight_wtab[6] =
 {
     mc_weight_w2,
     mc_weight_w4,
     mc_weight_w8,
     mc_weight_w12,
     mc_weight_w16,
     mc_weight_w20,
 };
 /* Copy an i_width x i_height pixel rectangle row by row.
 * Note the argument order: source first, destination second. Strides are in
 * pixels; each row copies i_width * SIZEOF_PIXEL bytes with memcpy, so the
 * rows of src and dst must not overlap. */
 static void mc_copy( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, int i_width, int i_height )
 {
     const size_t row_bytes = i_width * SIZEOF_PIXEL;

     for( int row = 0; row < i_height; row++, src += i_src_stride, dst += i_dst_stride )
         memcpy( dst, src, row_bytes );
 }
 /* 6-tap filter (1, -5, 20, 20, -5, 1) centred between pix[x] and pix[x+d],
 * evaluated along step d. Uses the local `x` of the calling scope. The taps
 * sum to 32. */
 #define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d]))
 /* Compute the three half-pel planes of a width x height area of src:
 *   dsth - filtered along the row (step 1),
 *   dstv - filtered along the column (step stride),
 *   dstc - filtered in both directions.
 * All four planes share the same stride. buf is int16_t scratch holding one
 * row of unrounded column-filter results for columns -2 .. width+2, so it
 * needs at least width+5 entries. src must be readable 2 rows/columns before
 * and 3 after the area, and dstv is written for columns -2 .. width+2. */
 static void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
                          intptr_t stride, int width, int height, int16_t *buf )
 {
     /* Bias applied for BIT_DEPTH > 9 before the intermediate is stored as
     * int16_t (see the comment in the loop); zero otherwise. */
     const int pad = (BIT_DEPTH > 9) ? (-10 * PIXEL_MAX) : 0;
     for( int y = 0; y < height; y++ )
     {
         /* Column filter; also fills buf, which the dstc loop below reads, so
         * this loop must run first. */
         for( int x = -2; x < width+3; x++ )
         {
             int v = TAPFILTER(src,stride);
             dstv[x] = x264_clip_pixel( (v + 16) >> 5 );
             /* transform v for storage in a 16-bit integer */
             buf[x+2] = v + pad;
         }
         /* Row filter over the column-filtered values. The taps sum to 32, so
         * the bias carried in buf totals 32*pad and is removed here; two
         * 5-bit filter gains give the shift by 10 with rounding 512. */
         for( int x = 0; x < width; x++ )
             dstc[x] = x264_clip_pixel( (TAPFILTER(buf+2,1) - 32*pad + 512) >> 10 );
         /* Row filter directly on the source. */
         for( int x = 0; x < width; x++ )
             dsth[x] = x264_clip_pixel( (TAPFILTER(src,1) + 16) >> 5 );
         dsth += stride;
         dstv += stride;
         dstc += stride;
         src += stride;
     }
 }
 /* Luma motion compensation into dst for a quarter-pel motion vector.
 * src[4] holds four planes selected through x264_hpel_ref0/x264_hpel_ref1 by
 * the quarter-pel fraction (presumably the full-pel and three half-pel
 * planes). When either mv component has an odd quarter-pel fraction
 * (qpel_idx & 5), the result is the rounded average of two planes; otherwise
 * one plane is used directly. If weight->weightfn is set, explicit weighting
 * is applied to the result. */
 static void mc_luma( pixel *dst,    intptr_t i_dst_stride,
                      pixel *src[4], intptr_t i_src_stride,
                      int mvx, int mvy,
                      int i_width, int i_height, const x264_weight_t *weight )
 {
     const int frac_x = mvx & 3;
     const int frac_y = mvy & 3;
     const int qpel_idx = (frac_y<<2) + frac_x;
     int offset = (mvy>>2)*i_src_stride + (mvx>>2);
     pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + (frac_y == 3) * i_src_stride;

     if( !(qpel_idx & 5) )
     {
         /* No quarter-pel interpolation: weight or copy straight from src1. */
         if( weight->weightfn )
             mc_weight( dst, i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
         else
             mc_copy( src1, i_src_stride, dst, i_dst_stride, i_width, i_height );
         return;
     }

     /* qpel interpolation needed */
     pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
     pixel_avg( dst, i_dst_stride, src1, i_src_stride,
                src2, i_src_stride, i_width, i_height );
     if( weight->weightfn )
         mc_weight( dst, i_dst_stride, dst, i_dst_stride, weight, i_width, i_height );
 }
 /* Like mc_luma(), but avoids the copy when no processing is needed.
 * Returns a pointer to the motion-compensated block:
 *   - dst, with *i_dst_stride unchanged, when quarter-pel averaging and/or
 *     weighting had to be applied (the result is written into dst);
 *   - a pointer directly into the source plane, with *i_dst_stride set to
 *     i_src_stride, when neither was needed. */
 static pixel *get_ref( pixel *dst,   intptr_t *i_dst_stride,
                        pixel *src[4], intptr_t i_src_stride,
                        int mvx, int mvy,
                        int i_width, int i_height, const x264_weight_t *weight )
 {
     const int frac_x = mvx & 3;
     const int frac_y = mvy & 3;
     const int qpel_idx = (frac_y<<2) + frac_x;
     int offset = (mvy>>2)*i_src_stride + (mvx>>2);
     pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + (frac_y == 3) * i_src_stride;

     if( !(qpel_idx & 5) )
     {
         if( !weight->weightfn )
         {
             /* Nothing to compute: hand back the source plane itself. */
             *i_dst_stride = i_src_stride;
             return src1;
         }
         mc_weight( dst, *i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
         return dst;
     }

     /* qpel interpolation needed */
     pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
     pixel_avg( dst, *i_dst_stride, src1, i_src_stride,
                src2, i_src_stride, i_width, i_height );
     if( weight->weightfn )
         mc_weight( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_width, i_height );
     return dst;
 }
 /* full chroma mc (ie until 1/8 pixel)
 * Bilinear interpolation of an interleaved chroma plane: even source samples
 * go to dstu and odd ones to dstv. The low 3 bits of mvx/mvy are the
 * eighth-pel fraction; the four weights sum to 64, hence the +32 >> 6
 * rounding. dstu and dstv share i_dst_stride. */
 static void mc_chroma( pixel *dstu, pixel *dstv, intptr_t i_dst_stride,
                        pixel *src, intptr_t i_src_stride,
                        int mvx, int mvy,
                        int i_width, int i_height )
 {
     const int d8x = mvx&0x07;
     const int d8y = mvy&0x07;
     /* Weights of the top-left, top-right, bottom-left and bottom-right
     * samples. */
     const int cA = (8-d8x)*(8-d8y);
     const int cB = d8x    *(8-d8y);
     const int cC = (8-d8x)*d8y;
     const int cD = d8x    *d8y;

     src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;

     for( int row = 0; row < i_height; row++ )
     {
         pixel *srcp = src + i_src_stride; /* row below the current one */
         for( int col = 0; col < i_width; col++ )
         {
             dstu[col] = ( cA*src[2*col]  + cB*src[2*col+2] +
                           cC*srcp[2*col] + cD*srcp[2*col+2] + 32 ) >> 6;
             dstv[col] = ( cA*src[2*col+1]  + cB*src[2*col+3] +
                           cC*srcp[2*col+1] + cD*srcp[2*col+3] + 32 ) >> 6;
         }
         dstu += i_dst_stride;
         dstv += i_dst_stride;
         src   = srcp;
     }
 }
 /* Generates mc_copy_w<W>: a fixed-width copy taking (dst, src) in the usual
 * destination-first order and forwarding to mc_copy(), which takes the
 * source first. */
 #define MC_COPY(W) \
 static void mc_copy_w##W( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int i_height ) \
 { \
     mc_copy( src, i_src, dst, i_dst, W, i_height ); \
 }
 MC_COPY( 16 )
 MC_COPY( 8 )
 MC_COPY( 4 )
 /* Copy a rectangle of w x h pixels from src to dst, one memcpy per row.
 * i_dst and i_src are the distances between consecutive rows, in pixels. */
 void x264_plane_copy_c( pixel *dst, intptr_t i_dst,
                         pixel *src, intptr_t i_src, int w, int h )
 {
    for( int rows_left = h; rows_left != 0; rows_left-- )
    {
        memcpy( dst, src, w * SIZEOF_PIXEL );
        src += i_src;
        dst += i_dst;
    }
 }
 /* Copy h rows of w sample pairs from src to dst, exchanging the two samples
 * of every pair (so each row covers 2*w pixels).  Strides are in pixels. */
 void x264_plane_copy_swap_c( pixel *dst, intptr_t i_dst,
                              pixel *src, intptr_t i_src, int w, int h )
 {
    for( int y = 0; y < h; y++ )
    {
        for( int x = 0; x < 2*w; x += 2 )
        {
            dst[x+0] = src[x+1];
            dst[x+1] = src[x+0];
        }
        dst += i_dst;
        src += i_src;
    }
 }
 /* Merge two w x h planes into one plane of 2*w x h pixels: samples from srcu
 * land on even positions of each dst row, samples from srcv on odd positions.
 * All strides are in pixels. */
 void x264_plane_copy_interleave_c( pixel *dst,  intptr_t i_dst,
                                    pixel *srcu, intptr_t i_srcu,
                                    pixel *srcv, intptr_t i_srcv, int w, int h )
 {
    for( int y = 0; y < h; y++ )
    {
        pixel *out = dst;
        for( int x = 0; x < w; x++ )
        {
            *out++ = srcu[x];
            *out++ = srcv[x];
        }
        dst  += i_dst;
        srcu += i_srcu;
        srcv += i_srcv;
    }
 }
 /* Split one interleaved plane into two planes of w x h pixels: even samples
 * of each src row go to dsta, odd samples go to dstb.  Strides are in pixels. */
 void x264_plane_copy_deinterleave_c( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
                                      pixel *src,  intptr_t i_src, int w, int h )
 {
    for( int y = 0; y < h; y++ )
    {
        const pixel *in = src;
        for( int x = 0; x < w; x++, in += 2 )
        {
            dsta[x] = in[0];
            dstb[x] = in[1];
        }
        dsta += i_dsta;
        dstb += i_dstb;
        src  += i_src;
    }
 }
 /* Split a packed source with pw samples per pixel into three planes: the
 * first, second and third sample of every pixel go to dsta, dstb and dstc
 * respectively; any further samples (pw > 3) are skipped. */
 static void plane_copy_deinterleave_rgb_c( pixel *dsta, intptr_t i_dsta,
                                            pixel *dstb, intptr_t i_dstb,
                                            pixel *dstc, intptr_t i_dstc,
                                            pixel *src,  intptr_t i_src, int pw, int w, int h )
 {
    for( int y = 0; y < h; y++ )
    {
        for( int x = 0; x < w; x++ )
        {
            const pixel *px = &src[x*pw];
            dsta[x] = px[0];
            dstb[x] = px[1];
            dstc[x] = px[2];
        }
        dsta += i_dsta;
        dstb += i_dstb;
        dstc += i_dstc;
        src  += i_src;
    }
 }
 #if WORDS_BIGENDIAN
 /* Reverse the byte order of a 32-bit word (used when the host is big-endian). */
 static ALWAYS_INLINE uint32_t v210_endian_fix32( uint32_t x )
 {
    const uint32_t byte0_to_3 = x << 24;
    const uint32_t byte1_to_2 = (x << 8) & 0xff0000;
    const uint32_t byte2_to_1 = (x >> 8) & 0xff00;
    const uint32_t byte3_to_0 = x >> 24;
    /* the four pieces occupy disjoint bits, so OR-ing equals adding them */
    return byte0_to_3 | byte1_to_2 | byte2_to_1 | byte3_to_0;
 }
 #else
 /* No byte swap is needed on this host: the word is used as loaded. */
 #define v210_endian_fix32(x) (x)
 #endif
 /* Unpack v210-style input: every 32-bit source word carries three 10-bit
 * samples (bits 0-9, 10-19, 20-29).  Samples alternate between the dstc and
 * dsty planes, starting with dstc; each loop step consumes two words and
 * emits three samples to each plane.  i_src is counted in 32-bit words. */
 static void plane_copy_deinterleave_v210_c( pixel *dsty, intptr_t i_dsty,
                                             pixel *dstc, intptr_t i_dstc,
                                             uint32_t *src, intptr_t i_src, int w, int h )
 {
    for( int row = 0; row < h; row++, dsty += i_dsty, dstc += i_dstc, src += i_src )
    {
        pixel *y_out = dsty;
        pixel *c_out = dstc;
        uint32_t *in = src;
        for( int n = 0; n < w; n += 3 )
        {
            uint32_t word = v210_endian_fix32( in[0] );
            c_out[0] = word & 0x03FF;
            y_out[0] = (word >> 10) & 0x03FF;
            c_out[1] = (word >> 20) & 0x03FF;
            word = v210_endian_fix32( in[1] );
            y_out[1] = word & 0x03FF;
            c_out[2] = (word >> 10) & 0x03FF;
            y_out[2] = (word >> 20) & 0x03FF;
            in    += 2;
            c_out += 3;
            y_out += 3;
        }
    }
 }
 /* Write `height` rows of 8 sample pairs to dst, taking the even sample of
 * each pair from srcu and the odd one from srcv.  Both sources advance by
 * FDEC_STRIDE per row, dst advances by i_dst. */
 static void store_interleave_chroma( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height )
 {
    for( int y = 0; y < height; y++ )
    {
        pixel *out = dst;
        for( int x = 0; x < 8; x++ )
        {
            *out++ = srcu[x];
            *out++ = srcv[x];
        }
        dst  += i_dst;
        srcu += FDEC_STRIDE;
        srcv += FDEC_STRIDE;
    }
 }
 /* Deinterleave `height` rows of 8 sample pairs from src into a buffer whose
 * rows are FENC_STRIDE apart: even samples start at dst, odd samples start
 * half a row (FENC_STRIDE/2) further on. */
 static void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
 {
    pixel *dst_even = dst;
    pixel *dst_odd  = dst + FENC_STRIDE/2;
    x264_plane_copy_deinterleave_c( dst_even, FENC_STRIDE, dst_odd, FENC_STRIDE,
                                    src, i_src, 8, height );
 }
 /* Deinterleave `height` rows of 8 sample pairs from src into a buffer whose
 * rows are FDEC_STRIDE apart: even samples start at dst, odd samples start
 * half a row (FDEC_STRIDE/2) further on. */
 static void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
 {
    pixel *dst_even = dst;
    pixel *dst_odd  = dst + FDEC_STRIDE/2;
    x264_plane_copy_deinterleave_c( dst_even, FDEC_STRIDE, dst_odd, FDEC_STRIDE,
                                    src, i_src, 8, height );
 }
 /* Do-nothing implementation of the prefetch_fenc_* hooks, installed by
 * x264_mc_init() as the portable default. */
 static void prefetch_fenc_null( pixel *pix_y,  intptr_t stride_y,
                                 pixel *pix_uv, intptr_t stride_uv, int mb_x )
 {
    /* intentionally empty */
 }
 /* Do-nothing implementation of the prefetch_ref hook, installed by
 * x264_mc_init() as the portable default. */
 static void prefetch_ref_null( pixel *pix, intptr_t stride, int parity )
 {
    /* intentionally empty */
 }
 /* Portable memzero_aligned hook: clears n bytes starting at dst.  This C
 * version is a plain memset and places no alignment demand on dst itself. */
 static void memzero_aligned( void * dst, size_t n )
 {
    const int fill_byte = 0;
    memset( dst, fill_byte, n );
 }
 /* Horizontal pass of the integral image for 4-wide windows.
 * For every x in [0, stride-4): sum[x] = (pix[x] + ... + pix[x+3]) plus the
 * value one row above (sum[x-stride]), truncated to 16 bits. */
 static void integral_init4h( uint16_t *sum, pixel *pix, intptr_t stride )
 {
    /* running sum of the 4 pixels starting at the current x */
    int window = 0;
    for( int k = 0; k < 4; k++ )
        window += pix[k];
    for( intptr_t x = 0; x < stride-4; x++ )
    {
        sum[x] = (uint16_t)(window + sum[x-stride]);
        /* slide the window one pixel to the right */
        window += pix[x+4] - pix[x];
    }
 }
 /* Horizontal pass of the integral image for 8-wide windows.
 * For every x in [0, stride-8): sum[x] = (pix[x] + ... + pix[x+7]) plus the
 * value one row above (sum[x-stride]), truncated to 16 bits. */
 static void integral_init8h( uint16_t *sum, pixel *pix, intptr_t stride )
 {
    /* running sum of the 8 pixels starting at the current x */
    int window = 0;
    for( int k = 0; k < 8; k++ )
        window += pix[k];
    for( intptr_t x = 0; x < stride-8; x++ )
    {
        sum[x] = (uint16_t)(window + sum[x-stride]);
        /* slide the window one pixel to the right */
        window += pix[x+8] - pix[x];
    }
 }
 /* Vertical pass for the 4-wide integral data, applied to one row.
 * First sum4[x] receives the difference between the row 4 lines below and
 * this row of sum8.  Then sum8[x] itself is replaced by the difference
 * between the row 8 lines below and this row, taken at x and at x+4 and
 * added together.  All results are truncated to 16 bits. */
 static void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
 {
    const intptr_t rows4 = 4*stride;
    const intptr_t rows8 = 8*stride;
    const intptr_t count = stride-8;
    for( intptr_t x = 0; x < count; x++ )
        sum4[x] = (uint16_t)(sum8[x+rows4] - sum8[x]);
    /* must run left to right: iteration x still needs the original sum8[x+4] */
    for( intptr_t x = 0; x < count; x++ )
        sum8[x] = (uint16_t)(sum8[x+rows8] + sum8[x+rows8+4] - sum8[x] - sum8[x+4]);
 }
 /* Vertical pass for the 8-wide integral data, applied to one row: every
 * sum8[x] with x in [0, stride-8) is replaced by the difference between the
 * value 8 rows below and itself, truncated to 16 bits. */
 static void integral_init8v( uint16_t *sum8, intptr_t stride )
 {
    const intptr_t rows8 = 8*stride;
    const intptr_t count = stride-8;
    for( intptr_t x = 0; x < count; x++ )
        sum8[x] = (uint16_t)(sum8[x+rows8] - sum8[x]);
 }
 /* Build the low-resolution planes of `frame` from its first plane and reset
 * the per-frame cost / motion-vector caches to their "unset" markers. */
 void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
 {
    pixel *luma = frame->plane[0];
    const int stride = frame->i_stride[0];
    const int lines  = frame->i_lines[0];
    const int width  = frame->i_width[0];

    /* Replicate the last column into column `width` of every row, then copy
     * the last row (extra column included) into row `lines`, so that the
     * interpolation of the final row/column needs no special case. */
    for( int y = 0; y < lines; y++ )
    {
        pixel *row = luma + y*stride;
        row[width] = row[width-1];
    }
    memcpy( luma + stride*lines, luma + stride*(lines-1), (width+1) * SIZEOF_PIXEL );

    h->mc.frame_init_lowres_core( luma,
                                  frame->lowres[0], frame->lowres[1],
                                  frame->lowres[2], frame->lowres[3],
                                  stride, frame->i_stride_lowres,
                                  frame->i_width_lowres, frame->i_lines_lowres );
    x264_frame_expand_border_lowres( frame );

    /* Fill the whole cost-estimate table with 0xFF bytes. */
    memset( frame->i_cost_est, -1, sizeof(frame->i_cost_est) );

    /* Only element [0] of each row-SATD entry is reset to -1. */
    for( int b = 0; b < h->param.i_bframe + 2; b++ )
        for( int p = 0; p < h->param.i_bframe + 2; p++ )
            frame->i_row_satds[b][p][0] = -1;

    /* 0x7FFF in the first MV component is written as a marker; presumably it
     * means "motion vectors not searched yet" - confirm against the readers. */
    for( int list = 0; list <= !!h->param.i_bframe; list++ )
        for( int dist = 0; dist <= h->param.i_bframe; dist++ )
            frame->lowres_mvs[list][dist][0][0] = 0x7FFF;
 }
 /* Produce four half-resolution planes of width x height samples from src0.
 * Each output sample averages a 2x2 block of source samples:
 *   dst0 - block at (2x,   2y)
 *   dsth - block at (2x+1, 2y)     (one source column to the right)
 *   dstv - block at (2x,   2y+1)   (one source row down)
 *   dstc - block at (2x+1, 2y+1)   (both)
 * The source therefore has to be readable up to column 2*width and two rows
 * past the last pair of rows. */
 static void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
                                     intptr_t src_stride, intptr_t dst_stride, int width, int height )
 {
    // slower than naive bilinear, but matches asm
 #define FILTER(a,b,c,d) ((((a+b+1)>>1)+((c+d+1)>>1)+1)>>1)
    for( int y = 0; y < height; y++ )
    {
        const pixel *top = src0;
        const pixel *mid = top+src_stride;
        const pixel *bot = mid+src_stride;
        for( int x = 0; x < width; x++ )
        {
            const pixel *t = top + 2*x;
            const pixel *m = mid + 2*x;
            const pixel *b = bot + 2*x;
            dst0[x] = FILTER(t[0], m[0], t[1], m[1]);
            dsth[x] = FILTER(t[1], m[1], t[2], m[2]);
            dstv[x] = FILTER(m[0], b[0], m[1], b[1]);
            dstc[x] = FILTER(m[1], b[1], m[2], b[2]);
        }
        /* two source rows are consumed per output row */
        src0 += src_stride*2;
        dst0 += dst_stride;
        dsth += dst_stride;
        dstv += dst_stride;
        dstc += dst_stride;
    }
 #undef FILTER
 }
 /* Estimate the total amount of influence on future quality that could be had if we
 * were to improve the reference samples used to inter predict any given macroblock.
 *
 * For each of the len entries:
 *   amount = propagate_in + intra_cost * inv_qscale * (*fps_factor)
 *   dst    = min( (int)(amount * (intra_cost - inter_cost) / intra_cost + 0.5), 32767 )
 * where inter_cost is masked with LOWRES_COST_MASK and limited to intra_cost.
 * NOTE(review): an intra cost of 0 would divide by zero here; presumably the
 * callers never produce one - confirm. */
 static void mbtree_propagate_cost( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                    uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
 {
    const float fps = *fps_factor;
    for( int i = 0; i < len; i++ )
    {
        const int intra = intra_costs[i];
        const int inter = X264_MIN(intra_costs[i], inter_costs[i] & LOWRES_COST_MASK);
        /* cost of this block itself, scaled by its quantiser weight */
        const float own_cost = intra * inv_qscales[i];
        const float amount   = propagate_in[i] + own_cost*fps;
        /* share of the block that comes from its references */
        const float numer    = intra - inter;
        const float denom    = intra;
        const int rounded    = (int)(amount * numer / denom + 0.5f);
        dst[i] = X264_MIN(rounded, 32767);
    }
 }
 /* Distribute the propagate amounts of one macroblock row onto the macroblocks
 * of a reference frame that its motion vectors point at.
 *
 *   ref_costs        - per-macroblock accumulator of the reference, indexed
 *                      mbx + mby*h->mb.i_mb_stride; updated with saturation
 *   mvs              - one (x,y) motion vector per macroblock of the row
 *   propagate_amount - amount to hand on for each macroblock of the row
 *   lowres_costs     - bits above LOWRES_COST_SHIFT tell which lists are used
 *   bipred_weight    - 6-bit fixed-point weight applied when both lists are used
 *   mb_y, len        - row index and number of macroblocks in the row
 *   list             - which list (bit index into the used-lists field) to process
 *
 * The vector is split as x>>5 / x&31, i.e. 32 units per macroblock; the amount
 * is shared bilinearly between the (up to) four macroblocks the target overlaps. */
 static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
                                    int16_t *propagate_amount, uint16_t *lowres_costs,
                                    int bipred_weight, int mb_y, int len, int list )
 {
    unsigned stride = h->mb.i_mb_stride;
    unsigned width = h->mb.i_mb_width;
    unsigned height = h->mb.i_mb_height;
    for( int i = 0; i < len; i++ )
    {
        int lists_used = lowres_costs[i]>>LOWRES_COST_SHIFT;
        /* Skip macroblocks that do not reference this list. */
        if( !(lists_used & (1 << list)) )
            continue;
        int listamount = propagate_amount[i];
        /* Apply bipred weighting. */
        if( lists_used == 3 )
            listamount = (listamount * bipred_weight + 32) >> 6;
        /* Early termination for simple case of mv0. */
        if( !M32( mvs[i] ) )
        {
            MC_CLIP_ADD( ref_costs[mb_y*stride + i], listamount );
            continue;
        }
        int x = mvs[i][0];
        int y = mvs[i][1];
        /* Top-left target macroblock; negative positions wrap to huge unsigned
         * values and are rejected by the range checks below. */
        unsigned mbx = (unsigned)((x>>5)+i);
        unsigned mby = (unsigned)((y>>5)+mb_y);
        unsigned idx0 = mbx + mby * stride;
        unsigned idx2 = idx0 + stride;
        /* Fractional position inside the target macroblock (0..31). */
        x &= 31;
        y &= 31;
        /* Bilinear weights of the four overlapped macroblocks (sum = 1024):
         * idx0 = top-left, idx1 = top-right, idx2 = bottom-left, idx3 = bottom-right. */
        int idx0weight = (32-y)*(32-x);
        int idx1weight = (32-y)*x;
        int idx2weight = y*(32-x);
        int idx3weight = y*x;
        /* Scale by the amount and drop the 10 fractional bits with rounding. */
        idx0weight = (idx0weight * listamount + 512) >> 10;
        idx1weight = (idx1weight * listamount + 512) >> 10;
        idx2weight = (idx2weight * listamount + 512) >> 10;
        idx3weight = (idx3weight * listamount + 512) >> 10;
        /* Fast path: all four targets lie inside the frame. */
        if( mbx < width-1 && mby < height-1 )
        {
            MC_CLIP_ADD( ref_costs[idx0+0], idx0weight );
            MC_CLIP_ADD( ref_costs[idx0+1], idx1weight );
            MC_CLIP_ADD( ref_costs[idx2+0], idx2weight );
            MC_CLIP_ADD( ref_costs[idx2+1], idx3weight );
        }
        else
        {
            /* Note: this takes advantage of unsigned representation to
             * catch negative mbx/mby. */
            if( mby < height )
            {
                if( mbx < width )
                    MC_CLIP_ADD( ref_costs[idx0+0], idx0weight );
                if( mbx+1 < width )
                    MC_CLIP_ADD( ref_costs[idx0+1], idx1weight );
            }
            if( mby+1 < height )
            {
                if( mbx < width )
                    MC_CLIP_ADD( ref_costs[idx2+0], idx2weight );
                if( mbx+1 < width )
                    MC_CLIP_ADD( ref_costs[idx2+1], idx3weight );
            }
        }
    }
 }
 /* Conversion between float and Q8.8 fixed point (big-endian) for storage */
 /* Pack `count` floats into 16-bit Q8.8 values: each input is multiplied by 256,
 * truncated toward zero and passed through endian_fix16() before being stored.
 * Values whose scaled magnitude does not fit in int16_t saturate to
 * -32768 / 32767 instead of relying on an out-of-range float-to-integer
 * conversion, which C leaves undefined.  In-range inputs give exactly the
 * same result as a direct (int16_t) cast. */
 static void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
 {
    for( int i = 0; i < count; i++ )
    {
        float scaled = src[i] * 256.0f;
        int fixed;
        if( scaled >= 32767.0f )
            fixed = 32767;
        else if( scaled > -32768.0f )
            fixed = (int)scaled; /* in range: plain truncation toward zero */
        else
            fixed = -32768;      /* too small; NaN also ends up here since it fails both tests */
        dst[i] = endian_fix16( (int16_t)fixed );
    }
 }
 /* Unpack `count` stored Q8.8 values back to float: each 16-bit word goes
 * through endian_fix16(), is reinterpreted as signed and divided by 256. */
 static void mbtree_fix8_unpack( float *dst, uint16_t *src, int count )
 {
    const float q8_to_float = 1.0f/256.0f;
    for( int i = 0; i < count; i++ )
    {
        int16_t fixed = (int16_t)endian_fix16( src[i] );
        dst[i] = fixed * q8_to_float;
    }
 }
 /* Fill the motion-compensation function table *pf.
 * Every entry is first pointed at the portable C implementation from this
 * file; afterwards each architecture-specific initializer that was compiled
 * in (HAVE_*) is given the chance to overwrite entries based on `cpu`.
 * If cpu_independent is non-zero, the two mbtree propagate entries are reset
 * to the C versions at the very end, whatever the initializers installed.
 * Note that pf->prefetch_fenc is not assigned here. */
 void x264_mc_init( uint32_t cpu, x264_mc_functions_t *pf, int cpu_independent )
 {
    /* Core luma/chroma motion compensation. */
    pf->mc_luma   = mc_luma;
    pf->get_ref   = get_ref;
    pf->mc_chroma = mc_chroma;
    /* Averaging functions, one per partition size. */
    pf->avg[PIXEL_16x16]= pixel_avg_16x16;
    pf->avg[PIXEL_16x8] = pixel_avg_16x8;
    pf->avg[PIXEL_8x16] = pixel_avg_8x16;
    pf->avg[PIXEL_8x8]  = pixel_avg_8x8;
    pf->avg[PIXEL_8x4]  = pixel_avg_8x4;
    pf->avg[PIXEL_4x16] = pixel_avg_4x16;
    pf->avg[PIXEL_4x8]  = pixel_avg_4x8;
    pf->avg[PIXEL_4x4]  = pixel_avg_4x4;
    pf->avg[PIXEL_4x2]  = pixel_avg_4x2;
    pf->avg[PIXEL_2x8]  = pixel_avg_2x8;
    pf->avg[PIXEL_2x4]  = pixel_avg_2x4;
    pf->avg[PIXEL_2x2]  = pixel_avg_2x2;
    /* The C build uses the same table for weight, offsetadd and offsetsub. */
    pf->weight    = mc_weight_wtab;
    pf->offsetadd = mc_weight_wtab;
    pf->offsetsub = mc_weight_wtab;
    pf->weight_cache = weight_cache;
    /* Block copies: only the 16x16, 8x8 and 4x4 slots are populated. */
    pf->copy_16x16_unaligned = mc_copy_w16;
    pf->copy[PIXEL_16x16] = mc_copy_w16;
    pf->copy[PIXEL_8x8]   = mc_copy_w8;
    pf->copy[PIXEL_4x4]   = mc_copy_w4;
    /* Chroma (de)interleaving between frame planes and the macroblock caches. */
    pf->store_interleave_chroma       = store_interleave_chroma;
    pf->load_deinterleave_chroma_fenc = load_deinterleave_chroma_fenc;
    pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec;
    /* Whole-plane copies; the yuyv variant shares the generic deinterleave. */
    pf->plane_copy = x264_plane_copy_c;
    pf->plane_copy_swap = x264_plane_copy_swap_c;
    pf->plane_copy_interleave = x264_plane_copy_interleave_c;
    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c;
    pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_c;
    pf->plane_copy_deinterleave_rgb = plane_copy_deinterleave_rgb_c;
    pf->plane_copy_deinterleave_v210 = plane_copy_deinterleave_v210_c;
    pf->hpel_filter = hpel_filter;
    /* Prefetching is a no-op in the portable build. */
    pf->prefetch_fenc_400 = prefetch_fenc_null;
    pf->prefetch_fenc_420 = prefetch_fenc_null;
    pf->prefetch_fenc_422 = prefetch_fenc_null;
    pf->prefetch_ref  = prefetch_ref_null;
    pf->memcpy_aligned = memcpy;
    pf->memzero_aligned = memzero_aligned;
    /* Lowres generation and integral-image helpers. */
    pf->frame_init_lowres_core = frame_init_lowres_core;
    pf->integral_init4h = integral_init4h;
    pf->integral_init8h = integral_init8h;
    pf->integral_init4v = integral_init4v;
    pf->integral_init8v = integral_init8v;
    /* Macroblock-tree helpers. */
    pf->mbtree_propagate_cost = mbtree_propagate_cost;
    pf->mbtree_propagate_list = mbtree_propagate_list;
    pf->mbtree_fix8_pack      = mbtree_fix8_pack;
    pf->mbtree_fix8_unpack    = mbtree_fix8_unpack;
    /* Architecture-specific overrides.  The AltiVec and MSA initializers are
     * only called when the matching cpu flag is set; the others receive the
     * cpu mask and are called unconditionally. */
 #if HAVE_MMX
    x264_mc_init_mmx( cpu, pf );
 #endif
 #if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
        x264_mc_init_altivec( pf );
 #endif
 #if HAVE_ARMV6
    x264_mc_init_arm( cpu, pf );
 #endif
 #if HAVE_AARCH64
    x264_mc_init_aarch64( cpu, pf );
 #endif
 #if HAVE_MSA
    if( cpu&X264_CPU_MSA )
        x264_mc_init_mips( cpu, pf );
 #endif
 #if HAVE_LSX
    x264_mc_init_loongarch( cpu, pf );
 #endif
    /* Force the C mbtree propagate functions when requested; presumably so
     * that results do not vary with the CPU the encoder runs on - confirm. */
    if( cpu_independent )
    {
        pf->mbtree_propagate_cost = mbtree_propagate_cost;
        pf->mbtree_propagate_list = mbtree_propagate_list;
    }
 }
 /* Run the half-pel interpolation filter over a band of rows of `frame` and,
 * if the frame has an integral image, extend that image over the same band.
 *
 *   mb_y  - macroblock row the band belongs to; the band starts 8 pixel rows
 *           above mb_y*16
 *   b_end - non-zero to extend the band down to the last line of the frame
 *           (frame->i_lines[0]) instead of stopping after this macroblock row
 *
 * In interlaced mode odd macroblock rows return immediately, so the work is
 * done once per pair of rows. */
 void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
 {
    const int b_interlaced = PARAM_INTERLACED;
    int start = mb_y*16 - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
    int height = (b_end ? frame->i_lines[0] + 16*PARAM_INTERLACED : (mb_y+b_interlaced)*16) + 8;
    if( mb_y & b_interlaced )
        return;
    /* Only the first plane is filtered unless CHROMA444 is set, in which case
     * all three planes are. */
    for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
    {
        int stride = frame->i_stride[p];
        const int width = frame->i_width[p];
        int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd
        /* Frame (progressive) filtering; skipped for interlaced content
         * unless adaptive MBAFF is in use. */
        if( !b_interlaced || h->mb.b_adaptive_mbaff )
            h->mc.hpel_filter(
                frame->filtered[p][1] + offs,
                frame->filtered[p][2] + offs,
                frame->filtered[p][3] + offs,
                frame->plane[p] + offs,
                stride, width + 16, height - start,
                h->scratch_buffer );
        if( b_interlaced )
        {
            /* MC must happen between pixels in the same field. */
            stride = frame->i_stride[p] << 1;
            /* NOTE(review): `start` is declared outside the plane loop, so this
             * field-based value is also what later planes and the integral
             * image code below see - confirm that this is intended. */
            start = (mb_y*16 >> 1) - 8;
            int height_fld = ((b_end ? frame->i_lines[p] : mb_y*16) >> 1) + 8;
            offs = start*stride - 8;
            /* i = 0 and i = 1 select the two fields: the second pass starts
             * one frame row further down, both step by the doubled stride. */
            for( int i = 0; i < 2; i++, offs += frame->i_stride[p] )
            {
                h->mc.hpel_filter(
                    frame->filtered_fld[p][1] + offs,
                    frame->filtered_fld[p][2] + offs,
                    frame->filtered_fld[p][3] + offs,
                    frame->plane_fld[p] + offs,
                    stride, width + 16, height_fld - start,
                    h->scratch_buffer );
            }
        }
    }
    /* generate integral image:
     * frame->integral contains 2 planes. in the upper plane, each element is
     * the sum of an 8x8 pixel region with top-left corner on that point.
     * in the lower plane, 4x4 sums (needed only with --partitions p4x4). */
    if( frame->integral )
    {
        int stride = frame->i_stride[0];
        if( start < 0 )
        {
            /* First band: zero the row that the first integral_init*h call
             * below reads as its "row above", and begin at the top padding. */
            memset( frame->integral - PADV * stride - PADH_ALIGN, 0, stride * sizeof(uint16_t) );
            start = -PADV;
        }
        if( b_end )
            height += PADV-9;
        for( int y = start; y < height; y++ )
        {
            pixel    *pix  = frame->plane[0] + y * stride - PADH_ALIGN;
            uint16_t *sum8 = frame->integral + (y+1) * stride - PADH_ALIGN;
            uint16_t *sum4;
            if( h->frames.b_have_sub8x8_esa )
            {
                /* 4-wide horizontal sums first; the vertical pass then works
                 * on the row written 8 lines earlier and stores its 4x4
                 * result in the second plane of the integral buffer. */
                h->mc.integral_init4h( sum8, pix, stride );
                sum8 -= 8*stride;
                sum4 = sum8 + stride * (frame->i_lines[0] + PADV*2);
                if( y >= 8-PADV )
                    h->mc.integral_init4v( sum8, sum4, stride );
            }
            else
            {
                /* 8-wide horizontal sums; the vertical pass finalises the
                 * row written 8 lines earlier once enough rows exist. */
                h->mc.integral_init8h( sum8, pix, stride );
                if( y >= 8-PADV )
                    h->mc.integral_init8v( sum8-8*stride, stride );
            }
        }
    }
 }
--- a/common/mc.h
+++ b/common/mc.h
@@ -0,0 +1,345 @@
 /*****************************************************************************
 * mc.h: motion compensation
 *****************************************************************************
 * Copyright (C) 2004-2025 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_MC_H
 #define X264_MC_H
 /* MC_CLIP_ADD(s,x): add x to the lvalue s, limiting the result to at most
 * (1<<15)-1 = 32767 via X264_MIN.  Only the upper bound is enforced.  s is
 * written once but expanded more than once, so it must be free of side effects. */
 #define MC_CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
 /* MC_CLIP_ADD2(s,x): apply MC_CLIP_ADD to elements [0] and [1] of the two
 * arrays/pointers s and x.  Wrapped in do/while(0) so it acts as one statement. */
 #define MC_CLIP_ADD2(s,x)\
 do\
 {\
    MC_CLIP_ADD((s)[0], (x)[0]);\
    MC_CLIP_ADD((s)[1], (x)[1]);\
 } while( 0 )
 #define x264_mbtree_propagate_list_internal_neon x264_template(mbtree_propagate_list_internal_neon)
 /* PROPAGATE_LIST(cpu) declares the external routine
 * x264_mbtree_propagate_list_internal_<cpu>() and defines the static wrapper
 * mbtree_propagate_list_<cpu>() around it.
 *
 * The internal routine is handed h->scratch_buffer2 as its output; the wrapper
 * then walks that buffer and accumulates into ref_costs with MC_CLIP_ADD.
 * As read by the wrapper, the buffer is organised in chunks of 48 int16_t per
 * group of up to 8 macroblocks (`current` moves by 2 per macroblock and by a
 * further 32 per group):
 *   [ 0..15]  (mbx, mby) of the top-left target macroblock, one pair each
 *   [16..31]  amounts for the targets at idx0 and idx0+1, one pair each
 *   [32..47]  amounts for the targets at idx2 and idx2+1, one pair each
 * Macroblocks that do not use `list` are skipped; a zero motion vector adds
 * only the first amount at idx0; otherwise the four targets are updated, with
 * per-target range checks when the 2x2 group touches the frame border. */
 #define PROPAGATE_LIST(cpu)\
 void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2], int16_t *propagate_amount,\
                                                 uint16_t *lowres_costs, int16_t *output,\
                                                 int bipred_weight, int mb_y, int len );\
 \
 static void mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],\
                                          int16_t *propagate_amount, uint16_t *lowres_costs,\
                                          int bipred_weight, int mb_y, int len, int list )\
 {\
    int16_t *current = h->scratch_buffer2;\
 \
    x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount, lowres_costs,\
                                               current, bipred_weight, mb_y, len );\
 \
    unsigned stride = h->mb.i_mb_stride;\
    unsigned width = h->mb.i_mb_width;\
    unsigned height = h->mb.i_mb_height;\
 \
    for( int i = 0; i < len; current += 32 )\
    {\
        int end = X264_MIN( i+8, len );\
        for( ; i < end; i++, current += 2 )\
        {\
            if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )\
                continue;\
 \
            unsigned mbx = (unsigned)current[0];\
            unsigned mby = (unsigned)current[1];\
            unsigned idx0 = mbx + mby * stride;\
            unsigned idx2 = idx0 + stride;\
 \
            /* Shortcut for the simple/common case of zero MV */\
            if( !M32( mvs[i] ) )\
            {\
                MC_CLIP_ADD( ref_costs[idx0], current[16] );\
                continue;\
            }\
 \
            if( mbx < width-1 && mby < height-1 )\
            {\
                MC_CLIP_ADD2( ref_costs+idx0, current+16 );\
                MC_CLIP_ADD2( ref_costs+idx2, current+32 );\
            }\
            else\
            {\
                /* Note: this takes advantage of unsigned representation to\
                 * catch negative mbx/mby. */\
                if( mby < height )\
                {\
                    if( mbx < width )\
                        MC_CLIP_ADD( ref_costs[idx0+0], current[16] );\
                    if( mbx+1 < width )\
                        MC_CLIP_ADD( ref_costs[idx0+1], current[17] );\
                }\
                if( mby+1 < height )\
                {\
                    if( mbx < width )\
                        MC_CLIP_ADD( ref_costs[idx2+0], current[32] );\
                    if( mbx+1 < width )\
                        MC_CLIP_ADD( ref_costs[idx2+1], current[33] );\
                }\
            }\
        }\
    }\
 }
 /* Portable plane copy: w x h pixels, strides in pixels. */
 #define x264_plane_copy_c x264_template(plane_copy_c)
 void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
 /* PLANE_COPY(align, cpu) defines plane_copy_<cpu>(), a dispatcher in front of
 * x264_plane_copy_core_<cpu>():
 *   - widths below 256 pixels use the C version;
 *   - widths that are a multiple of align/SIZEOF_PIXEL go straight to the core;
 *   - otherwise the core copies every row except the last one in memory order
 *     with the width rounded up to that multiple, and the remaining row is
 *     copied with an exact-width memcpy.  With a positive i_src the last row
 *     in memory is row h-1; otherwise it is row 0, so the core starts at row 1. */
 #define PLANE_COPY(align, cpu)\
 static void plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
 {\
    int c_w = (align) / SIZEOF_PIXEL - 1;\
    if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\
        x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\
    else if( !(w&c_w) )\
        x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\
    else\
    {\
        if( --h > 0 )\
        {\
            if( i_src > 0 )\
            {\
                x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
                dst += i_dst * h;\
                src += i_src * h;\
            }\
            else\
                x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
        }\
        /* use plain memcpy on the last line (in memory order) to avoid overreading src. */\
        memcpy( dst, src, w*SIZEOF_PIXEL );\
    }\
 }
 /* Portable pair-swapping plane copy: h rows of w sample pairs. */
 #define x264_plane_copy_swap_c x264_template(plane_copy_swap_c)
 void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
 /* PLANE_COPY_SWAP(align, cpu) defines plane_copy_swap_<cpu>(), a dispatcher in
 * front of x264_plane_copy_swap_core_<cpu>().  With c_w+1 = (align/2)/SIZEOF_PIXEL:
 *   - w a multiple of c_w+1: the core handles everything;
 *   - w larger than c_w: the core handles every row except the last one in
 *     memory order with w rounded up; on that last row the core processes the
 *     rounded-down width (height 1, zero strides) and a scalar loop swaps the
 *     remaining pairs, so src is never read past the row end;
 *   - smaller w: the C version handles everything. */
 #define PLANE_COPY_SWAP(align, cpu)\
 static void plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
 {\
    int c_w = (align>>1) / SIZEOF_PIXEL - 1;\
    if( !(w&c_w) )\
        x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\
    else if( w > c_w )\
    {\
        if( --h > 0 )\
        {\
            if( i_src > 0 )\
            {\
                x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
                dst += i_dst * h;\
                src += i_src * h;\
            }\
            else\
                x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
        }\
        x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\
        for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\
        {\
            dst[x]   = src[x+1];\
            dst[x+1] = src[x];\
        }\
    }\
    else\
        x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\
 }
 /* Portable deinterleave: even samples of each src row go to dsta, odd samples
 * to dstb; w and h give the size of each destination plane. */
 #define x264_plane_copy_deinterleave_c x264_template(plane_copy_deinterleave_c)
 void x264_plane_copy_deinterleave_c( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
                                      pixel *src, intptr_t i_src, int w, int h );
 /* We can utilize existing plane_copy_deinterleave() functions for YUYV/UYVY
 * input with the additional constraint that we cannot overread src. */
 /* PLANE_COPY_YUYV(align, cpu) defines plane_copy_deinterleave_yuyv_<cpu>(), a
 * dispatcher in front of x264_plane_copy_deinterleave_<cpu>().  With
 * c_w+1 = (align/2)/SIZEOF_PIXEL:
 *   - w a multiple of c_w+1: the cpu routine handles everything;
 *   - w larger than c_w: the cpu routine handles every row except the last one
 *     in memory order (called with the unrounded w), and that last row is done
 *     by the C version with height 1;
 *   - smaller w: the C version handles everything. */
 #define PLANE_COPY_YUYV(align, cpu)\
 static void plane_copy_deinterleave_yuyv_##cpu( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,\
                                                 pixel *src, intptr_t i_src, int w, int h )\
 {\
    int c_w = (align>>1) / SIZEOF_PIXEL - 1;\
    if( !(w&c_w) )\
        x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
    else if( w > c_w )\
    {\
        if( --h > 0 )\
        {\
            if( i_src > 0 )\
            {\
                x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
                dsta += i_dsta * h;\
                dstb += i_dstb * h;\
                src  += i_src  * h;\
            }\
            else\
                x264_plane_copy_deinterleave_##cpu( dsta+i_dsta, i_dsta, dstb+i_dstb, i_dstb,\
                                                    src+i_src, i_src, w, h );\
        }\
        x264_plane_copy_deinterleave_c( dsta, 0, dstb, 0, src, 0, w, 1 );\
    }\
    else\
        x264_plane_copy_deinterleave_c( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
 }
 /* Portable interleave: srcu samples land on even positions of each dst row,
 * srcv samples on odd positions; w and h give the size of each source plane. */
 #define x264_plane_copy_interleave_c x264_template(plane_copy_interleave_c)
 void x264_plane_copy_interleave_c( pixel *dst,  intptr_t i_dst,
                                    pixel *srcu, intptr_t i_srcu,
                                    pixel *srcv, intptr_t i_srcv, int w, int h );
 /* PLANE_INTERLEAVE(cpu) defines plane_copy_interleave_<cpu>(), a dispatcher in
 * front of x264_plane_copy_interleave_core_<cpu>().  With c_w+1 = 16/SIZEOF_PIXEL:
 *   - w a multiple of c_w+1: the core handles everything;
 *   - w larger than c_w and both source strides of the same sign: the core
 *     handles every row except the last one in memory order with w rounded up,
 *     and that last row is done by the C version with height 1;
 *   - otherwise: the C version handles everything. */
 #define PLANE_INTERLEAVE(cpu) \
 static void plane_copy_interleave_##cpu( pixel *dst,  intptr_t i_dst,\
                                          pixel *srcu, intptr_t i_srcu,\
                                          pixel *srcv, intptr_t i_srcv, int w, int h )\
 {\
    int c_w = 16 / SIZEOF_PIXEL - 1;\
    if( !(w&c_w) )\
        x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
    else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\
    {\
        if( --h > 0 )\
        {\
            if( i_srcu > 0 )\
            {\
                x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\
                dst  += i_dst  * h;\
                srcu += i_srcu * h;\
                srcv += i_srcv * h;\
            }\
            else\
                x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\
        }\
        x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\
    }\
    else\
        x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
 }
 struct x264_weight_t;
 /* Signature of a weighted-prediction kernel: (dst, dst stride, src, src stride,
 * weight parameters, int).  The meaning of the trailing int is not visible
 * here; presumably it is the block height - confirm against the kernels. */
 typedef void (* weight_fn_t)( pixel *, intptr_t, pixel *,intptr_t, const struct x264_weight_t *, int );
 /* Parameters of one explicit weighted-prediction setup. */
 typedef struct x264_weight_t
 {
    /* aligning the first member is a gcc hack to force the struct to be
     * 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */
    ALIGNED_16( int16_t cachea[8] );
    /* cachea/cacheb: scratch values; presumably filled by the weight_cache
     * hook for use by the kernels - confirm. */
    int16_t cacheb[8];
    /* denominator, scale and offset of the weight, as stored by SET_WEIGHT */
    int32_t i_denom;
    int32_t i_scale;
    int32_t i_offset;
    /* table of kernels to use; NULL means no weighting is applied */
    weight_fn_t *weightfn;
 } ALIGNED_16( x264_weight_t );
 /* "No weighting" placeholder: x264_zero viewed as a weight struct (presumably
 * an all-zero buffer, so that weightfn reads as NULL - confirm). */
 #define x264_weight_none ((const x264_weight_t*)x264_zero)
 /* SET_WEIGHT( w, b, s, d, o ): store scale s, denominator d and offset o in the
 * weight struct w (an lvalue, not a pointer).  If b is non-zero the
 * weight_cache hook is run on w; otherwise w.weightfn is cleared to NULL so
 * that no weighting is applied.  A variable named `h` must be in scope at the
 * point of use.  Every use of the parameter w is parenthesised so that
 * arguments such as `*ptr` expand correctly. */
 #define SET_WEIGHT( w, b, s, d, o )\
 {\
    (w).i_scale = (s);\
    (w).i_denom = (d);\
    (w).i_offset = (o);\
    if( b )\
        h->mc.weight_cache( h, &(w) );\
    else\
        (w).weightfn = NULL;\
 }
 /* Do the MC
 * XXX: Only width = 4, 8 or 16 are valid
 * width == 4 -> height == 4 or 8
 * width == 8 -> height == 4 or 8 or 16
 * width == 16-> height == 8 or 16
 * */
 /* Table of motion-compensation and frame-preparation routines.  It is filled
 * by x264_mc_init(), which installs C implementations and lets
 * architecture-specific initializers override them.  Unless noted otherwise,
 * strides are row-to-row distances counted in pixels. */
 typedef struct
 {
    /* luma MC of an i_width x i_height block into dst */
    void (*mc_luma)( pixel *dst, intptr_t i_dst, pixel **src, intptr_t i_src,
                     int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight );
    /* may round up the dimensions if they're not a power of 2 */
    /* returns the pointer to read the prediction from and updates *i_dst to
     * the matching stride; the result need not be dst */
    pixel* (*get_ref)( pixel *dst, intptr_t *i_dst, pixel **src, intptr_t i_src,
                       int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight );
    /* mc_chroma may write up to 2 bytes of garbage to the right of dst,
     * so it must be run from left to right. */
    void (*mc_chroma)( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,
                       int mvx, int mvy, int i_width, int i_height );
    /* averaging of two predictions, one entry per partition size */
    void (*avg[12])( pixel *dst,  intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
                     pixel *src2, intptr_t src2_stride, int i_weight );
    /* only 16x16, 8x8, and 4x4 defined */
    void (*copy[7])( pixel *dst, intptr_t dst_stride, pixel *src, intptr_t src_stride, int i_height );
    void (*copy_16x16_unaligned)( pixel *dst, intptr_t dst_stride, pixel *src, intptr_t src_stride, int i_height );
    /* interleave two 8-wide chroma blocks into dst / split interleaved chroma
     * into the fenc or fdec macroblock buffers */
    void (*store_interleave_chroma)( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
    void (*load_deinterleave_chroma_fenc)( pixel *dst, pixel *src, intptr_t i_src, int height );
    void (*load_deinterleave_chroma_fdec)( pixel *dst, pixel *src, intptr_t i_src, int height );
    /* whole-plane copies: plain, with each sample pair swapped, and merging
     * two planes into one interleaved plane */
    void (*plane_copy)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h );
    void (*plane_copy_swap)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h );
    void (*plane_copy_interleave)( pixel *dst,  intptr_t i_dst, pixel *srcu, intptr_t i_srcu,
                                   pixel *srcv, intptr_t i_srcv, int w, int h );
    /* may write up to 15 pixels off the end of each plane */
    void (*plane_copy_deinterleave)( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv,
                                     pixel *src,  intptr_t i_src, int w, int h );
    /* same operation as plane_copy_deinterleave, for packed YUYV-style input */
    void (*plane_copy_deinterleave_yuyv)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
                                          pixel *src,  intptr_t i_src, int w, int h );
    /* split packed pixels of pw samples each into three planes */
    void (*plane_copy_deinterleave_rgb)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
                                         pixel *dstc, intptr_t i_dstc, pixel *src,  intptr_t i_src, int pw, int w, int h );
    /* unpack 32-bit words holding three 10-bit samples each; i_src is in words */
    void (*plane_copy_deinterleave_v210)( pixel *dsty, intptr_t i_dsty,
                                          pixel *dstc, intptr_t i_dstc,
                                          uint32_t *src, intptr_t i_src, int w, int h );
    /* half-pel interpolation of src into the dsth, dstv and dstc planes */
    void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
                         intptr_t i_stride, int i_width, int i_height, int16_t *buf );
    /* prefetch the next few macroblocks of fenc or fdec */
    void (*prefetch_fenc)    ( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
    void (*prefetch_fenc_400)( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
    void (*prefetch_fenc_420)( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
    void (*prefetch_fenc_422)( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
    /* prefetch the next few macroblocks of a hpel reference frame */
    void (*prefetch_ref)( pixel *pix, intptr_t stride, int parity );
    /* memcpy / zero-fill hooks */
    void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
    void (*memzero_aligned)( void *dst, size_t n );
    /* successive elimination prefilter */
    void (*integral_init4h)( uint16_t *sum, pixel *pix, intptr_t stride );
    void (*integral_init8h)( uint16_t *sum, pixel *pix, intptr_t stride );
    void (*integral_init4v)( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
    void (*integral_init8v)( uint16_t *sum8, intptr_t stride );
    /* downscale src0 by two into four half-resolution planes */
    void (*frame_init_lowres_core)( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
                                    intptr_t src_stride, intptr_t dst_stride, int width, int height );
    /* weighted-prediction kernel tables and the hook that prepares a weight */
    weight_fn_t *weight;
    weight_fn_t *offsetadd;
    weight_fn_t *offsetsub;
    void (*weight_cache)( x264_t *, x264_weight_t * );
    /* macroblock-tree: per-block propagate cost and its distribution onto a
     * reference frame */
    void (*mbtree_propagate_cost)( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                   uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
    void (*mbtree_propagate_list)( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
                                   int16_t *propagate_amount, uint16_t *lowres_costs,
                                   int bipred_weight, int mb_y, int len, int list );
    /* float <-> 16-bit Q8.8 conversion for stored macroblock-tree data */
    void (*mbtree_fix8_pack)( uint16_t *dst, float *src, int count );
    void (*mbtree_fix8_unpack)( float *dst, uint16_t *src, int count );
 } x264_mc_functions_t;
 /* Fill *pf with the motion-compensation routines to use for the CPU flags in
 * `cpu`.  A non-zero cpu_independent forces the C mbtree propagate functions. */
 #define x264_mc_init x264_template(mc_init)
 void x264_mc_init( uint32_t cpu, x264_mc_functions_t *pf, int cpu_independent );
 #endif
--- a/common/mips/dct-c.c
+++ b/common/mips/dct-c.c
@@ -0,0 +1,526 @@
 /*****************************************************************************
 * dct-c.c: msa transform and zigzag
 *****************************************************************************
 * Copyright (C) 2015-2025 x264 project
 *
 * Authors: Rishikesh More <rishikesh.more@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "common/common.h"
 #include "macros.h"
 #include "dct.h"
 #if !HIGH_BIT_DEPTH
 /* One 1-D pass of the 4-point inverse transform on v8i16 vectors.
  * Forms  tmp0 = in0 + in2,  tmp1 = in0 - in2,
  *        tmp2 = ( in1 >> 1 ) - in3,  tmp3 = in1 + ( in3 >> 1 )
  * and hands the four intermediates to BUTTERFLY_4, which produces out0..out3.
  * The arguments are evaluated more than once, so they must be plain variables.
  * NOTE(review): BUTTERFLY_4 is presumably provided by macros.h (not visible
  * here); confirm its sum/difference ordering there. */
 #define AVC_ITRANS_H( in0, in1, in2, in3, out0, out1, out2, out3 )          \
 {                                                                           \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
                                                                            \
    tmp0_m = in0 + in2;                                                     \
    tmp1_m = in0 - in2;                                                     \
    tmp2_m = in1 >> 1;                                                      \
    tmp2_m = tmp2_m - in3;                                                  \
    tmp3_m = in3 >> 1;                                                      \
    tmp3_m = in1 + tmp3_m;                                                  \
                                                                            \
    BUTTERFLY_4( tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3 );  \
 }
 /* 4x4 DC transform kernel.
  * Reads four rows from p_src (row step i_src_stride is passed unchanged to
  * LD_SH4), applies two BUTTERFLY_4 stages, transposes, applies two more
  * BUTTERFLY_4 stages, shifts the results by 1 with SRARI_W4_SW and stores
  * 16 int16_t values to p_dst as two vectors.  The arithmetic runs on 32-bit
  * lanes and is packed back to 16 bit before the store.
  * p_src and p_dst may be the same buffer (the public wrapper calls it that way).
  * NOTE(review): LD_SH4 / UNPCK_R_SH_SW / SRARI_W4_SW / PCK* are presumably
  * defined in macros.h, which is not visible here - confirm rounding and lane
  * semantics there. */
 static void avc_dct4x4dc_msa( int16_t *p_src, int16_t *p_dst,
                              int32_t i_src_stride )
 {
    v8i16 src0, src1, src2, src3, ver_res0, ver_res1, ver_res2, ver_res3;
    v4i32 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
    v4i32 hor_res0, hor_res1, hor_res2, hor_res3;
    v4i32 ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r;
    /* load the four input rows and widen them to 32-bit lanes */
    LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
    UNPCK_R_SH_SW( src0, src0_r );
    UNPCK_R_SH_SW( src1, src1_r );
    UNPCK_R_SH_SW( src2, src2_r );
    UNPCK_R_SH_SW( src3, src3_r );
    /* first 1-D pass: two butterfly stages across the row vectors */
    BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r,
                 tmp0, tmp3, tmp2, tmp1 );
    BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
                 hor_res0, hor_res3, hor_res2, hor_res1 );
    /* transpose so the second pass works along the other dimension */
    TRANSPOSE4x4_SW_SW( hor_res0, hor_res1, hor_res2, hor_res3,
                        hor_res0, hor_res1, hor_res2, hor_res3 );
    /* second 1-D pass, same structure as the first */
    BUTTERFLY_4( hor_res0, hor_res2, hor_res3, hor_res1,
                 tmp0, tmp3, tmp2, tmp1 );
    BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
                 ver_res0_r, ver_res3_r, ver_res2_r, ver_res1_r );
    /* shift by one bit (presumably with rounding), then narrow to 16 bit */
    SRARI_W4_SW( ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r, 1 );
    PCKEV_H4_SH( ver_res0_r, ver_res0_r, ver_res1_r, ver_res1_r,
                 ver_res2_r, ver_res2_r, ver_res3_r, ver_res3_r,
                 ver_res0, ver_res1, ver_res2, ver_res3 );
    /* merge the four rows into two full vectors and write 16 coefficients */
    PCKOD_D2_SH( ver_res1, ver_res0, ver_res3, ver_res2, ver_res0, ver_res2 );
    ST_SH2( ver_res0, ver_res2, p_dst, 8 );
 }
 /* Forward 4x4 transform of the difference between a source block and a
  * reference block.
  *   p_src / i_src_stride : source pixels, four rows of four bytes
  *   p_ref / i_dst_stride : reference pixels, four rows of four bytes
  *   p_dst                : receives 16 int16_t coefficients (two vector stores)
  * Each pass is a BUTTERFLY_4 followed by the combinations
  *   t0 + t1, ( t3 << 1 ) + t2, t0 - t1, t3 - ( t2 << 1 ),
  * with a 4x4 transpose between the two passes.
  * NOTE(review): LW4 / INSERT_W4_SB / ILVRL_B2_UB / HSUB_UB2_SH are presumably
  * defined in macros.h (not visible here); HSUB_UB2_SH presumably yields
  * src - ref per pixel as 16-bit values - confirm. */
 static void avc_sub4x4_dct_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_ref, int32_t i_dst_stride,
                                 int16_t *p_dst )
 {
    uint32_t i_src0, i_src1, i_src2, i_src3;
    uint32_t i_ref0, i_ref1, i_ref2, i_ref3;
    v16i8 src = { 0 };
    v16i8 ref = { 0 };
    v16u8 inp0, inp1;
    v8i16 diff0, diff1, diff2, diff3;
    v8i16 temp0, temp1, temp2, temp3;
    /* fetch each 4-pixel row as one 32-bit word and gather the rows into a vector */
    LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 );
    LW4( p_ref, i_dst_stride, i_ref0, i_ref1, i_ref2, i_ref3 );
    INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src );
    INSERT_W4_SB( i_ref0, i_ref1, i_ref2, i_ref3, ref );
    /* interleave source/reference bytes and reduce each pair to a 16-bit value */
    ILVRL_B2_UB( src, ref, inp0, inp1 );
    HSUB_UB2_SH( inp0, inp1, diff0, diff2 );
    /* diff1 / diff3 take the upper 64-bit halves of diff0 / diff2 */
    diff1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff0, ( v2i64 ) diff0 );
    diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff2, ( v2i64 ) diff2 );
    /* first 1-D pass */
    BUTTERFLY_4( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 );
    diff0 = temp0 + temp1;
    diff1 = ( temp3 << 1 ) + temp2;
    diff2 = temp0 - temp1;
    diff3 = temp3 - ( temp2 << 1 );
    TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3,
                        temp0, temp1, temp2, temp3 );
    /* second 1-D pass on the transposed data */
    BUTTERFLY_4( temp0, temp1, temp2, temp3, diff0, diff1, diff2, diff3 );
    temp0 = diff0 + diff1;
    temp1 = ( diff3 << 1 ) + diff2;
    temp2 = diff0 - diff1;
    temp3 = diff3 - ( diff2 << 1 );
    /* pack the four result rows into two vectors and store 16 coefficients */
    ILVR_D2_UB( temp1, temp0, temp3, temp2, inp0, inp1 );
    ST_UB2( inp0, inp1, p_dst, 8 );
 }
 /* Reorders the 16 coefficients of pi_dct into pi_level using two halfword
  * shuffles.  mask0 / mask1 hold the source indices for output positions 0..7
  * and 8..15: 0,4,1,2,5,8,12,9 and 6,3,7,10,13,14,11,15.
  * The mask vectors are reused as the shuffle destinations, so after
  * VSHF_H2_SH they contain the reordered coefficients that are stored.
  * Note the argument order (input first) is the reverse of the public wrapper. */
 static void avc_zigzag_scan_4x4_frame_msa( int16_t pi_dct[16],
                                            int16_t pi_level[16] )
 {
    v8i16 src0, src1;
    v8i16 mask0 = { 0, 4, 1, 2, 5, 8, 12, 9 };
    v8i16 mask1 = { 6, 3, 7, 10, 13, 14, 11, 15 };
    LD_SH2( pi_dct, 8, src0, src1 );
    /* shuffle results overwrite mask0 / mask1 */
    VSHF_H2_SH( src0, src1, src0, src1, mask0, mask1, mask0, mask1 );
    ST_SH2( mask0, mask1, pi_level, 8 );
 }
 /* 4x4 inverse transform of the 16 coefficients at p_src, added to the 4x4
  * pixel block at p_dst (row step i_dst_stride).
  * Two AVC_ITRANS_H passes with a transpose in between, a shift by 6 via
  * SRARI_H4_SH, then ADDBLK_ST4x4_UB combines the residual with p_dst.
  * Side effect: all 16 coefficients at p_src are overwritten with zero.
  * NOTE(review): LD4x4_SH, SRARI_H4_SH and ADDBLK_ST4x4_UB are presumably
  * defined in macros.h (not visible); rounding / clipping behaviour should be
  * confirmed there. */
 static void avc_idct4x4_addblk_msa( uint8_t *p_dst, int16_t *p_src,
                                     int32_t i_dst_stride )
 {
    v8i16 src0, src1, src2, src3;
    v8i16 hres0, hres1, hres2, hres3;
    v8i16 vres0, vres1, vres2, vres3;
    v8i16 zeros = { 0 };
    LD4x4_SH( p_src, src0, src1, src2, src3 );
    /* first 1-D pass, transpose, second 1-D pass */
    AVC_ITRANS_H( src0, src1, src2, src3, hres0, hres1, hres2, hres3 );
    TRANSPOSE4x4_SH_SH( hres0, hres1, hres2, hres3,
                        hres0, hres1, hres2, hres3 );
    AVC_ITRANS_H( hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3 );
    /* scale the residual down by 6 bits and add it to the destination pixels */
    SRARI_H4_SH( vres0, vres1, vres2, vres3, 6 );
    ADDBLK_ST4x4_UB( vres0, vres1, vres2, vres3, p_dst, i_dst_stride );
    /* clear the coefficient block (two 8-halfword stores) */
    ST_SH2( zeros, zeros, p_src, 8 );
 }
 /* DC-only variant of the 4x4 inverse transform + add.
  * Computes ( p_src[0] + 32 ) >> 6, adds that value to every pixel of the 4x4
  * block at p_dst (row step i_dst_stride), clips with CLIP_SH2_0_255 and writes
  * the block back.
  * Side effect: p_src[0] is set to zero; no other coefficient is touched.
  * NOTE(review): LW4 / INSERT_W4_UB / UNPCK_UB_SH / CLIP_SH2_0_255 / ST4x4_UB
  * are presumably defined in macros.h (not visible here). */
 static void avc_idct4x4_addblk_dc_msa( uint8_t *p_dst, int16_t *p_src,
                                        int32_t i_dst_stride )
 {
    int16_t i_dc;
    uint32_t i_src0, i_src1, i_src2, i_src3;
    v16u8 pred = { 0 };
    v16i8 out;
    v8i16 input_dc, pred_r, pred_l;
    /* rounded DC residual, replicated into every 16-bit lane */
    i_dc = ( p_src[0] + 32 ) >> 6;
    input_dc = __msa_fill_h( i_dc );
    p_src[ 0 ] = 0;
    /* gather the four destination rows (one 32-bit word each) into one vector */
    LW4( p_dst, i_dst_stride, i_src0, i_src1, i_src2, i_src3 );
    INSERT_W4_UB( i_src0, i_src1, i_src2, i_src3, pred );
    /* widen to 16 bit, add the DC value, clip and narrow back to bytes */
    UNPCK_UB_SH( pred, pred_r, pred_l );
    pred_r += input_dc;
    pred_l += input_dc;
    CLIP_SH2_0_255( pred_r, pred_l );
    out = __msa_pckev_b( ( v16i8 ) pred_l, ( v16i8 ) pred_r );
    ST4x4_UB( out, out, 0, 1, 2, 3, p_dst, i_dst_stride );
 }
 /* 8x8 inverse transform of the 64 coefficients at p_src, added to the 8x8
  * pixel block at p_dst (row step i_dst_stride); the sums go through
  * CLIP_SH4_0_255 before being stored.
  * The first 1-D pass runs on 16-bit lanes; after a transpose the second pass
  * runs on 32-bit lanes (each row split into _r and _l halves), followed by a
  * shift right by 6 and a pack back to 16 bit.
  * Side effect: p_src[0] is incremented by 32.  Unlike avc_idct4x4_addblk_msa,
  * the coefficient block is not cleared.
  * NOTE(review): BUTTERFLY_4/8, TRANSPOSE8x8_SH_SH, UNPCK_SH_SW, SRA_4V, PCKEV_*,
  * ILVR_B4_SH, ADD4, CLIP_SH4_0_255, LD_* and ST8x4_UB are presumably defined in
  * macros.h (not visible here) - confirm lane semantics there. */
 static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src,
                                   int32_t i_dst_stride )
 {
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 vec0, vec1, vec2, vec3;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
    v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
    v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
    v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
    v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 zeros = { 0 };
    /* bias the DC coefficient; presumably the rounding offset for the final >> 6 */
    p_src[ 0 ] += 32;
    LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );
    /* first pass, even part: rows 0, 4, 2, 6 */
    vec0 = src0 + src4;
    vec1 = src0 - src4;
    vec2 = src2 >> 1;
    vec2 = vec2 - src6;
    vec3 = src6 >> 1;
    vec3 = src2 + vec3;
    BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 );
    /* first pass, odd part: rows 1, 3, 5, 7 */
    vec0 = src7 >> 1;
    vec0 = src5 - vec0 - src3 - src7;
    vec1 = src3 >> 1;
    vec1 = src1 - vec1 + src7 - src3;
    vec2 = src5 >> 1;
    vec2 = vec2 - src1 + src7 + src5;
    vec3 = src1 >> 1;
    vec3 = vec3 + src3 + src5 + src1;
    tmp4 = vec3 >> 2;
    tmp4 += vec0;
    tmp5 = vec2 >> 2;
    tmp5 += vec1;
    tmp6 = vec1 >> 2;
    tmp6 -= vec2;
    tmp7 = vec0 >> 2;
    tmp7 = vec3 - tmp7;
    /* combine even and odd parts, then transpose for the second pass */
    BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
                 res0, res1, res2, res3, res4, res5, res6, res7 );
    TRANSPOSE8x8_SH_SH( res0, res1, res2, res3, res4, res5, res6, res7,
                        res0, res1, res2, res3, res4, res5, res6, res7 );
    /* widen every row to two 32-bit vectors (_r / _l) */
    UNPCK_SH_SW( res0, tmp0_r, tmp0_l );
    UNPCK_SH_SW( res1, tmp1_r, tmp1_l );
    UNPCK_SH_SW( res2, tmp2_r, tmp2_l );
    UNPCK_SH_SW( res3, tmp3_r, tmp3_l );
    UNPCK_SH_SW( res4, tmp4_r, tmp4_l );
    UNPCK_SH_SW( res5, tmp5_r, tmp5_l );
    UNPCK_SH_SW( res6, tmp6_r, tmp6_l );
    UNPCK_SH_SW( res7, tmp7_r, tmp7_l );
    /* second pass, even part: same arithmetic as above on rows 0, 4, 2, 6 */
    BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r,
                 vec0_r, vec0_l, vec1_l, vec1_r );
    vec2_r = tmp2_r >> 1;
    vec2_l = tmp2_l >> 1;
    vec2_r -= tmp6_r;
    vec2_l -= tmp6_l;
    vec3_r = tmp6_r >> 1;
    vec3_l = tmp6_l >> 1;
    vec3_r += tmp2_r;
    vec3_l += tmp2_l;
    BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r,
                 tmp0_r, tmp2_r, tmp4_r, tmp6_r );
    BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l,
                 tmp0_l, tmp2_l, tmp4_l, tmp6_l );
    /* second pass, odd part: rows 1, 3, 5, 7 */
    vec0_r = tmp7_r >> 1;
    vec0_l = tmp7_l >> 1;
    vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
    vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
    vec1_r = tmp3_r >> 1;
    vec1_l = tmp3_l >> 1;
    vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
    vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
    vec2_r = tmp5_r >> 1;
    vec2_l = tmp5_l >> 1;
    vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
    vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
    vec3_r = tmp1_r >> 1;
    vec3_l = tmp1_l >> 1;
    vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
    vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
    tmp1_r = vec3_r >> 2;
    tmp1_l = vec3_l >> 2;
    tmp1_r += vec0_r;
    tmp1_l += vec0_l;
    tmp3_r = vec2_r >> 2;
    tmp3_l = vec2_l >> 2;
    tmp3_r += vec1_r;
    tmp3_l += vec1_l;
    tmp5_r = vec1_r >> 2;
    tmp5_l = vec1_l >> 2;
    tmp5_r -= vec2_r;
    tmp5_l -= vec2_l;
    tmp7_r = vec0_r >> 2;
    tmp7_l = vec0_l >> 2;
    tmp7_r = vec3_r - tmp7_r;
    tmp7_l = vec3_l - tmp7_l;
    /* combine even (tmp0/2/4/6) and odd (tmp7/5/3/1) parts into output rows 0..7 */
    BUTTERFLY_4( tmp0_r, tmp0_l, tmp7_l, tmp7_r,
                 res0_r, res0_l, res7_l, res7_r );
    BUTTERFLY_4( tmp2_r, tmp2_l, tmp5_l, tmp5_r,
                 res1_r, res1_l, res6_l, res6_r );
    BUTTERFLY_4( tmp4_r, tmp4_l, tmp3_l, tmp3_r,
                 res2_r, res2_l, res5_l, res5_r );
    BUTTERFLY_4( tmp6_r, tmp6_l, tmp1_l, tmp1_r,
                 res3_r, res3_l, res4_l, res4_r );
    /* scale down by 6 bits and narrow back to 16-bit rows */
    SRA_4V( res0_r, res0_l, res1_r, res1_l, 6 );
    SRA_4V( res2_r, res2_l, res3_r, res3_l, 6 );
    SRA_4V( res4_r, res4_l, res5_r, res5_l, 6 );
    SRA_4V( res6_r, res6_l, res7_r, res7_l, 6 );
    PCKEV_H4_SH( res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r,
                 res0, res1, res2, res3 );
    PCKEV_H4_SH( res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r,
                 res4, res5, res6, res7 );
    /* load the eight destination rows and widen them by interleaving with zero bytes */
    LD_SB8( p_dst, i_dst_stride,
            dst0, dst1, dst2, dst3,
            dst4, dst5, dst6, dst7 );
    ILVR_B4_SH( zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
                tmp0, tmp1, tmp2, tmp3 );
    ILVR_B4_SH( zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
                tmp4, tmp5, tmp6, tmp7 );
    /* residual + prediction, clipped, packed to bytes */
    ADD4( res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
          res0, res1, res2, res3 );
    ADD4( res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
          res4, res5, res6, res7 );
    CLIP_SH4_0_255( res0, res1, res2, res3 );
    CLIP_SH4_0_255( res4, res5, res6, res7 );
    PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6,
                 dst0, dst1, dst2, dst3 );
    /* write the block back as two groups of four rows */
    ST8x4_UB( dst0, dst1, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    ST8x4_UB( dst2, dst3, p_dst, i_dst_stride );
 }
 /* Inverse 4x4 DC transform kernel.
  * Same butterfly / transpose / butterfly structure as avc_dct4x4dc_msa, but
  * without the final shift: the 32-bit results are packed straight to 16 bit.
  *   p_src / i_src_stride : input rows (stride passed unchanged to LD_SH4)
  *   p_dst / i_dst_stride : output; ST8x4_UB is given i_dst_stride * 2 as its
  *                          step.  NOTE(review): presumably that macro counts
  *                          in bytes while the stride is in int16_t units -
  *                          confirm in macros.h (not visible here).
  * p_src and p_dst may be the same buffer (the public wrapper calls it that way). */
 static void avc_idct4x4dc_msa( int16_t *p_src, int32_t i_src_stride,
                                int16_t *p_dst, int32_t i_dst_stride )
 {
    v8i16 src0, src1, src2, src3;
    v4i32 src0_r, src1_r, src2_r, src3_r;
    v4i32 hres0, hres1, hres2, hres3;
    v8i16 vres0, vres1, vres2, vres3;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v2i64 res0, res1;
    /* load the four rows and widen them to 32-bit lanes */
    LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
    UNPCK_R_SH_SW( src0, src0_r );
    UNPCK_R_SH_SW( src1, src1_r );
    UNPCK_R_SH_SW( src2, src2_r );
    UNPCK_R_SH_SW( src3, src3_r );
    /* first 1-D pass */
    BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, vec0, vec3, vec2, vec1 );
    BUTTERFLY_4( vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1 );
    TRANSPOSE4x4_SW_SW( hres0, hres1, hres2, hres3,
                        hres0, hres1, hres2, hres3 );
    /* second 1-D pass */
    BUTTERFLY_4( hres0, hres2, hres3, hres1, vec0, vec3, vec2, vec1 );
    BUTTERFLY_4( vec0, vec1, vec2, vec3, vec4, vec7, vec6, vec5 );
    /* narrow to 16 bit, merge rows pairwise and store */
    PCKEV_H4_SH( vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
                 vres0, vres1, vres2, vres3 );
    PCKOD_D2_SD( vres1, vres0, vres3, vres2, res0, res1 );
    ST8x4_UB( res0, res1, p_dst, i_dst_stride * 2 );
 }
 /* Reduces a 4x4 source block and a 4x4 prediction block to a single value:
  * the per-pixel results of HSUB_UB2_SH are added together with HADD_UH_U32.
  * The horizontal sum is stored in an int16_t before being returned, so the
  * returned int32_t is that 16-bit value converted back to int32_t.
  * NOTE(review): presumably HSUB_UB2_SH yields src - pred per pixel, making
  * the result the sum of the 16 pixel differences; the macros are defined in
  * macros.h, which is not visible here - confirm. */
 static int32_t subtract_sum4x4_msa( uint8_t *p_src, int32_t i_src_stride,
                                     uint8_t *pred_ptr, int32_t i_pred_stride )
 {
    int16_t i_sum;
    uint32_t i_src0, i_src1, i_src2, i_src3;
    uint32_t i_pred0, i_pred1, i_pred2, i_pred3;
    v16i8 src = { 0 };
    v16i8 pred = { 0 };
    v16u8 src_l0, src_l1;
    v8i16 diff0, diff1;
    /* each 4-pixel row is fetched as one 32-bit word and packed into a vector */
    LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 );
    LW4( pred_ptr, i_pred_stride, i_pred0, i_pred1, i_pred2, i_pred3 );
    INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src );
    INSERT_W4_SB( i_pred0, i_pred1, i_pred2, i_pred3, pred );
    /* pair up source / prediction bytes and reduce each pair to 16 bit */
    ILVRL_B2_UB( src, pred, src_l0, src_l1 );
    HSUB_UB2_SH( src_l0, src_l1, diff0, diff1 );
    /* horizontal add of all lanes; narrowing to int16_t happens on assignment */
    i_sum = HADD_UH_U32( diff0 + diff1 );
    return i_sum;
 }
 /* Public entry point: runs the MSA 4x4 DC transform kernel over the 16
  * coefficients in d and writes the result back into the same array
  * (row step 4). */
 void x264_dct4x4dc_msa( int16_t d[16] )
 {
    int16_t *p_coef = d;
    avc_dct4x4dc_msa( p_coef, p_coef, 4 );
 }
 /* Public entry point: runs the MSA inverse 4x4 DC transform kernel in place
  * on the 16 coefficients in d, using a row step of 4 for input and output. */
 void x264_idct4x4dc_msa( int16_t d[16] )
 {
    const int32_t i_row_step = 4;
    avc_idct4x4dc_msa( d, i_row_step, d, i_row_step );
 }
 /* Public entry point: inverse-transforms one 4x4 coefficient block and adds
  * it to the pixels at p_dst, whose rows are FDEC_STRIDE apart. */
 void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] )
 {
    int16_t *p_coef = pi_dct;
    avc_idct4x4_addblk_msa( p_dst, p_coef, FDEC_STRIDE );
 }
 /* Public entry point: applies the 4x4 inverse transform + add to the four
  * 4x4 quadrants of an 8x8 area of p_dst (row step FDEC_STRIDE).  Quadrants are
  * visited top-left, top-right, bottom-left, bottom-right, matching
  * pi_dct[0..3]. */
 void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] )
 {
    for( int32_t i_blk = 0; i_blk < 4; i_blk++ )
    {
        /* bit 1 of the index selects the row of quadrants, bit 0 the column */
        int32_t i_y = 4 * ( i_blk >> 1 );
        int32_t i_x = 4 * ( i_blk & 1 );
        avc_idct4x4_addblk_msa( p_dst + i_y * FDEC_STRIDE + i_x,
                                pi_dct[i_blk], FDEC_STRIDE );
    }
 }
 /* Public entry point: processes a 16x16 area as four 8x8 quadrants
  * (top-left, top-right, bottom-left, bottom-right), each consuming four
  * consecutive 4x4 coefficient blocks of pi_dct. */
 void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] )
 {
    for( int32_t i_quad = 0; i_quad < 4; i_quad++ )
    {
        int32_t i_y = 8 * ( i_quad >> 1 );
        int32_t i_x = 8 * ( i_quad & 1 );
        x264_add8x8_idct_msa( p_dst + i_y * FDEC_STRIDE + i_x,
                              pi_dct + 4 * i_quad );
    }
 }
 /* Public entry point: inverse-transforms one 8x8 coefficient block and adds
  * it to the pixels at p_dst, whose rows are FDEC_STRIDE apart. */
 void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] )
 {
    int16_t *p_coef = pi_dct;
    avc_idct8_addblk_msa( p_dst, p_coef, FDEC_STRIDE );
 }
 /* Public entry point: applies the 8x8 inverse transform + add to the four
  * 8x8 quadrants of a 16x16 area of p_dst (row step FDEC_STRIDE), in the order
  * top-left, top-right, bottom-left, bottom-right (pi_dct[0..3]). */
 void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] )
 {
    for( int32_t i_quad = 0; i_quad < 4; i_quad++ )
    {
        int32_t i_y = 8 * ( i_quad >> 1 );
        int32_t i_x = 8 * ( i_quad & 1 );
        avc_idct8_addblk_msa( p_dst + i_y * FDEC_STRIDE + i_x,
                              pi_dct[i_quad], FDEC_STRIDE );
    }
 }
 /* Public entry point: DC-only inverse transform + add for the four 4x4
  * quadrants of an 8x8 area; pi_dct[i] is the DC coefficient of quadrant i
  * (top-left, top-right, bottom-left, bottom-right). */
 void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] )
 {
    for( int32_t i_blk = 0; i_blk < 4; i_blk++ )
    {
        int32_t i_y = 4 * ( i_blk >> 1 );
        int32_t i_x = 4 * ( i_blk & 1 );
        avc_idct4x4_addblk_dc_msa( p_dst + i_y * FDEC_STRIDE + i_x,
                                   pi_dct + i_blk, FDEC_STRIDE );
    }
 }
 /* Public entry point: DC-only inverse transform + add for all sixteen 4x4
  * blocks of a 16x16 area.  Blocks are visited row by row, left to right, and
  * pi_dct holds their DC coefficients in that same order. */
 void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] )
 {
    for( int32_t i_row = 0; i_row < 4; i_row++ )
    {
        uint8_t *p_line = p_dst + 4 * i_row * FDEC_STRIDE;
        int16_t *p_dc = pi_dct + 4 * i_row;
        for( int32_t i_col = 0; i_col < 4; i_col++ )
            avc_idct4x4_addblk_dc_msa( p_line + 4 * i_col, p_dc + i_col,
                                       FDEC_STRIDE );
    }
 }
 /* Public entry point: forward 4x4 transform of the difference between the
  * block at p_src (row step FENC_STRIDE) and the block at p_ref (row step
  * FDEC_STRIDE); the 16 coefficients go to p_dst. */
 void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src,
                           uint8_t *p_ref )
 {
    const int32_t i_enc_step = FENC_STRIDE;
    const int32_t i_dec_step = FDEC_STRIDE;
    avc_sub4x4_dct_msa( p_src, i_enc_step, p_ref, i_dec_step, p_dst );
 }
 /* Public entry point: forward 4x4 transform of the source/reference
  * difference for each 4x4 quadrant of an 8x8 area.  Quadrant i (top-left,
  * top-right, bottom-left, bottom-right) writes its coefficients to p_dst[i]. */
 void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src,
                           uint8_t *p_ref )
 {
    for( int32_t i_blk = 0; i_blk < 4; i_blk++ )
    {
        int32_t i_y = 4 * ( i_blk >> 1 );
        int32_t i_x = 4 * ( i_blk & 1 );
        uint8_t *p_enc = p_src + i_y * FENC_STRIDE + i_x;
        uint8_t *p_dec = p_ref + i_y * FDEC_STRIDE + i_x;
        avc_sub4x4_dct_msa( p_enc, FENC_STRIDE, p_dec, FDEC_STRIDE,
                            p_dst[i_blk] );
    }
 }
 /* Public entry point: splits a 16x16 area into four 8x8 quadrants (top-left,
  * top-right, bottom-left, bottom-right) and runs x264_sub8x8_dct_msa on each;
  * quadrant i fills coefficient blocks 4*i .. 4*i+3 of p_dst. */
 void x264_sub16x16_dct_msa( int16_t p_dst[16][16],
                             uint8_t *p_src,
                             uint8_t *p_ref )
 {
    for( int32_t i_quad = 0; i_quad < 4; i_quad++ )
    {
        int32_t i_y = 8 * ( i_quad >> 1 );
        int32_t i_x = 8 * ( i_quad & 1 );
        x264_sub8x8_dct_msa( p_dst + 4 * i_quad,
                             p_src + i_y * FENC_STRIDE + i_x,
                             p_ref + i_y * FDEC_STRIDE + i_x );
    }
 }
 /* Public entry point: computes one subtract_sum4x4_msa value per 4x4 quadrant
  * of an 8x8 area (p_pix1 rows FENC_STRIDE apart, p_pix2 rows FDEC_STRIDE
  * apart), then combines the four values in place with two BUTTERFLY_4
  * stages. */
 void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4],
                              uint8_t *p_pix1, uint8_t *p_pix2 )
 {
    int32_t d0, d1, d2, d3;
    /* quadrant order: top-left, top-right, bottom-left, bottom-right */
    for( int32_t i_blk = 0; i_blk < 4; i_blk++ )
    {
        int32_t i_y = 4 * ( i_blk >> 1 );
        int32_t i_x = 4 * ( i_blk & 1 );
        pi_dct[i_blk] = subtract_sum4x4_msa( p_pix1 + i_y * FENC_STRIDE + i_x,
                                             FENC_STRIDE,
                                             p_pix2 + i_y * FDEC_STRIDE + i_x,
                                             FDEC_STRIDE );
    }
    /* two butterfly stages over the four sums; argument order is significant */
    BUTTERFLY_4( pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1], d0, d1, d3, d2 );
    BUTTERFLY_4( d0, d2, d3, d1, pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1] );
 }
 /* Public entry point: computes one subtract_sum4x4_msa value for each of the
  * eight 4x4 blocks of an 8-wide, 16-tall area (two blocks per row of blocks,
  * rows taken top to bottom), then combines them with three BUTTERFLY_8 stages
  * whose last stage writes the eight results into pi_dct. */
 void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8],
                               uint8_t *p_pix1, uint8_t *p_pix2 )
 {
    int32_t a[8];
    int32_t b[8];
    for( int32_t i_blk = 0; i_blk < 8; i_blk++ )
    {
        /* even indices are the left block of a row, odd indices the right one */
        int32_t i_y = 4 * ( i_blk >> 1 );
        int32_t i_x = 4 * ( i_blk & 1 );
        a[i_blk] = subtract_sum4x4_msa( &p_pix1[i_y * FENC_STRIDE + i_x],
                                        FENC_STRIDE,
                                        &p_pix2[i_y * FDEC_STRIDE + i_x],
                                        FDEC_STRIDE );
    }
    /* the permuted argument lists below are part of the transform; keep them as is */
    BUTTERFLY_8( a[0], a[2], a[4], a[6], a[7], a[5], a[3], a[1],
                 b[0], b[1], b[2], b[3], b[7], b[6], b[5], b[4] );
    BUTTERFLY_8( b[0], b[2], b[4], b[6], b[7], b[5], b[3], b[1],
                 a[0], a[1], a[2], a[3], a[7], a[6], a[5], a[4] );
    BUTTERFLY_8( a[0], a[2], a[4], a[6], a[7], a[5], a[3], a[1],
                 pi_dct[0], pi_dct[1], pi_dct[6], pi_dct[7],
                 pi_dct[5], pi_dct[4], pi_dct[3], pi_dct[2] );
 }
 /* Public entry point: reorders the 16 coefficients of pi_dct into pi_level.
  * The internal kernel takes its arguments in the opposite order (input
  * first), hence the swap. */
 void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] )
 {
    int16_t *p_in = pi_dct;
    int16_t *p_out = pi_level;
    avc_zigzag_scan_4x4_frame_msa( p_in, p_out );
 }
 #endif
--- a/common/mips/dct.h
+++ b/common/mips/dct.h
@@ -0,0 +1,64 @@
 /*****************************************************************************
 * dct.h: msa transform and zigzag
 *****************************************************************************
 * Copyright (C) 2015-2025 x264 project
 *
 * Authors: Rishikesh More <rishikesh.more@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_MIPS_DCT_H
 #define X264_MIPS_DCT_H
 /* MSA transform entry points.  Every symbol is renamed through
  * x264_template() before its prototype is declared. */
 /* In-place forward / inverse 4x4 DC transforms on 16 coefficients. */
 #define x264_dct4x4dc_msa x264_template(dct4x4dc_msa)
 void x264_dct4x4dc_msa( int16_t d[16] );
 #define x264_idct4x4dc_msa x264_template(idct4x4dc_msa)
 void x264_idct4x4dc_msa( int16_t d[16] );
 /* Inverse transform + add to the pixels at p_dst: 4x4-based variants for one
  * block, an 8x8 area (4 blocks) and a 16x16 area (16 blocks). */
 #define x264_add4x4_idct_msa x264_template(add4x4_idct_msa)
 void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] );
 #define x264_add8x8_idct_msa x264_template(add8x8_idct_msa)
 void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] );
 #define x264_add16x16_idct_msa x264_template(add16x16_idct_msa)
 void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] );
 /* Inverse transform + add using 8x8 transforms: one block / four blocks. */
 #define x264_add8x8_idct8_msa x264_template(add8x8_idct8_msa)
 void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] );
 #define x264_add16x16_idct8_msa x264_template(add16x16_idct8_msa)
 void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] );
 /* DC-only inverse transform + add; pi_dct holds one DC value per 4x4 block. */
 #define x264_add8x8_idct_dc_msa x264_template(add8x8_idct_dc_msa)
 void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] );
 #define x264_add16x16_idct_dc_msa x264_template(add16x16_idct_dc_msa)
 void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] );
 /* Forward transform of the difference between p_src and p_ref, 4x4-based,
  * for one block, an 8x8 area and a 16x16 area. */
 #define x264_sub4x4_dct_msa x264_template(sub4x4_dct_msa)
 void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src, uint8_t *p_ref );
 #define x264_sub8x8_dct_msa x264_template(sub8x8_dct_msa)
 void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src,
                           uint8_t *p_ref );
 #define x264_sub16x16_dct_msa x264_template(sub16x16_dct_msa)
 void x264_sub16x16_dct_msa( int16_t p_dst[16][16], uint8_t *p_src,
                             uint8_t *p_ref );
 /* DC-only forward variants: one output value per 4x4 block of an 8x8 or
  * 8x16 area. */
 #define x264_sub8x8_dct_dc_msa x264_template(sub8x8_dct_dc_msa)
 void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4], uint8_t *p_pix1,
                              uint8_t *p_pix2 );
 #define x264_sub8x16_dct_dc_msa x264_template(sub8x16_dct_dc_msa)
 void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8], uint8_t *p_pix1,
                               uint8_t *p_pix2 );
 /* Coefficient reordering of one 4x4 block: pi_level is the output, pi_dct
  * the input. */
 #define x264_zigzag_scan_4x4_frame_msa x264_template(zigzag_scan_4x4_frame_msa)
 void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] );
 #endif
--- a/common/mips/deblock-c.c
+++ b/common/mips/deblock-c.c
--- a/common/mips/deblock.h
+++ b/common/mips/deblock.h
@@ -0,0 +1,52 @@
 /*****************************************************************************
 * deblock.h: msa deblocking
 *****************************************************************************
 * Copyright (C) 2017-2025 x264 project
 *
 * Authors: Anton Mitrofanov <BugMaster@narod.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_MIPS_DEBLOCK_H
 #define X264_MIPS_DEBLOCK_H
 #if !HIGH_BIT_DEPTH
 /* MSA deblocking entry points, declared only for !HIGH_BIT_DEPTH builds.
  * Every symbol is renamed through x264_template().
  * NOTE(review): the implementations (deblock-c.c) are not visible here;
  * presumably _v_/_h_ select the edge direction and alpha/beta are the filter
  * thresholds - confirm against the definitions. */
 /* Luma / chroma filters that take a tc0 array. */
 #define x264_deblock_v_luma_msa x264_template(deblock_v_luma_msa)
 void x264_deblock_v_luma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 #define x264_deblock_h_luma_msa x264_template(deblock_h_luma_msa)
 void x264_deblock_h_luma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 #define x264_deblock_v_chroma_msa x264_template(deblock_v_chroma_msa)
 void x264_deblock_v_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 #define x264_deblock_h_chroma_msa x264_template(deblock_h_chroma_msa)
 void x264_deblock_h_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 /* "intra" variants: same leading parameters, no tc0 argument. */
 #define x264_deblock_v_luma_intra_msa x264_template(deblock_v_luma_intra_msa)
 void x264_deblock_v_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
 #define x264_deblock_h_luma_intra_msa x264_template(deblock_h_luma_intra_msa)
 void x264_deblock_h_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
 #define x264_deblock_v_chroma_intra_msa x264_template(deblock_v_chroma_intra_msa)
 void x264_deblock_v_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
 #define x264_deblock_h_chroma_intra_msa x264_template(deblock_h_chroma_intra_msa)
 void x264_deblock_h_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
 /* Strength computation; presumably fills bs from nnz / ref / mv - confirm
  * against the definition. */
 #define x264_deblock_strength_msa x264_template(deblock_strength_msa)
 void x264_deblock_strength_msa( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                 int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit,
                                 int bframe );
 #endif
 #endif
--- a/common/mips/macros.h
+++ b/common/mips/macros.h
--- a/common/mips/mc-c.c
+++ b/common/mips/mc-c.c
--- a/common/mips/mc.h
+++ b/common/mips/mc.h
@@ -0,0 +1,32 @@
 /*****************************************************************************
 * mc.h: msa motion compensation
 *****************************************************************************
 * Copyright (C) 2015-2025 x264 project
 *
 * Authors: Neha Rana <neha.rana@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_MIPS_MC_H
 #define X264_MIPS_MC_H
 /* MIPS-specific initialiser for an x264_mc_functions_t table; the symbol is
  * renamed through x264_template().
  * NOTE(review): presumably installs MSA implementations into `pf` when `cpu`
  * reports support - the definition (mc-c.c) is not visible here, confirm. */
 #define x264_mc_init_mips x264_template(mc_init_mips)
 void x264_mc_init_mips( uint32_t cpu, x264_mc_functions_t *pf );
 #endif
--- a/common/mips/pixel-c.c
+++ b/common/mips/pixel-c.c
--- a/common/mips/pixel.h
+++ b/common/mips/pixel.h
@@ -0,0 +1,228 @@
 /*****************************************************************************
 * pixel.h: msa pixel metrics
 *****************************************************************************
 * Copyright (C) 2015-2025 x264 project
 *
 * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #ifndef X264_MIPS_PIXEL_H
 #define X264_MIPS_PIXEL_H
 /* Two-block "sad" metrics, one entry point per block size (16x16, 16x8,
 * 8x16, 8x8, 8x4, 4x16, 4x8, 4x4).  Every variant takes a source pointer with
 * its stride and a reference pointer with its stride and returns an int32_t.
 * "sad" is presumably sum of absolute differences; the definitions are not
 * part of this header.  Each public name is wrapped by x264_template(). */
 #define x264_pixel_sad_16x16_msa x264_template(pixel_sad_16x16_msa)
 int32_t x264_pixel_sad_16x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                  uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_sad_16x8_msa x264_template(pixel_sad_16x8_msa)
 int32_t x264_pixel_sad_16x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_sad_8x16_msa x264_template(pixel_sad_8x16_msa)
 int32_t x264_pixel_sad_8x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_sad_8x8_msa x264_template(pixel_sad_8x8_msa)
 int32_t x264_pixel_sad_8x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_sad_8x4_msa x264_template(pixel_sad_8x4_msa)
 int32_t x264_pixel_sad_8x4_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_sad_4x16_msa x264_template(pixel_sad_4x16_msa)
 int32_t x264_pixel_sad_4x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_sad_4x8_msa x264_template(pixel_sad_4x8_msa)
 int32_t x264_pixel_sad_4x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_sad_4x4_msa x264_template(pixel_sad_4x4_msa)
 int32_t x264_pixel_sad_4x4_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride );
 /* "sad_x4" variants: one source block against four reference blocks in a
 * single call.  Only one stride (i_ref_stride) is passed, so the source
 * presumably uses a fixed stride known to the implementation -- confirm in
 * the definition.  Results go to the caller-provided p_sad_array[4]
 * (presumably one entry per reference, in argument order).  Note that there
 * is no 4x16 size in this group. */
 #define x264_pixel_sad_x4_16x16_msa x264_template(pixel_sad_x4_16x16_msa)
 void x264_pixel_sad_x4_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
                                  uint8_t *p_ref1, uint8_t *p_ref2,
                                  uint8_t *p_ref3, intptr_t i_ref_stride,
                                  int32_t p_sad_array[4] );
 #define x264_pixel_sad_x4_16x8_msa x264_template(pixel_sad_x4_16x8_msa)
 void x264_pixel_sad_x4_16x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 uint8_t *p_ref3, intptr_t i_ref_stride,
                                 int32_t p_sad_array[4] );
 #define x264_pixel_sad_x4_8x16_msa x264_template(pixel_sad_x4_8x16_msa)
 void x264_pixel_sad_x4_8x16_msa( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 uint8_t *p_ref3, intptr_t i_ref_stride,
                                 int32_t p_sad_array[4] );
 #define x264_pixel_sad_x4_8x8_msa x264_template(pixel_sad_x4_8x8_msa)
 void x264_pixel_sad_x4_8x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] );
 #define x264_pixel_sad_x4_8x4_msa x264_template(pixel_sad_x4_8x4_msa)
 void x264_pixel_sad_x4_8x4_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] );
 #define x264_pixel_sad_x4_4x8_msa x264_template(pixel_sad_x4_4x8_msa)
 void x264_pixel_sad_x4_4x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] );
 #define x264_pixel_sad_x4_4x4_msa x264_template(pixel_sad_x4_4x4_msa)
 void x264_pixel_sad_x4_4x4_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] );
 /* "sad_x3" variants: same shape as the sad_x4 prototypes but with three
 * reference pointers and a three-entry p_sad_array.  Only the reference
 * stride is passed; how the source block is strided is not visible from
 * this header -- confirm in the definition. */
 #define x264_pixel_sad_x3_16x16_msa x264_template(pixel_sad_x3_16x16_msa)
 void x264_pixel_sad_x3_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
                                  uint8_t *p_ref1, uint8_t *p_ref2,
                                  intptr_t i_ref_stride,
                                  int32_t p_sad_array[3] );
 #define x264_pixel_sad_x3_16x8_msa x264_template(pixel_sad_x3_16x8_msa)
 void x264_pixel_sad_x3_16x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 intptr_t i_ref_stride,
                                 int32_t p_sad_array[3] );
 #define x264_pixel_sad_x3_8x16_msa x264_template(pixel_sad_x3_8x16_msa)
 void x264_pixel_sad_x3_8x16_msa( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 intptr_t i_ref_stride,
                                 int32_t p_sad_array[3] );
 #define x264_pixel_sad_x3_8x8_msa x264_template(pixel_sad_x3_8x8_msa)
 void x264_pixel_sad_x3_8x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] );
 #define x264_pixel_sad_x3_8x4_msa x264_template(pixel_sad_x3_8x4_msa)
 void x264_pixel_sad_x3_8x4_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] );
 #define x264_pixel_sad_x3_4x8_msa x264_template(pixel_sad_x3_4x8_msa)
 void x264_pixel_sad_x3_4x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] );
 #define x264_pixel_sad_x3_4x4_msa x264_template(pixel_sad_x3_4x4_msa)
 void x264_pixel_sad_x3_4x4_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] );
 /* Two-block "ssd" metrics with the same signature shape and the same eight
 * block sizes as the x264_pixel_sad_*_msa prototypes: source pointer and
 * stride, reference pointer and stride, int32_t result.  "ssd" is presumably
 * sum of squared differences; the definitions are not part of this header. */
 #define x264_pixel_ssd_16x16_msa x264_template(pixel_ssd_16x16_msa)
 int32_t x264_pixel_ssd_16x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                  uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_ssd_16x8_msa x264_template(pixel_ssd_16x8_msa)
 int32_t x264_pixel_ssd_16x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_ssd_8x16_msa x264_template(pixel_ssd_8x16_msa)
 int32_t x264_pixel_ssd_8x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_ssd_8x8_msa x264_template(pixel_ssd_8x8_msa)
 int32_t x264_pixel_ssd_8x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_ssd_8x4_msa x264_template(pixel_ssd_8x4_msa)
 int32_t x264_pixel_ssd_8x4_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_ssd_4x16_msa x264_template(pixel_ssd_4x16_msa)
 int32_t x264_pixel_ssd_4x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_ssd_4x8_msa x264_template(pixel_ssd_4x8_msa)
 int32_t x264_pixel_ssd_4x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride );
 #define x264_pixel_ssd_4x4_msa x264_template(pixel_ssd_4x4_msa)
 int32_t x264_pixel_ssd_4x4_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride );
 /* "intra_sad_x3" prototypes: each writes into a three-entry p_sad_array
 * (presumably one cost per candidate intra mode -- confirm in the
 * definition).  The 4x4, 16x16 and 8x8c variants take an encode pointer and
 * a decode pointer with no stride arguments; the 8x8 variant takes a
 * 36-byte p_edge array instead of the decode pointer. */
 #define x264_intra_sad_x3_4x4_msa x264_template(intra_sad_x3_4x4_msa)
 void x264_intra_sad_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec,
                                int32_t p_sad_array[3] );
 #define x264_intra_sad_x3_16x16_msa x264_template(intra_sad_x3_16x16_msa)
 void x264_intra_sad_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec,
                                  int32_t p_sad_array[3] );
 #define x264_intra_sad_x3_8x8_msa x264_template(intra_sad_x3_8x8_msa)
 void x264_intra_sad_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36],
                                int32_t p_sad_array[3] );
 #define x264_intra_sad_x3_8x8c_msa x264_template(intra_sad_x3_8x8c_msa)
 void x264_intra_sad_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec,
                                 int32_t p_sad_array[3] );
 /* SSIM core prototype: reads two const pixel buffers, each with its own
 * stride, and writes into the caller-provided i_sums[2][4] array.  The name
 * suggests two adjacent 4x4 blocks with four partial sums each -- confirm
 * the exact layout in the definition. */
 #define x264_ssim_4x4x2_core_msa x264_template(ssim_4x4x2_core_msa)
 void x264_ssim_4x4x2_core_msa( const uint8_t *p_pix1, intptr_t i_stride1,
                               const uint8_t *p_pix2, intptr_t i_stride2,
                               int32_t i_sums[2][4] );
 /* "hadamard_ac" prototypes for 8x8, 8x16, 16x8 and 16x16 blocks.  Each takes
 * a single pixel pointer and stride and returns a uint64_t; how that 64-bit
 * value is packed is not visible from this header -- confirm in the
 * definition before unpacking it. */
 #define x264_pixel_hadamard_ac_8x8_msa x264_template(pixel_hadamard_ac_8x8_msa)
 uint64_t x264_pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, intptr_t i_stride );
 #define x264_pixel_hadamard_ac_8x16_msa x264_template(pixel_hadamard_ac_8x16_msa)
 uint64_t x264_pixel_hadamard_ac_8x16_msa( uint8_t *p_pix, intptr_t i_stride );
 #define x264_pixel_hadamard_ac_16x8_msa x264_template(pixel_hadamard_ac_16x8_msa)
 uint64_t x264_pixel_hadamard_ac_16x8_msa( uint8_t *p_pix, intptr_t i_stride );
 #define x264_pixel_hadamard_ac_16x16_msa x264_template(pixel_hadamard_ac_16x16_msa)
 uint64_t x264_pixel_hadamard_ac_16x16_msa( uint8_t *p_pix, intptr_t i_stride );
 /* Two-block "satd" (eight sizes, 4x4 .. 16x16) and "sa8d" (8x8 and 16x16)
 * metrics.  All share one signature shape: two pixel pointers, each followed
 * by its stride, and an int32_t result.  "satd"/"sa8d" are presumably
 * transform-domain difference metrics; the definitions are not part of this
 * header. */
 #define x264_pixel_satd_4x4_msa x264_template(pixel_satd_4x4_msa)
 int32_t x264_pixel_satd_4x4_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_4x8_msa x264_template(pixel_satd_4x8_msa)
 int32_t x264_pixel_satd_4x8_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_4x16_msa x264_template(pixel_satd_4x16_msa)
 int32_t x264_pixel_satd_4x16_msa( uint8_t *p_pix1, intptr_t i_stride,
                                  uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_8x4_msa x264_template(pixel_satd_8x4_msa)
 int32_t x264_pixel_satd_8x4_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_8x8_msa x264_template(pixel_satd_8x8_msa)
 int32_t x264_pixel_satd_8x8_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_8x16_msa x264_template(pixel_satd_8x16_msa)
 int32_t x264_pixel_satd_8x16_msa( uint8_t *p_pix1, intptr_t i_stride,
                                  uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_16x8_msa x264_template(pixel_satd_16x8_msa)
 int32_t x264_pixel_satd_16x8_msa( uint8_t *p_pix1, intptr_t i_stride,
                                  uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_satd_16x16_msa x264_template(pixel_satd_16x16_msa)
 int32_t x264_pixel_satd_16x16_msa( uint8_t *p_pix1, intptr_t i_stride,
                                   uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_sa8d_8x8_msa x264_template(pixel_sa8d_8x8_msa)
 int32_t x264_pixel_sa8d_8x8_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 );
 #define x264_pixel_sa8d_16x16_msa x264_template(pixel_sa8d_16x16_msa)
 int32_t x264_pixel_sa8d_16x16_msa( uint8_t *p_pix1, intptr_t i_stride,
                                   uint8_t *p_pix2, intptr_t i_stride2 );
 /* "intra_satd_x3" / "intra_sa8d_x3" prototypes, mirroring the
 * x264_intra_sad_x3_*_msa signatures: an encode pointer, either a decode
 * pointer (4x4, 16x16, 8x8c) or a 36-byte p_edge array (8x8), and a
 * three-entry output array.  The output parameter keeps the name
 * p_sad_array even though these are satd/sa8d variants. */
 #define x264_intra_satd_x3_4x4_msa x264_template(intra_satd_x3_4x4_msa)
 void x264_intra_satd_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec,
                                 int32_t p_sad_array[3] );
 #define x264_intra_satd_x3_16x16_msa x264_template(intra_satd_x3_16x16_msa)
 void x264_intra_satd_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec,
                                   int32_t p_sad_array[3] );
 #define x264_intra_sa8d_x3_8x8_msa x264_template(intra_sa8d_x3_8x8_msa)
 void x264_intra_sa8d_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36],
                                 int32_t p_sad_array[3] );
 #define x264_intra_satd_x3_8x8c_msa x264_template(intra_satd_x3_8x8c_msa)
 void x264_intra_satd_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec,
                                  int32_t p_sad_array[3] );
 /* "var" and "var2" prototypes.  The var variants (16x16, 8x16, 8x8) take one
 * pixel pointer and stride and return a uint64_t whose packing is not
 * visible from this header.  The var2 variants (8x16, 8x8) take two pixel
 * pointers with their strides plus an int32_t *p_ssd (presumably an extra
 * output -- confirm in the definition) and return an int32_t. */
 #define x264_pixel_var_16x16_msa x264_template(pixel_var_16x16_msa)
 uint64_t x264_pixel_var_16x16_msa( uint8_t *p_pix, intptr_t i_stride );
 #define x264_pixel_var_8x16_msa x264_template(pixel_var_8x16_msa)
 uint64_t x264_pixel_var_8x16_msa( uint8_t *p_pix, intptr_t i_stride );
 #define x264_pixel_var_8x8_msa x264_template(pixel_var_8x8_msa)
 uint64_t x264_pixel_var_8x8_msa( uint8_t *p_pix, intptr_t i_stride );
 #define x264_pixel_var2_8x16_msa x264_template(pixel_var2_8x16_msa)
 int32_t x264_pixel_var2_8x16_msa( uint8_t *p_pix1, intptr_t i_stride1,
                                  uint8_t *p_pix2, intptr_t i_stride2,
                                  int32_t *p_ssd );
 #define x264_pixel_var2_8x8_msa x264_template(pixel_var2_8x8_msa)
 int32_t x264_pixel_var2_8x8_msa( uint8_t *p_pix1, intptr_t i_stride1,
                                 uint8_t *p_pix2, intptr_t i_stride2,
                                 int32_t *p_ssd );
 #endif
--- a/common/mips/predict-c.c
+++ b/common/mips/predict-c.c
@@ -0,0 +1,608 @@
 /*****************************************************************************
 * predict-c.c: msa intra prediction
 *****************************************************************************
 * Copyright (C) 2015-2025 x264 project
 *
 * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
 #include "common/common.h"
 #include "macros.h"
 #include "predict.h"
 #if !HIGH_BIT_DEPTH
 /* 4x4 vertical prediction helper: fetches one 32-bit word from p_src with
 * LW() and hands that same word to SW4() for each of its four value slots,
 * with p_dst / i_dst_stride as the destination (LW and SW4 come from
 * macros.h). */
 static void intra_predict_vert_4x4_msa( uint8_t *p_src, uint8_t *p_dst,
                                         int32_t i_dst_stride )
 {
    uint32_t u_top_word = LW( p_src );
    SW4( u_top_word, u_top_word, u_top_word, u_top_word, p_dst, i_dst_stride );
 }
 /* 8x8 vertical prediction helper: loads one 64-bit value from p_src with
 * LD() and issues two SD4() calls with it, the first at p_dst and the second
 * four strides further down. */
 static void intra_predict_vert_8x8_msa( uint8_t *p_src, uint8_t *p_dst,
                                         int32_t i_dst_stride )
 {
    uint64_t u_top_row = LD( p_src );
    uint8_t *p_lower_half = p_dst + ( 4 * i_dst_stride );
    SD4( u_top_row, u_top_row, u_top_row, u_top_row, p_dst, i_dst_stride );
    SD4( u_top_row, u_top_row, u_top_row, u_top_row, p_lower_half,
         i_dst_stride );
 }
 /* 16x16 vertical prediction helper: loads one 16-byte vector from p_src and
 * issues two ST_UB8() calls with it, the first at p_dst and the second eight
 * strides further down. */
 static void intra_predict_vert_16x16_msa( uint8_t *p_src, uint8_t *p_dst,
                                           int32_t i_dst_stride )
 {
    v16u8 top_row = LD_UB( p_src );
    uint8_t *p_lower_half = p_dst + ( 8 * i_dst_stride );
    ST_UB8( top_row, top_row, top_row, top_row, top_row, top_row, top_row,
            top_row, p_dst, i_dst_stride );
    ST_UB8( top_row, top_row, top_row, top_row, top_row, top_row, top_row,
            top_row, p_lower_half, i_dst_stride );
 }
 /* 4x4 horizontal prediction helper.
 *
 * Reads one byte from each of four source rows (p_src, stepping by
 * i_src_stride), replicates that byte into all four bytes of a 32-bit word,
 * and passes the four words to SW4() with p_dst / i_dst_stride as the
 * destination.
 *
 * The replication constant is deliberately unsigned.  A uint8_t operand is
 * promoted to int, and int * 0x01010101 exceeds INT_MAX for any pixel value
 * >= 128, which is signed overflow (undefined behaviour).  With an unsigned
 * constant the multiplication is carried out in unsigned arithmetic, which
 * is well defined and yields the intended bit pattern. */
 static void intra_predict_horiz_4x4_msa( uint8_t *p_src, int32_t i_src_stride,
                                          uint8_t *p_dst, int32_t i_dst_stride )
 {
    const uint32_t u_splat = 0x01010101U;
    uint32_t u_out0, u_out1, u_out2, u_out3;
    u_out0 = p_src[0 * i_src_stride] * u_splat;
    u_out1 = p_src[1 * i_src_stride] * u_splat;
    u_out2 = p_src[2 * i_src_stride] * u_splat;
    u_out3 = p_src[3 * i_src_stride] * u_splat;
    SW4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
 }
 /* 8x8 horizontal prediction helper: reads one byte from each of eight source
 * rows (stepping by i_src_stride), replicates it across a 64-bit value, and
 * passes the values to two SD4() calls -- rows 0..3 at p_dst, rows 4..7 four
 * strides further down.  All source bytes are read before anything is
 * stored. */
 static void intra_predict_horiz_8x8_msa( uint8_t *p_src, int32_t i_src_stride,
                                          uint8_t *p_dst, int32_t i_dst_stride )
 {
    const uint64_t u_splat = 0x0101010101010101ull;
    uint64_t u_row0 = p_src[0 * i_src_stride] * u_splat;
    uint64_t u_row1 = p_src[1 * i_src_stride] * u_splat;
    uint64_t u_row2 = p_src[2 * i_src_stride] * u_splat;
    uint64_t u_row3 = p_src[3 * i_src_stride] * u_splat;
    uint64_t u_row4 = p_src[4 * i_src_stride] * u_splat;
    uint64_t u_row5 = p_src[5 * i_src_stride] * u_splat;
    uint64_t u_row6 = p_src[6 * i_src_stride] * u_splat;
    uint64_t u_row7 = p_src[7 * i_src_stride] * u_splat;
    uint8_t *p_lower_half = p_dst + ( 4 * i_dst_stride );
    SD4( u_row0, u_row1, u_row2, u_row3, p_dst, i_dst_stride );
    SD4( u_row4, u_row5, u_row6, u_row7, p_lower_half, i_dst_stride );
 }
 static void intra_predict_horiz_16x16_msa( uint8_t *p_src, int32_t i_src_stride,
                                           uint8_t *p_dst,
                                           int32_t i_dst_stride )
 {
    uint32_t u_row;
    uint8_t u_inp0, u_inp1, u_inp2, u_inp3;
    v16u8 src0, src1, src2, src3;
    for( u_row = 4; u_row--; )
    {
        u_inp0 = p_src[0];
        p_src += i_src_stride;
        u_inp1 = p_src[0];
        p_src += i_src_stride;
        u_inp2 = p_src[0];
        p_src += i_src_stride;
        u_inp3 = p_src[0];
        p_src += i_src_stride;
        src0 = ( v16u8 ) __msa_fill_b( u_inp0 );
        src1 = ( v16u8 ) __msa_fill_b( u_inp1 );
        src2 = ( v16u8 ) __msa_fill_b( u_inp2 );
        src3 = ( v16u8 ) __msa_fill_b( u_inp3 );
        ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }
 }
 static void intra_predict_dc_4x4_msa( uint8_t *p_src_top, uint8_t *p_src_left,
                                      int32_t i_src_stride_left,
                                      uint8_t *p_dst, int32_t i_dst_stride,
                                      uint8_t is_above, uint8_t is_left )
 {
    uint32_t u_row;
    uint32_t u_out, u_addition = 0;
    v16u8 src_above, store;
    v8u16 sum_above;
    v4u32 sum;
    if( is_left && is_above )
    {
        src_above = LD_UB( p_src_top );
        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum = __msa_hadd_u_w( sum_above, sum_above );
        u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 );
        for( u_row = 0; u_row < 4; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }
        u_addition = ( u_addition + 4 ) >> 3;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if( is_left )
    {
        for( u_row = 0; u_row < 4; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }
        u_addition = ( u_addition + 2 ) >> 2;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if( is_above )
    {
        src_above = LD_UB( p_src_top );
        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum = __msa_hadd_u_w( sum_above, sum_above );
        sum = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum, 2 );
        store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 );
    }
    else
    {
        store = ( v16u8 ) __msa_ldi_b( 128 );
    }
    u_out = __msa_copy_u_w( ( v4i32 ) store, 0 );
    SW4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride );
 }
 static void intra_predict_dc_8x8_msa( uint8_t *p_src_top, uint8_t *p_src_left,
                                      uint8_t *p_dst, int32_t i_dst_stride )
 {
    uint64_t u_val0, u_val1;
    v16i8 store;
    v16u8 src = { 0 };
    v8u16 sum_h;
    v4u32 sum_w;
    v2u64 sum_d;
    u_val0 = LD( p_src_top );
    u_val1 = LD( p_src_left );
    INSERT_D2_UB( u_val0, u_val1, src );
    sum_h = __msa_hadd_u_h( src, src );
    sum_w = __msa_hadd_u_w( sum_h, sum_h );
    sum_d = __msa_hadd_u_d( sum_w, sum_w );
    sum_w = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum_d, ( v4i32 ) sum_d );
    sum_d = __msa_hadd_u_d( sum_w, sum_w );
    sum_w = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum_d, 4 );
    store = __msa_splati_b( ( v16i8 ) sum_w, 0 );
    u_val0 = __msa_copy_u_d( ( v2i64 ) store, 0 );
    SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride );
 }
 static void intra_predict_dc_16x16_msa( uint8_t *p_src_top, uint8_t *p_src_left,
                                        int32_t i_src_stride_left,
                                        uint8_t *p_dst, int32_t i_dst_stride,
                                        uint8_t is_above, uint8_t is_left )
 {
    uint32_t u_row;
    uint32_t u_addition = 0;
    v16u8 src_above, store;
    v8u16 sum_above;
    v4u32 sum_top;
    v2u64 sum;
    if( is_left && is_above )
    {
        src_above = LD_UB( p_src_top );
        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum_top = __msa_hadd_u_w( sum_above, sum_above );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 );
        for( u_row = 0; u_row < 16; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }
        u_addition = ( u_addition + 16 ) >> 5;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if( is_left )
    {
        for( u_row = 0; u_row < 16; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }
        u_addition = ( u_addition + 8 ) >> 4;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if( is_above )
    {
        src_above = LD_UB( p_src_top );
        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum_top = __msa_hadd_u_w( sum_above, sum_above );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        sum = ( v2u64 ) __msa_srari_d( ( v2i64 ) sum, 4 );
        store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 );
    }
    else
    {
        store = ( v16u8 ) __msa_ldi_b( 128 );
    }
    ST_UB8( store, store, store, store, store, store, store, store, p_dst,
            i_dst_stride );
    p_dst += ( 8 * i_dst_stride );
    ST_UB8( store, store, store, store, store, store, store, store, p_dst,
            i_dst_stride );
 }
 /* In-place plane prediction for the 8x8 block whose top-left pixel is p_src;
 * rows are i_stride bytes apart.  Neighbours are read from the row above
 * (starting at the top-left corner, p_src - i_stride - 1) and from the column
 * to the left (p_src[k * i_stride - 1]); all eight rows at p_src are then
 * overwritten.  Statement order matters: every neighbour is read before the
 * first store. */
 static void intra_predict_plane_8x8_msa( uint8_t *p_src, int32_t i_stride )
 {
    uint8_t u_lpcnt;
    int32_t i_res, i_res0, i_res1, i_res2, i_res3;
    uint64_t u_out0, u_out1;
    /* Shuffle indices into the loaded top row: puts bytes (3,5) (2,6) (1,7)
     * (0,8) side by side so the horizontal subtract below can difference
     * each pair.  Index 0 is the top-left corner pixel. */
    v16i8 shf_mask = { 3, 5, 2, 6, 1, 7, 0, 8, 3, 5, 2, 6, 1, 7, 0, 8 };
    /* Weights 1..4 applied to the four pair differences. */
    v8i16 short_multiplier = { 1, 2, 3, 4, 1, 2, 3, 4 };
    /* Column offsets 0..3 within one 4-pixel vector. */
    v4i32 int_multiplier = { 0, 1, 2, 3 };
    v16u8 p_src_top;
    v8i16 vec9, vec10, vec11;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8;
    v2i64 sum;
    p_src_top = LD_UB( p_src - ( i_stride + 1 ) );
    p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top,
                                        ( v16i8 ) p_src_top );
    /* Horizontal gradient: weighted pair differences of the top row, reduced
     * to a single value taken from word 0 of `sum`. */
    vec9 = __msa_hsub_u_h( p_src_top, p_src_top );
    vec9 *= short_multiplier;
    vec8 = __msa_hadd_s_w( vec9, vec9 );
    sum = __msa_hadd_s_d( vec8, vec8 );
    i_res0 = __msa_copy_s_w( ( v4i32 ) sum, 0 );
    /* Vertical gradient: the same weighted differences, taken on the left
     * column (the last term uses the top-left corner pixel). */
    i_res1 = ( p_src[4 * i_stride - 1] - p_src[2 * i_stride - 1] ) +
             2 * ( p_src[5 * i_stride - 1] - p_src[i_stride - 1] ) +
             3 * ( p_src[6 * i_stride - 1] - p_src[-1] ) +
             4 * ( p_src[7 * i_stride - 1] - p_src[-i_stride - 1] );
    /* Scale both gradients: ( 17 * g + 16 ) >> 5. */
    i_res0 *= 17;
    i_res1 *= 17;
    i_res0 = ( i_res0 + 16 ) >> 5;
    i_res1 = ( i_res1 + 16 ) >> 5;
    /* Value for pixel (0,0) before the final >> 5: 16 * ( bottom-left +
     * top-right + 1 ) minus three steps of each gradient. */
    i_res3 = 3 * ( i_res0 + i_res1 );
    i_res2 = 16 * ( p_src[7 * i_stride - 1] + p_src[-i_stride + 7] + 1 );
    i_res = i_res2 - i_res3;
    vec8 = __msa_fill_w( i_res0 );
    vec4 = __msa_fill_w( i_res );
    vec2 = __msa_fill_w( i_res1 );
    /* vec5: per-column horizontal offsets for columns 0..3;
     * vec3: additional offset that turns columns 0..3 into columns 4..7. */
    vec5 = vec8 * int_multiplier;
    vec3 = vec8 * 4;
    /* Each iteration produces two rows: vec0/vec1 hold the left and right
     * halves of the first, vec6/vec7 those of the second.  vec4 carries the
     * row base and advances by the vertical gradient once per row. */
    for( u_lpcnt = 4; u_lpcnt--; )
    {
        vec0 = vec5;
        vec0 += vec4;
        vec1 = vec0 + vec3;
        vec6 = vec5;
        vec4 += vec2;
        vec6 += vec4;
        vec7 = vec6 + vec3;
        /* The macros below are defined in macros.h; judging by their names
         * they shift right arithmetically by 5, pack words to halfwords,
         * clip to 0..255 and pack to bytes -- confirm there. */
        SRA_4V( vec0, vec1, vec6, vec7, 5 );
        PCKEV_H2_SH( vec1, vec0, vec7, vec6, vec10, vec11 );
        CLIP_SH2_0_255( vec10, vec11 );
        PCKEV_B2_SH( vec10, vec10, vec11, vec11, vec10, vec11 );
        u_out0 = __msa_copy_s_d( ( v2i64 ) vec10, 0 );
        u_out1 = __msa_copy_s_d( ( v2i64 ) vec11, 0 );
        SD( u_out0, p_src );
        p_src += i_stride;
        SD( u_out1, p_src );
        p_src += i_stride;
        vec4 += vec2;
    }
 }
 /* In-place plane prediction for the 16x16 block whose top-left pixel is
 * p_src; rows are i_stride bytes apart.  Neighbours are read from the row
 * above (starting at the top-left corner, p_src - i_stride - 1) and from the
 * column to the left (p_src[k * i_stride - 1]); all sixteen rows at p_src
 * are then overwritten.  Statement order matters: every neighbour is read
 * before the first store. */
 static void intra_predict_plane_16x16_msa( uint8_t *p_src, int32_t i_stride )
 {
    uint8_t u_lpcnt;
    int32_t i_res0, i_res1, i_res2, i_res3;
    uint64_t u_load0, u_load1;
    /* Shuffle indices into the packed top-row vector: pairs (7,8) (6,9) ...
     * (0,15) are placed side by side for the horizontal subtract below. */
    v16i8 shf_mask = { 7, 8, 6, 9, 5, 10, 4, 11, 3, 12, 2, 13, 1, 14, 0, 15 };
    /* Weights 1..8 applied to the eight pair differences. */
    v8i16 short_multiplier = { 1, 2, 3, 4, 5, 6, 7, 8 };
    /* Column offsets 0..3 within one 4-pixel vector. */
    v4i32 int_multiplier = { 0, 1, 2, 3 };
    v16u8 p_src_top = { 0 };
    v8i16 vec9, vec10;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, res_add;
    /* Two 8-byte loads from the row above, counted from the top-left corner:
     * byte offsets 0..7 and 9..16.  Offset 8 is not loaded. */
    u_load0 = LD( p_src - ( i_stride + 1 ) );
    u_load1 = LD( p_src - ( i_stride + 1 ) + 9 );
    INSERT_D2_UB( u_load0, u_load1, p_src_top );
    p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top,
                                        ( v16i8 ) p_src_top );
    /* Horizontal gradient: weighted pair differences, reduced to two 64-bit
     * partial sums whose low words (elements 0 and 2) are added together. */
    vec9 = __msa_hsub_u_h( p_src_top, p_src_top );
    vec9 *= short_multiplier;
    vec8 = __msa_hadd_s_w( vec9, vec9 );
    res_add = ( v4i32 ) __msa_hadd_s_d( vec8, vec8 );
    i_res0 = __msa_copy_s_w( res_add, 0 ) + __msa_copy_s_w( res_add, 2 );
    /* Vertical gradient: the same weighted differences, taken on the left
     * column (the last term uses the top-left corner pixel). */
    i_res1 = ( p_src[8 * i_stride - 1] - p_src[6 * i_stride - 1] ) +
             2 * ( p_src[9 * i_stride - 1] - p_src[5 * i_stride - 1] ) +
             3 * ( p_src[10 * i_stride - 1] - p_src[4 * i_stride - 1] ) +
             4 * ( p_src[11 * i_stride - 1] - p_src[3 * i_stride - 1] ) +
             5 * ( p_src[12 * i_stride - 1] - p_src[2 * i_stride - 1] ) +
             6 * ( p_src[13 * i_stride - 1] - p_src[i_stride - 1] ) +
             7 * ( p_src[14 * i_stride - 1] - p_src[-1] ) +
             8 * ( p_src[15 * i_stride - 1] - p_src[-1 * i_stride - 1] );
    /* Scale both gradients: ( 5 * g + 32 ) >> 6. */
    i_res0 *= 5;
    i_res1 *= 5;
    i_res0 = ( i_res0 + 32 ) >> 6;
    i_res1 = ( i_res1 + 32 ) >> 6;
    /* Value for pixel (0,0) before the final >> 5: 16 * ( bottom-left +
     * top-right + 1 ) minus seven steps of each gradient. */
    i_res3 = 7 * ( i_res0 + i_res1 );
    i_res2 = 16 * ( p_src[15 * i_stride - 1] + p_src[-i_stride + 15] + 1 );
    i_res2 -= i_res3;
    vec8 = __msa_fill_w( i_res0 );
    vec4 = __msa_fill_w( i_res2 );
    vec5 = __msa_fill_w( i_res1 );
    /* vec7: per-column horizontal offsets for columns 0..3;
     * vec6: offset added to step from one group of four columns to the next. */
    vec6 = vec8 * 4;
    vec7 = vec8 * int_multiplier;
    /* One row per iteration: vec0..vec3 hold columns 0..3, 4..7, 8..11 and
     * 12..15.  vec4 carries the row base and advances by the vertical
     * gradient (vec5) after each row. */
    for( u_lpcnt = 16; u_lpcnt--; )
    {
        vec0 = vec7;
        vec0 += vec4;
        vec1 = vec0 + vec6;
        vec2 = vec1 + vec6;
        vec3 = vec2 + vec6;
        /* The macros below are defined in macros.h; judging by their names
         * they shift right arithmetically by 5, pack words to halfwords,
         * clip to 0..255, then pack to bytes and store -- confirm there. */
        SRA_4V( vec0, vec1, vec2, vec3, 5 );
        PCKEV_H2_SH( vec1, vec0, vec3, vec2, vec9, vec10 );
        CLIP_SH2_0_255( vec9, vec10 );
        PCKEV_ST_SB( vec9, vec10, p_src );
        p_src += i_stride;
        vec4 += vec5;
    }
 }
 static void intra_predict_dc_4blk_8x8_msa( uint8_t *p_src, int32_t i_stride )
 {
    uint8_t u_lp_cnt;
    uint32_t u_src0, u_src1, u_src3, u_src2 = 0;
    uint32_t u_out0, u_out1, u_out2, u_out3;
    v16u8 p_src_top;
    v8u16 add;
    v4u32 sum;
    p_src_top = LD_UB( p_src - i_stride );
    add = __msa_hadd_u_h( ( v16u8 ) p_src_top, ( v16u8 ) p_src_top );
    sum = __msa_hadd_u_w( add, add );
    u_src0 = __msa_copy_u_w( ( v4i32 ) sum, 0 );
    u_src1 = __msa_copy_u_w( ( v4i32 ) sum, 1 );
    for( u_lp_cnt = 0; u_lp_cnt < 4; u_lp_cnt++ )
    {
        u_src0 += p_src[u_lp_cnt * i_stride - 1];
        u_src2 += p_src[( 4 + u_lp_cnt ) * i_stride - 1];
    }
    u_src0 = ( u_src0 + 4 ) >> 3;
    u_src3 = ( u_src1 + u_src2 + 4 ) >> 3;
    u_src1 = ( u_src1 + 2 ) >> 2;
    u_src2 = ( u_src2 + 2 ) >> 2;
    u_out0 = u_src0 * 0x01010101;
    u_out1 = u_src1 * 0x01010101;
    u_out2 = u_src2 * 0x01010101;
    u_out3 = u_src3 * 0x01010101;
    for( u_lp_cnt = 4; u_lp_cnt--; )
    {
        SW( u_out0, p_src );
        SW( u_out1, ( p_src + 4 ) );
        SW( u_out2, ( p_src + 4 * i_stride ) );
        SW( u_out3, ( p_src + 4 * i_stride + 4 ) );
        p_src += i_stride;
    }
 }
 /* 8x8 prediction helper ("ddl" presumably stands for diagonal down-left --
 * confirm against the caller).  Reads 16 bytes at p_src, builds a filtered
 * 16-byte vector from them, and writes eight 8-byte rows to p_dst
 * (i_dst_stride apart), where row k is the filtered vector shifted left by
 * k bytes. */
 static void intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t *p_dst,
                                        int32_t i_dst_stride )
 {
    /* Last of the 16 source bytes; patched into vec5 below. */
    uint8_t u_src_val = p_src[15];
    uint64_t u_out0, u_out1, u_out2, u_out3;
    v16u8 src, vec4, vec5, res0;
    v8u16 vec0, vec1, vec2, vec3;
    v2i64 res1, res2, res3;
    src = LD_UB( p_src );
    /* vec4 / vec5: the source shifted by one / two bytes.  Byte 14 of vec5
     * is overwritten with p_src[15] so that the last sample is repeated
     * instead of taking whatever the two-byte shift brought in. */
    vec4 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 1 );
    vec5 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 2 );
    vec5 = ( v16u8 ) __msa_insert_b( ( v16i8 ) vec5, 14, u_src_val );
    /* Interleave/add macros from macros.h; together with the two additions
     * and the rounded shift by 2 below they presumably form
     * ( src + 2 * vec4 + vec5 + 2 ) >> 2 per byte -- confirm in macros.h. */
    ILVR_B2_UH( vec5, src, vec4, vec4, vec0, vec1 );
    ILVL_B2_UH( vec5, src, vec4, vec4, vec2, vec3 );
    HADD_UB4_UH( vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3 );
    vec0 += vec1;
    vec2 += vec3;
    vec0 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec0, 2 );
    vec2 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec2, 2 );
    res0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec2, ( v16i8 ) vec0 );
    /* Rows 0..3: the filtered vector shifted by 0, 1, 2 and 3 bytes. */
    res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 );
    res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 );
    res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 );
    u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 );
    u_out1 = __msa_copy_u_d( res1, 0 );
    u_out2 = __msa_copy_u_d( res2, 0 );
    u_out3 = __msa_copy_u_d( res3, 0 );
    SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    /* Rows 4..7: advance the filtered vector by four bytes and repeat. */
    res0 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 4 );
    res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 );
    res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 );
    res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 );
    u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 );
    u_out1 = __msa_copy_u_d( res1, 0 );
    u_out2 = __msa_copy_u_d( res2, 0 );
    u_out3 = __msa_copy_u_d( res3, 0 );
    SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
 }
 static void intra_predict_128dc_16x16_msa( uint8_t *p_dst,
                                            int32_t i_dst_stride )
 {
    /* Fills the destination with the constant byte value 128, written as
     * two batches of eight vector stores.
     * NOTE(review): ST_UB8 is defined elsewhere; presumably it stores one
     * 16-byte vector per row at p_dst + k * stride - confirm. */
    v16u8 fill = ( v16u8 ) __msa_ldi_b( 128 );
    uint8_t *p_lower = p_dst + ( 8 * i_dst_stride );

    ST_UB8( fill, fill, fill, fill, fill, fill, fill, fill,
            p_dst, i_dst_stride );
    ST_UB8( fill, fill, fill, fill, fill, fill, fill, fill,
            p_lower, i_dst_stride );
 }
 void x264_intra_predict_dc_16x16_msa( uint8_t *p_src )
 {
    /* Neighbours are addressed relative to the block: one FDEC_STRIDE
     * before p_src (row above) and one byte before p_src (column left).
     * The result is written in place at p_src.
     * NOTE(review): the two trailing flags presumably enable the top and
     * left neighbours respectively - confirm against the helper. */
    uint8_t *p_top = p_src - FDEC_STRIDE;
    uint8_t *p_left = p_src - 1;

    intra_predict_dc_16x16_msa( p_top, p_left, FDEC_STRIDE,
                                p_src, FDEC_STRIDE, 1, 1 );
 }
 void x264_intra_predict_dc_left_16x16_msa( uint8_t *p_src )
 {
    /* Same call as the full DC wrapper but with flags (0, 1).
     * NOTE(review): presumably this restricts the DC to the left column;
     * the flag meaning lives in the helper - confirm. */
    uint8_t *p_top = p_src - FDEC_STRIDE;
    uint8_t *p_left = p_src - 1;

    intra_predict_dc_16x16_msa( p_top, p_left, FDEC_STRIDE,
                                p_src, FDEC_STRIDE, 0, 1 );
 }
 void x264_intra_predict_dc_top_16x16_msa( uint8_t *p_src )
 {
    /* Same call as the full DC wrapper but with flags (1, 0).
     * NOTE(review): presumably this restricts the DC to the row above;
     * the flag meaning lives in the helper - confirm. */
    uint8_t *p_top = p_src - FDEC_STRIDE;
    uint8_t *p_left = p_src - 1;

    intra_predict_dc_16x16_msa( p_top, p_left, FDEC_STRIDE,
                                p_src, FDEC_STRIDE, 1, 0 );
 }
 void x264_intra_predict_dc_128_16x16_msa( uint8_t *p_src )
 {
    /* Constant-128 fill of the block at p_src, rows FDEC_STRIDE apart. */
    const int32_t i_stride = FDEC_STRIDE;

    intra_predict_128dc_16x16_msa( p_src, i_stride );
 }
 void x264_intra_predict_hor_16x16_msa( uint8_t *p_src )
 {
    /* Source is the byte column immediately left of the block, walked
     * with FDEC_STRIDE; the output is written in place at p_src. */
    uint8_t *p_left = p_src - 1;

    intra_predict_horiz_16x16_msa( p_left, FDEC_STRIDE, p_src, FDEC_STRIDE );
 }
 void x264_intra_predict_vert_16x16_msa( uint8_t *p_src )
 {
    /* Source is the row one FDEC_STRIDE before the block; the output is
     * written in place at p_src. */
    uint8_t *p_top = p_src - FDEC_STRIDE;

    intra_predict_vert_16x16_msa( p_top, p_src, FDEC_STRIDE );
 }
 void x264_intra_predict_plane_16x16_msa( uint8_t *p_src )
 {
    /* Public entry point: forwards the block pointer together with the
     * fixed FDEC_STRIDE to the static 16x16 plane helper. */
    const int32_t i_stride = FDEC_STRIDE;

    intra_predict_plane_16x16_msa( p_src, i_stride );
 }
 void x264_intra_predict_dc_4blk_8x8_msa( uint8_t *p_src )
 {
    /* Public entry point: forwards the block pointer together with the
     * fixed FDEC_STRIDE to the static "4blk" DC helper. */
    const int32_t i_stride = FDEC_STRIDE;

    intra_predict_dc_4blk_8x8_msa( p_src, i_stride );
 }
 void x264_intra_predict_hor_8x8_msa( uint8_t *p_src )
 {
    /* Source is the byte column immediately left of the block, walked
     * with FDEC_STRIDE; the output is written in place at p_src. */
    uint8_t *p_left = p_src - 1;

    intra_predict_horiz_8x8_msa( p_left, FDEC_STRIDE, p_src, FDEC_STRIDE );
 }
 void x264_intra_predict_vert_8x8_msa( uint8_t *p_src )
 {
    /* Source is the row one FDEC_STRIDE before the block; the output is
     * written in place at p_src. */
    uint8_t *p_top = p_src - FDEC_STRIDE;

    intra_predict_vert_8x8_msa( p_top, p_src, FDEC_STRIDE );
 }
 void x264_intra_predict_plane_8x8_msa( uint8_t *p_src )
 {
    /* Public entry point: forwards the block pointer together with the
     * fixed FDEC_STRIDE to the static 8x8 plane helper. */
    const int32_t i_stride = FDEC_STRIDE;

    intra_predict_plane_8x8_msa( p_src, i_stride );
 }
 void x264_intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
 {
    /* Note the naming: this wrapper's p_src is the DESTINATION block; the
     * helper's source is the neighbour array starting at pu_xyz[16].
     * NOTE(review): presumably pu_xyz[16..31] holds the top / top-right
     * neighbours - confirm against the caller that fills the array. */
    uint8_t *p_edge = pu_xyz + 16;

    intra_predict_ddl_8x8_msa( p_edge, p_src, FDEC_STRIDE );
 }
 void x264_intra_predict_dc_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
 {
    /* Both neighbour runs come from the pu_xyz array (offsets 16 and 7);
     * the prediction is written to p_src with FDEC_STRIDE.
     * NOTE(review): presumably offset 16 is the top row and offset 7 the
     * left column - confirm against the helper's parameter order. */
    uint8_t *p_top = pu_xyz + 16;
    uint8_t *p_left = pu_xyz + 7;

    intra_predict_dc_8x8_msa( p_top, p_left, p_src, FDEC_STRIDE );
 }
 void x264_intra_predict_h_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
 {
    /* The source starts at pu_xyz[14] and is walked with a stride of -1,
     * i.e. towards lower indices; the output goes to p_src.
     * NOTE(review): presumably the left neighbours are stored in reverse
     * order in pu_xyz, hence the negative stride - confirm. */
    uint8_t *p_left = pu_xyz + 14;
    const int32_t i_left_stride = -1;

    intra_predict_horiz_8x8_msa( p_left, i_left_stride, p_src, FDEC_STRIDE );
 }
 void x264_intra_predict_v_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
 {
    /* Source row is taken from the neighbour array at pu_xyz[16]; the
     * output is written to p_src with FDEC_STRIDE. */
    uint8_t *p_top = pu_xyz + 16;

    intra_predict_vert_8x8_msa( p_top, p_src, FDEC_STRIDE );
 }
 void x264_intra_predict_dc_4x4_msa( uint8_t *p_src )
 {
    /* Neighbours are addressed relative to the block: one FDEC_STRIDE
     * before p_src (row above) and one byte before p_src (column left).
     * NOTE(review): the two trailing flags presumably enable the top and
     * left neighbours respectively - confirm against the helper. */
    uint8_t *p_top = p_src - FDEC_STRIDE;
    uint8_t *p_left = p_src - 1;

    intra_predict_dc_4x4_msa( p_top, p_left, FDEC_STRIDE,
                              p_src, FDEC_STRIDE, 1, 1 );
 }
 void x264_intra_predict_hor_4x4_msa( uint8_t *p_src )
 {
    /* Source is the byte column immediately left of the block, walked
     * with FDEC_STRIDE; the output is written in place at p_src. */
    uint8_t *p_left = p_src - 1;

    intra_predict_horiz_4x4_msa( p_left, FDEC_STRIDE, p_src, FDEC_STRIDE );
 }
 void x264_intra_predict_vert_4x4_msa( uint8_t *p_src )
 {
    /* Source is the row one FDEC_STRIDE before the block; the output is
     * written in place at p_src. */
    uint8_t *p_top = p_src - FDEC_STRIDE;

    intra_predict_vert_4x4_msa( p_top, p_src, FDEC_STRIDE );
 }
 #endif
--- a/Show More
+++ b/Show More