diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml index efdc42d349195..b87f9b9ff3c11 100644 --- a/.github/new-prs-labeler.yml +++ b/.github/new-prs-labeler.yml @@ -1,1131 +1,1449 @@ BOLT: - - bolt/**/* + - changed-files: + - any-glob-to-any-file: + - bolt/**/* ClangIR: - - clang/include/clang/CIR/**/* - - clang/lib/CIR/**/* - - clang/tools/cir-*/**/* - - clang/test/CIR/**/* + - changed-files: + - any-glob-to-any-file: + - clang/include/clang/CIR/**/* + - clang/lib/CIR/**/* + - clang/tools/cir-*/**/* + - clang/test/CIR/**/* clang:bytecode: - - clang/docs/ConstantInterpreter.rst - - clang/lib/AST/ByteCode/**/* - - clang/test/AST/ByteCode/**/* - - clang/unittests/AST/ByteCode/**/* + - changed-files: + - any-glob-to-any-file: + - clang/docs/ConstantInterpreter.rst + - clang/lib/AST/ByteCode/**/* + - clang/test/AST/ByteCode/**/* + - clang/unittests/AST/ByteCode/**/* clang:dataflow: - - clang/include/clang/Analysis/FlowSensitive/**/* - - clang/lib/Analysis/FlowSensitive/**/* - - clang/unittests/Analysis/FlowSensitive/**/* - - clang/docs/DataFlowAnalysisIntro.md - - clang/docs/DataFlowAnalysisIntroImages/**/* + - changed-files: + - any-glob-to-any-file: + - clang/include/clang/Analysis/FlowSensitive/**/* + - clang/lib/Analysis/FlowSensitive/**/* + - clang/unittests/Analysis/FlowSensitive/**/* + - clang/docs/DataFlowAnalysisIntro.md + - clang/docs/DataFlowAnalysisIntroImages/**/* clang:frontend: - - clang/lib/AST/**/* - - clang/include/clang/AST/**/* - - clang/lib/Basic/**/* - - clang/include/clang/Basic/**/* - - clang/lib/Interpreter/**/* - - clang/include/clang/Interpreter/**/* - - clang/lib/Lex/**/* - - clang/include/clang/Lex/**/* - - clang/lib/Parse/**/* - - clang/include/clang/Parse/**/* - - clang/lib/Sema/**/* - - clang/include/clang/Sema/**/* + - changed-files: + - any-glob-to-any-file: + - clang/lib/AST/**/* + - clang/include/clang/AST/**/* + - clang/lib/Basic/**/* + - clang/include/clang/Basic/**/* + - clang/lib/Interpreter/**/* + - 
clang/include/clang/Interpreter/**/* + - clang/lib/Lex/**/* + - clang/include/clang/Lex/**/* + - clang/lib/Parse/**/* + - clang/include/clang/Parse/**/* + - clang/lib/Sema/**/* + - clang/include/clang/Sema/**/* clang:headers: - - clang/lib/Headers/**/* + - changed-files: + - any-glob-to-any-file: + - clang/lib/Headers/**/* compiler-rt: - - compiler-rt/**/* + - changed-files: + - any-glob-to-any-file: + - compiler-rt/**/* flang: - - flang/**/* + - changed-files: + - any-glob-to-any-file: + - flang/**/* flang:frontend: - - flang/Parser/**/* - - flang/Evaluate/**/* - - flang/Semantics/**/* + - changed-files: + - any-glob-to-any-file: + - flang/Parser/**/* + - flang/Evaluate/**/* + - flang/Semantics/**/* libclc: - - libclc/** + - changed-files: + - any-glob-to-any-file: + - libclc/** HLSL: - - clang/*HLSL*/**/* - - clang/**/*HLSL* - - llvm/**/Frontend/HLSL/**/* + - changed-files: + - any-glob-to-any-file: + - clang/*HLSL*/**/* + - clang/**/*HLSL* + - llvm/**/Frontend/HLSL/**/* lld: - - lld/**/* + - changed-files: + - any-glob-to-any-file: + - lld/**/* llvm-lit: - - llvm/utils/lit/**/* + - changed-files: + - any-glob-to-any-file: + - llvm/utils/lit/**/* PGO: - - llvm/**/ProfileData/**/* - - llvm/**/SampleProfile* - - llvm/**/CodeGen/MIRSampleProfile* - - llvm/lib/Transforms/Instrumentation/CGProfile.cpp - - llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp - - llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp - - llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp - - llvm/lib/Transforms/Instrumentation/PGO* - - llvm/lib/Transforms/Instrumentation/ValueProfile* - - llvm/test/Instrumentation/InstrProfiling/**/* - - llvm/test/Transforms/PGOProfile/**/* - - llvm/test/Transforms/SampleProfile/**/* - - llvm/**/llvm-profdata/**/* - - llvm/**/llvm-profgen/**/* + - changed-files: + - any-glob-to-any-file: + - llvm/**/ProfileData/**/* + - llvm/**/SampleProfile* + - llvm/**/CodeGen/MIRSampleProfile* + - llvm/lib/Transforms/Instrumentation/CGProfile.cpp 
+ - llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp + - llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp + - llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp + - llvm/lib/Transforms/Instrumentation/PGO* + - llvm/lib/Transforms/Instrumentation/ValueProfile* + - llvm/test/Instrumentation/InstrProfiling/**/* + - llvm/test/Transforms/PGOProfile/**/* + - llvm/test/Transforms/SampleProfile/**/* + - llvm/**/llvm-profdata/**/* + - llvm/**/llvm-profgen/**/* vectorizers: - - llvm/lib/Transforms/Vectorize/**/* - - llvm/include/llvm/Transforms/Vectorize/**/* + - changed-files: + - any-glob-to-any-file: + - llvm/lib/Transforms/Vectorize/**/* + - llvm/include/llvm/Transforms/Vectorize/**/* # IMPORTED FROM CODEOWNERS LTO: - - llvm/*/LTO/** - - llvm/*/Linker/** - - llvm/*/ThinLTO/** - - llvm/lib/Transforms/*/FunctionImport* - - llvm/tools/gold/** + - changed-files: + - any-glob-to-any-file: + - llvm/*/LTO/** + - llvm/*/Linker/** + - llvm/*/ThinLTO/** + - llvm/lib/Transforms/*/FunctionImport* + - llvm/tools/gold/** clang:driver: - - clang/*/Driver/** + - changed-files: + - any-glob-to-any-file: + - clang/*/Driver/** compiler-rt:asan: - - compiler-rt/lib/asan/** - - compiler-rt/include/sanitizer/asan_interface.h - - compiler-rt/test/asan/** - - compiler-rt/lib/asan_abi/** - - compiler-rt/test/asan_abi/** + - changed-files: + - any-glob-to-any-file: + - compiler-rt/lib/asan/** + - compiler-rt/include/sanitizer/asan_interface.h + - compiler-rt/test/asan/** + - compiler-rt/lib/asan_abi/** + - compiler-rt/test/asan_abi/** compiler-rt:builtins: - - compiler-rt/lib/builtins/** - - compiler-rt/test/builtins/** + - changed-files: + - any-glob-to-any-file: + - compiler-rt/lib/builtins/** + - compiler-rt/test/builtins/** compiler-rt:cfi: - - compiler-rt/lib/cfi/** - - compiler-rt/test/cfi/** + - changed-files: + - any-glob-to-any-file: + - compiler-rt/lib/cfi/** + - compiler-rt/test/cfi/** compiler-rt:fuzzer: - - compiler-rt/lib/fuzzer/** - - 
compiler-rt/include/fuzzer/** - - compiler-rt/test/fuzzer/** + - changed-files: + - any-glob-to-any-file: + - compiler-rt/lib/fuzzer/** + - compiler-rt/include/fuzzer/** + - compiler-rt/test/fuzzer/** compiler-rt:hwasan: - - compiler-rt/lib/hwasan/** - - compiler-rt/include/sanitizer/hwasan_interface.h - - compiler-rt/test/hwasan/** + - changed-files: + - any-glob-to-any-file: + - compiler-rt/lib/hwasan/** + - compiler-rt/include/sanitizer/hwasan_interface.h + - compiler-rt/test/hwasan/** compiler-rt:lsan: - - compiler-rt/lib/lsan/** - - compiler-rt/include/sanitizer/lsan_interface.h - - compiler-rt/test/lsan/** + - changed-files: + - any-glob-to-any-file: + - compiler-rt/lib/lsan/** + - compiler-rt/include/sanitizer/lsan_interface.h + - compiler-rt/test/lsan/** compiler-rt:msan: - - compiler-rt/lib/msan/** - - compiler-rt/include/sanitizer/msan_interface.h - - compiler-rt/test/msan/** + - changed-files: + - any-glob-to-any-file: + - compiler-rt/lib/msan/** + - compiler-rt/include/sanitizer/msan_interface.h + - compiler-rt/test/msan/** compiler-rt:sanitizer: - - llvm/lib/Transforms/Instrumentation/*Sanitizer* - - compiler-rt/lib/interception/** - - compiler-rt/lib/*san*/** - - compiler-rt/include/sanitizer/** - - compiler-rt/test/*san*/** - - compiler-rt/lib/fuzzer/** - - compiler-rt/include/fuzzer/** - - compiler-rt/test/fuzzer/** - - compiler-rt/lib/scudo/** - - compiler-rt/test/scudo/** + - changed-files: + - any-glob-to-any-file: + - llvm/lib/Transforms/Instrumentation/*Sanitizer* + - compiler-rt/lib/interception/** + - compiler-rt/lib/*san*/** + - compiler-rt/include/sanitizer/** + - compiler-rt/test/*san*/** + - compiler-rt/lib/fuzzer/** + - compiler-rt/include/fuzzer/** + - compiler-rt/test/fuzzer/** + - compiler-rt/lib/scudo/** + - compiler-rt/test/scudo/** compiler-rt:scudo: - - compiler-rt/lib/scudo/** - - compiler-rt/test/scudo/** + - changed-files: + - any-glob-to-any-file: + - compiler-rt/lib/scudo/** + - compiler-rt/test/scudo/** compiler-rt:tsan: - - 
compiler-rt/lib/tsan/** - - compiler-rt/include/sanitizer/tsan_interface.h - - compiler-rt/include/sanitizer/tsan_interface_atomic.h - - compiler-rt/test/tsan/** + - changed-files: + - any-glob-to-any-file: + - compiler-rt/lib/tsan/** + - compiler-rt/include/sanitizer/tsan_interface.h + - compiler-rt/include/sanitizer/tsan_interface_atomic.h + - compiler-rt/test/tsan/** compiler-rt:ubsan: - - compiler-rt/lib/ubsan/** - - compiler-rt/include/sanitizer/ubsan_interface.h - - compiler-rt/test/ubsan/** - - compiler-rt/lib/ubsan_minimal/** - - compiler-rt/test/ubsan_minimal/** + - changed-files: + - any-glob-to-any-file: + - compiler-rt/lib/ubsan/** + - compiler-rt/include/sanitizer/ubsan_interface.h + - compiler-rt/test/ubsan/** + - compiler-rt/lib/ubsan_minimal/** + - compiler-rt/test/ubsan_minimal/** xray: - - llvm/tools/llvm-xray/** - - compiler-rt/*/xray/** - - clang/include/clang/Basic/XRay* - - clang/lib/Basic/XRay* - - compiler-rt/*/xray/** - - llvm/include/llvm/XRay/** - - llvm/lib/XRay/** - - llvm/tools/llvm-xray/** - - llvm/unittests/XRay/** - - compiler-rt/*/xray/** + - changed-files: + - any-glob-to-any-file: + - llvm/tools/llvm-xray/** + - compiler-rt/*/xray/** + - clang/include/clang/Basic/XRay* + - clang/lib/Basic/XRay* + - compiler-rt/*/xray/** + - llvm/include/llvm/XRay/** + - llvm/lib/XRay/** + - llvm/tools/llvm-xray/** + - llvm/unittests/XRay/** + - compiler-rt/*/xray/** clang:codegen: - - clang/lib/CodeGen/** - - clang/include/clang/CodeGen/** + - changed-files: + - any-glob-to-any-file: + - clang/lib/CodeGen/** + - clang/include/clang/CodeGen/** mlir: - - mlir/** + - changed-files: + - any-glob-to-any-file: + - mlir/** mlir:core: - - mlir/include/mlir/Support/** - - mlir/lib/Support/** - - mlir/include/mlir/Parser/** - - mlir/lib/Parser/** - - mlir/include/mlir/IR/** - - mlir/lib/IR/** - - mlir/include/mlir/Bytecode/** - - mlir/lib/Bytecode/** - - mlir/include/mlir/AsmParser/** - - mlir/lib/AsmParser/** - - mlir/include/mlir/Pass/** - - 
mlir/lib/Pass/** - - mlir/include/mlir/Tools/** - - mlir/lib/Tools/** - - mlir/include/mlir/Reducer/** - - mlir/lib/Reducer/** - - mlir/include/mlir/Transforms/** - - mlir/lib/Transforms/** - - mlir/include/mlir/Debug/** - - mlir/lib/Debug/** - - mlir/tools/** + - changed-files: + - any-glob-to-any-file: + - mlir/include/mlir/Support/** + - mlir/lib/Support/** + - mlir/include/mlir/Parser/** + - mlir/lib/Parser/** + - mlir/include/mlir/IR/** + - mlir/lib/IR/** + - mlir/include/mlir/Bytecode/** + - mlir/lib/Bytecode/** + - mlir/include/mlir/AsmParser/** + - mlir/lib/AsmParser/** + - mlir/include/mlir/Pass/** + - mlir/lib/Pass/** + - mlir/include/mlir/Tools/** + - mlir/lib/Tools/** + - mlir/include/mlir/Reducer/** + - mlir/lib/Reducer/** + - mlir/include/mlir/Transforms/** + - mlir/lib/Transforms/** + - mlir/include/mlir/Debug/** + - mlir/lib/Debug/** + - mlir/tools/** mlir:ods: - - mlir/TableGen/** - - mlir/tblgen/** - - mlir/include/mlir/IR/*.td + - changed-files: + - any-glob-to-any-file: + - mlir/TableGen/** + - mlir/tblgen/** + - mlir/include/mlir/IR/*.td mlir:bindings: - - mlir/Bindings/** + - changed-files: + - any-glob-to-any-file: + - mlir/Bindings/** mlir:gpu: - - mlir/**/*GPU*/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/*GPU*/** mlir:amdgpu: - - mlir/**/AMDGPU/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/AMDGPU/** mlir:amx: - - mlir/**/AMX/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/AMX/** mlir:affine: - - mlir/**/Affine/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/Affine/** mlir:arith: - - mlir/**/Arith/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/Arith/** mlir:neon: - - mlir/**/ArmNeon/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/ArmNeon/** mlir:sme: - - mlir/**/ArmSME/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/ArmSME/** mlir:sve: - - mlir/**/ArmSVE/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/ArmSVE/** mlir:async: - - 
mlir/**/Async/** - - mlir/**/Async/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/Async/** + - mlir/**/Async/** mlir:bufferization: - - mlir/**/Bufferization/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/Bufferization/** mlir:complex: - - mlir/**/Complex/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/Complex/** mlir:cf: - - mlir/**/ControlFlow/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/ControlFlow/** mlir:dlti: - - mlir/**/DLTI/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/DLTI/** mlir:emitc: - - mlir/**/*EmitC*/** - - mlir/lib/Target/Cpp/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/*EmitC*/** + - mlir/lib/Target/Cpp/** mlir:func: - - mlir/**/Func/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/Func/** mlir:irdl: - - mlir/**/IRDL/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/IRDL/** mlir:index: - - mlir/**/Index/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/Index/** mlir:llvm: - - mlir/**/LLVM* - - mlir/**/LLVM*/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/LLVM* + - mlir/**/LLVM*/** mlir:linalg: - - mlir/**/*linalg/** - - mlir/**/*Linalg/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/*linalg/** + - mlir/**/*Linalg/** mlir:mlprogram: - - mlir/**/MLProgram/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/MLProgram/** mlir:math: - - mlir/**/Math/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/Math/** mlir:memref: - - mlir/**/MemRef/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/MemRef/** mlir:nvgpu: - - mlir/**/NVGPU/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/NVGPU/** mlir:openacc: - - mlir/**/*OpenACC* - - mlir/**/*OpenACC*/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/*OpenACC* + - mlir/**/*OpenACC*/** mlir:openmp: - - mlir/**/*OpenMP* - - mlir/**/*OpenMP*/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/*OpenMP* + - mlir/**/*OpenMP*/** 
mlir:pdl: - - mlir/**/PDL/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/PDL/** mlir:quant: - - mlir/**/Quant/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/Quant/** mlir:scf: - - mlir/**/SCF/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/SCF/** mlir:spirv: - - mlir/**/SPIRV/** - - mlir/**/SPIRVTo*/** - - mlir/**/*ToSPIRV/** - - mlir/tools/mlir-spirv-cpu-runner/** - - mlir/tools/mlir-vulkan-runner/** - - mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp + - changed-files: + - any-glob-to-any-file: + - mlir/**/SPIRV/** + - mlir/**/SPIRVTo*/** + - mlir/**/*ToSPIRV/** + - mlir/tools/mlir-spirv-cpu-runner/** + - mlir/tools/mlir-vulkan-runner/** + - mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp mlir:shape: - - mlir/**/Shape/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/Shape/** mlir:sparse: - - mlir/**/SparseTensor/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/SparseTensor/** mlir:tensor: - - mlir/**/Tensor/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/Tensor/** mlir:tosa: - - mlir/**/*Tosa*/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/*Tosa*/** mlir:ub: - - mlir/**/UB/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/UB/** mlir:vector: - - mlir/**/*Vector/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/*Vector/** mlir:execution-engine: - - mlir/**/ExecutionEngine/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/ExecutionEngine/** mlir:presburger: - - mlir/**/*Presburger*/** + - changed-files: + - any-glob-to-any-file: + - mlir/**/*Presburger*/** mlir:python: - - mlir/python/**/* + - changed-files: + - any-glob-to-any-file: + - mlir/python/**/* mlir:vectorops: - - mlir/**/Vector/**/* + - changed-files: + - any-glob-to-any-file: + - mlir/**/Vector/**/* coroutines: - - clang/docs/DebuggingCoroutines.rst - - clang/lib/Sema/SemaCoroutine.cpp - - clang/lib/CodeGen/CGCoroutine.cpp - - clang/test/CodeGenCoroutines/** - - llvm/docs/Coroutines.rst - - 
llvm/include/llvm/Transforms/Coroutines/** - - llvm/lib/Transforms/Coroutines/** - - llvm/test/Transforms/Coroutines/* + - changed-files: + - any-glob-to-any-file: + - clang/docs/DebuggingCoroutines.rst + - clang/lib/Sema/SemaCoroutine.cpp + - clang/lib/CodeGen/CGCoroutine.cpp + - clang/test/CodeGenCoroutines/** + - llvm/docs/Coroutines.rst + - llvm/include/llvm/Transforms/Coroutines/** + - llvm/lib/Transforms/Coroutines/** + - llvm/test/Transforms/Coroutines/* clang:modules: - - clang/docs/StandardCPlusPlusModules.rst - - clang/include/clang/AST/AbstractBasicReader.h - - clang/include/clang/AST/AbstractBasicWriter.h - - clang/include/clang/AST/AbstractTypeReader.h - - clang/include/clang/AST/AbstractTypeWriter.h - - clang/include/clang/AST/PropertiesBase.td - - clang/include/clang/AST/ODRHash.h - - clang/include/clang/AST/TypeProperties.td - - clang/include/clang/Basic/Module.h - - clang/include/clang/Frontend/PrecompiledPreamble.h - - clang/include/clang/Lex/ModuleLoader.h - - clang/include/clang/Lex/ModuleMap.h - - clang/include/clang/Serialization/** - - clang/lib/AST/ODRHash.cpp - - clang/lib/AST/StmtProfile.cpp - - clang/lib/Basic/Module.cpp - - clang/lib/Frontend/ModuleDependencyCollector.cpp - - clang/lib/Frontend/PrecompiledPreamble.cpp - - clang/lib/Lex/ModuleMap.cpp - - clang/lib/Sema/SemaModule.cpp - - clang/lib/Serialization/** - - clang/test/CXX/module/** - - clang/test/Modules/** - - clang/unittests/Serialization/* + - changed-files: + - any-glob-to-any-file: + - clang/docs/StandardCPlusPlusModules.rst + - clang/include/clang/AST/AbstractBasicReader.h + - clang/include/clang/AST/AbstractBasicWriter.h + - clang/include/clang/AST/AbstractTypeReader.h + - clang/include/clang/AST/AbstractTypeWriter.h + - clang/include/clang/AST/PropertiesBase.td + - clang/include/clang/AST/ODRHash.h + - clang/include/clang/AST/TypeProperties.td + - clang/include/clang/Basic/Module.h + - clang/include/clang/Frontend/PrecompiledPreamble.h + - 
clang/include/clang/Lex/ModuleLoader.h + - clang/include/clang/Lex/ModuleMap.h + - clang/include/clang/Serialization/** + - clang/lib/AST/ODRHash.cpp + - clang/lib/AST/StmtProfile.cpp + - clang/lib/Basic/Module.cpp + - clang/lib/Frontend/ModuleDependencyCollector.cpp + - clang/lib/Frontend/PrecompiledPreamble.cpp + - clang/lib/Lex/ModuleMap.cpp + - clang/lib/Sema/SemaModule.cpp + - clang/lib/Serialization/** + - clang/test/CXX/module/** + - clang/test/Modules/** + - clang/unittests/Serialization/* clang-tidy: - - clang-tools-extra/clang-tidy/** - - clang-tools-extra/docs/clang-tidy/** - - clang-tools-extra/test/clang-tidy/** + - changed-files: + - any-glob-to-any-file: + - clang-tools-extra/clang-tidy/** + - clang-tools-extra/docs/clang-tidy/** + - clang-tools-extra/test/clang-tidy/** clang-tools-extra: - - clang-tools-extra/** + - changed-files: + - any-glob-to-any-file: + - clang-tools-extra/** tools:llvm-mca: - - llvm/tools/llvm-mca/** - - llvm/include/llvm/MCA/** - - llvm/lib/MCA/** + - changed-files: + - any-glob-to-any-file: + - llvm/tools/llvm-mca/** + - llvm/include/llvm/MCA/** + - llvm/lib/MCA/** clang: - - any: - - clang/** - - '!clang/**/Format/**' - - '!clang/tools/clang-format/**' + - changed-files: + - all-globs-to-all-files: + - clang/** + - '!clang/**/Format/**' + - '!clang/tools/clang-format/**' testing-tools: - - llvm/include/llvm/FileCheck/** - - llvm/lib/FileCheck/** - - llvm/test/FileCheck/** - - llvm/unittests/FileCheck/** - - llvm/utils/lit/** - - llvm/utils/split-file/** - - llvm/utils/not/** - - llvm/utils/count/** - - llvm/utils/FileCheck/** - - llvm/docs/CommandGuide/FileCheck.rst - - llvm/docs/CommandGuide/lit.rst - - llvm/docs/TestingGuide.rst - - llvm/test/Other/FileCheck-space.txt - - llvm/utils/UpdateTestChecks/** - - llvm/utils/update*_test_checks.py + - changed-files: + - any-glob-to-any-file: + - llvm/include/llvm/FileCheck/** + - llvm/lib/FileCheck/** + - llvm/test/FileCheck/** + - llvm/unittests/FileCheck/** + - 
llvm/utils/lit/** + - llvm/utils/split-file/** + - llvm/utils/not/** + - llvm/utils/count/** + - llvm/utils/FileCheck/** + - llvm/docs/CommandGuide/FileCheck.rst + - llvm/docs/CommandGuide/lit.rst + - llvm/docs/TestingGuide.rst + - llvm/test/Other/FileCheck-space.txt + - llvm/utils/UpdateTestChecks/** + - llvm/utils/update*_test_checks.py debuginfo: - - clang/lib/CodeGen/CGDebugInfo.* - - llvm/include/llvm/BinaryFormat/Dwarf.* - - llvm/include/llvm/CodeGen/*Debug*.* - - llvm/include/llvm/DebugInfo/** - - llvm/include/llvm/Debuginfod/** - - llvm/include/llvm/Frontend/Debug/** - - llvm/include/llvm/IR/Debug*.* - - llvm/include/llvm/Object/*Debug*.* - - llvm/include/llvm/ObjectYAML/*Debug*.* - - llvm/include/llvm/Transforms/Utils/*Debug*.* - - llvm/include/llvm-c/DebugInfo.h - - llvm/lib/BinaryFormat/Dwarf.cpp - - llvm/lib/CodeGen/AsmPrinter/*Debug*.* - - llvm/lib/CodeGen/AsmPrinter/Dwarf*.* - - llvm/lib/CodeGen/AsmPrinter/DIE*.* - - llvm/lib/CodeGen/LiveDebugValues/** - - llvm/lib/CodeGen/*Debug*.* - - llvm/lib/CodeGen/DwarfEHPrepare.cpp - - llvm/lib/DebugInfo/** - - llvm/lib/Debuginfod/** - - llvm/lib/DWARFLinkerParallel/** - - llvm/lib/IR/Debug*.cpp - - llvm/lib/MC/MCDwarf.cpp - - llvm/lib/Transforms/Utils/*Debug*.* - - llvm/test/DebugInfo/** - - llvm/test/tools/dsymutil/** - - llvm/test/tools/llvm-debuginfo-analyzer/** - - llvm/test/tools/llvm-debuginfod/** - - llvm/test/tools/llvm-debuginfod-find/** - - llvm/test/tools/llvm-dwarfdump/** - - llvm/test/tools/llvm-dwarfutil/** - - llvm/test/tools/llvm-dwp/** - - llvm/test/tools/llvm-gsymutil/** - - llvm/test/tools/llvm-pdbuti/** - - llvm/tools/dsymutil/** - - llvm/tools/llvm-debuginfo-analyzer/** - - llvm/tools/llvm-debuginfod/** - - llvm/tools/llvm-debuginfod-find/** - - llvm/tools/llvm-dwarfdump/** - - llvm/tools/llvm-dwarfutil/** - - llvm/tools/llvm-dwp/** - - llvm/tools/llvm-gsymutil/** - - llvm/tools/llvm-pdbutil/** + - changed-files: + - any-glob-to-any-file: + - clang/lib/CodeGen/CGDebugInfo.* + - 
llvm/include/llvm/BinaryFormat/Dwarf.* + - llvm/include/llvm/CodeGen/*Debug*.* + - llvm/include/llvm/DebugInfo/** + - llvm/include/llvm/Debuginfod/** + - llvm/include/llvm/Frontend/Debug/** + - llvm/include/llvm/IR/Debug*.* + - llvm/include/llvm/Object/*Debug*.* + - llvm/include/llvm/ObjectYAML/*Debug*.* + - llvm/include/llvm/Transforms/Utils/*Debug*.* + - llvm/include/llvm-c/DebugInfo.h + - llvm/lib/BinaryFormat/Dwarf.cpp + - llvm/lib/CodeGen/AsmPrinter/*Debug*.* + - llvm/lib/CodeGen/AsmPrinter/Dwarf*.* + - llvm/lib/CodeGen/AsmPrinter/DIE*.* + - llvm/lib/CodeGen/LiveDebugValues/** + - llvm/lib/CodeGen/*Debug*.* + - llvm/lib/CodeGen/DwarfEHPrepare.cpp + - llvm/lib/DebugInfo/** + - llvm/lib/Debuginfod/** + - llvm/lib/DWARFLinkerParallel/** + - llvm/lib/IR/Debug*.cpp + - llvm/lib/MC/MCDwarf.cpp + - llvm/lib/Transforms/Utils/*Debug*.* + - llvm/test/DebugInfo/** + - llvm/test/tools/dsymutil/** + - llvm/test/tools/llvm-debuginfo-analyzer/** + - llvm/test/tools/llvm-debuginfod/** + - llvm/test/tools/llvm-debuginfod-find/** + - llvm/test/tools/llvm-dwarfdump/** + - llvm/test/tools/llvm-dwarfutil/** + - llvm/test/tools/llvm-dwp/** + - llvm/test/tools/llvm-gsymutil/** + - llvm/test/tools/llvm-pdbutil/** + - llvm/tools/dsymutil/** + - llvm/tools/llvm-debuginfo-analyzer/** + - llvm/tools/llvm-debuginfod/** + - llvm/tools/llvm-debuginfod-find/** + - llvm/tools/llvm-dwarfdump/** + - llvm/tools/llvm-dwarfutil/** + - llvm/tools/llvm-dwp/** + - llvm/tools/llvm-gsymutil/** + - llvm/tools/llvm-pdbutil/** github:workflow: - - .github/workflows/** + - changed-files: + - any-glob-to-any-file: + - .github/workflows/** cmake: - - cmake/** - - llvm/cmake/** - - runtimes/** + - changed-files: + - any-glob-to-any-file: + - cmake/** + - llvm/cmake/** + - runtimes/** flang:driver: - - flang/tools/flang-driver/** - - flang/unittests/Frontend/** - - flang/lib/FrontendTool/** - - flang/lib/Frontend/** - - flang/include/flang/Frontend/** - - flang/include/flang/FrontendTool/** - - 
flang/test/Driver/** + - changed-files: + - any-glob-to-any-file: + - flang/tools/flang-driver/** + - flang/unittests/Frontend/** + - flang/lib/FrontendTool/** + - flang/lib/Frontend/** + - flang/include/flang/Frontend/** + - flang/include/flang/FrontendTool/** + - flang/test/Driver/** backend:m68k: - - llvm/lib/Target/M68k/** - - clang/lib/Basic/Targets/M68k.* - - clang/lib/CodeGen/Targets/M68k.cpp - - llvm/test/CodeGen/M68k/** - - llvm/test/MC/Disassembler/M68k/** - - llvm/test/MC/M68k/** + - changed-files: + - any-glob-to-any-file: + - llvm/lib/Target/M68k/** + - clang/lib/Basic/Targets/M68k.* + - clang/lib/CodeGen/Targets/M68k.cpp + - llvm/test/CodeGen/M68k/** + - llvm/test/MC/Disassembler/M68k/** + - llvm/test/MC/M68k/** libc++: - - libcxx/** - - .github/workflows/libcxx-* + - changed-files: + - any-glob-to-any-file: + - libcxx/** + - .github/workflows/libcxx-* libc++abi: - - libcxxabi/** + - changed-files: + - any-glob-to-any-file: + - libcxxabi/** libunwind: - - libunwind/** + - changed-files: + - any-glob-to-any-file: + - libunwind/** objectyaml: - - llvm/include/llvm/ObjectYAML/** - - llvm/lib/ObjectYAML/** - - llvm/test/tools/obj2yaml/** - - llvm/test/tools/yaml2obj/** - - llvm/tools/obj2yaml/** - - llvm/tools/yaml2obj/** + - changed-files: + - any-glob-to-any-file: + - llvm/include/llvm/ObjectYAML/** + - llvm/lib/ObjectYAML/** + - llvm/test/tools/obj2yaml/** + - llvm/test/tools/yaml2obj/** + - llvm/tools/obj2yaml/** + - llvm/tools/yaml2obj/** clang:analysis: - - clang/include/clang/Analysis/** - - clang/lib/Analysis/** + - changed-files: + - any-glob-to-any-file: + - clang/include/clang/Analysis/** + - clang/lib/Analysis/** clang:static analyzer: - - clang/include/clang/StaticAnalyzer/** - - clang/lib/StaticAnalyzer/** - - clang/tools/scan-build/** - - clang/utils/analyzer/** - - clang/docs/analyzer/** - - clang/test/Analysis/** + - changed-files: + - any-glob-to-any-file: + - clang/include/clang/StaticAnalyzer/** + - clang/lib/StaticAnalyzer/** + - 
clang/tools/scan-build/** + - clang/utils/analyzer/** + - clang/docs/analyzer/** + - clang/test/Analysis/** pgo: - - llvm/lib/Transforms/Instrumentation/CGProfile.cpp - - llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp - - llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp - - llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp - - llvm/lib/Transforms/Instrumentation/PGO* - - llvm/lib/Transforms/Instrumentation/ValueProfile* - - llvm/test/Instrumentation/InstrProfiling/** - - llvm/test/Transforms/PGOProfile/** - - compiler-rt/lib/profile/** - - compiler-rt/lib/memprof/** - - compiler-rt/test/profile/** - - compiler-rt/test/memprof/** - - llvm/tools/llvm-profdata/** - - llvm/tools/llvm-profgen/** - - llvm/test/tools/llvm-profdata/** - - llvm/test/tools/llvm-profgen/** - - llvm/unittests/ProfileData/* + - changed-files: + - any-glob-to-any-file: + - llvm/lib/Transforms/Instrumentation/CGProfile.cpp + - llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp + - llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp + - llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp + - llvm/lib/Transforms/Instrumentation/PGO* + - llvm/lib/Transforms/Instrumentation/ValueProfile* + - llvm/test/Instrumentation/InstrProfiling/** + - llvm/test/Transforms/PGOProfile/** + - compiler-rt/lib/profile/** + - compiler-rt/lib/memprof/** + - compiler-rt/test/profile/** + - compiler-rt/test/memprof/** + - llvm/tools/llvm-profdata/** + - llvm/tools/llvm-profgen/** + - llvm/test/tools/llvm-profdata/** + - llvm/test/tools/llvm-profgen/** + - llvm/unittests/ProfileData/* openacc: - - flang/**/OpenACC/** - - flang/include/flang/Lower/OpenACC.h - - flang/docs/OpenACC.md - - flang/lib/Parser/openacc-parsers.cpp - - flang/lib/Lower/OpenACC.cpp - - llvm/**/Frontend/OpenACC/** - - llvm/unittests/Frontend/OpenACCTest.cpp - - mlir/test/Target/LLVMIR/openacc-llvm.mlir - - mlir/**/*OpenACC/** + - changed-files: + - any-glob-to-any-file: + - flang/**/OpenACC/** 
+ - flang/include/flang/Lower/OpenACC.h + - flang/docs/OpenACC.md + - flang/lib/Parser/openacc-parsers.cpp + - flang/lib/Lower/OpenACC.cpp + - llvm/**/Frontend/OpenACC/** + - llvm/unittests/Frontend/OpenACCTest.cpp + - mlir/test/Target/LLVMIR/openacc-llvm.mlir + - mlir/**/*OpenACC/** flang:runtime: - - flang/runtime/** + - changed-files: + - any-glob-to-any-file: + - flang/runtime/** flang:parser: - - flang/**/Parser/** + - changed-files: + - any-glob-to-any-file: + - flang/**/Parser/** flang:semantics: - - flang/**/Evaluate/** - - flang/**/Semantics/** + - changed-files: + - any-glob-to-any-file: + - flang/**/Evaluate/** + - flang/**/Semantics/** flang:fir-hlfir: - - flang/**/Lower/** - - flang/**/Optimizer/** + - changed-files: + - any-glob-to-any-file: + - flang/**/Lower/** + - flang/**/Optimizer/** flang:codegen: - - flang/**/CodeGen/** + - changed-files: + - any-glob-to-any-file: + - flang/**/CodeGen/** llvm:codegen: - - llvm/lib/CodeGen/* - - llvm/lib/CodeGen/MIRParser/* - - llvm/lib/CodeGen/LiveDebugValues/* - - llvm/lib/CodeGen/AsmPrinter/* + - changed-files: + - any-glob-to-any-file: + - llvm/lib/CodeGen/* + - llvm/lib/CodeGen/MIRParser/* + - llvm/lib/CodeGen/LiveDebugValues/* + - llvm/lib/CodeGen/AsmPrinter/* llvm:globalisel: - - llvm/**/GlobalISel/** - - llvm/utils/TableGen/GlobalISel* + - changed-files: + - any-glob-to-any-file: + - llvm/**/GlobalISel/** + - llvm/utils/TableGen/GlobalISel* function-specialization: - - llvm/include/llvm/Transforms/Utils/SCCPSolver.h - - llvm/lib/Transforms/Utils/SCCPSolver.cpp - - llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h - - llvm/lib/Transforms/IPO/FunctionSpecialization.cpp - - llvm/test/Transforms/FunctionSpecialization/* + - changed-files: + - any-glob-to-any-file: + - llvm/include/llvm/Transforms/Utils/SCCPSolver.h + - llvm/lib/Transforms/Utils/SCCPSolver.cpp + - llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h + - llvm/lib/Transforms/IPO/FunctionSpecialization.cpp + - 
llvm/test/Transforms/FunctionSpecialization/* libc: - - libc/** - - utils/bazel/llvm-project-overlay/libc/** + - changed-files: + - any-glob-to-any-file: + - libc/** + - utils/bazel/llvm-project-overlay/libc/** clang-format: - - clang/**/Format/** - - clang/tools/clang-format/** + - changed-files: + - any-glob-to-any-file: + - clang/**/Format/** + - clang/tools/clang-format/** flang:openmp: - - flang/test/**/OpenMP/** - - flang/lib/Lower/OpenMP.cpp - - flang/lib/Semantics/resolve-directives.cpp - - flang/lib/Semantics/check-omp-structure.cpp - - flang/lib/Optimizer/Transforms/OMP* - - flang/test/Fir/convert-to-llvm-openmp-and-fir.fir - - flang/test/Lower/OpenMP/** - - flang/test/Transforms/omp* - - mlir/**/*OpenMP* - - mlir/test/Target/LLVMIR/openmp* - - llvm/lib/Frontend/OpenMP/** - - llvm/include/llvm/Frontend/OpenMP/** - - llvm/unittests/Frontend/OpenMP* + - changed-files: + - any-glob-to-any-file: + - flang/test/**/OpenMP/** + - flang/lib/Lower/OpenMP.cpp + - flang/lib/Semantics/resolve-directives.cpp + - flang/lib/Semantics/check-omp-structure.cpp + - flang/lib/Optimizer/Transforms/OMP* + - flang/test/Fir/convert-to-llvm-openmp-and-fir.fir + - flang/test/Lower/OpenMP/** + - flang/test/Transforms/omp* + - mlir/**/*OpenMP* + - mlir/test/Target/LLVMIR/openmp* + - llvm/lib/Frontend/OpenMP/** + - llvm/include/llvm/Frontend/OpenMP/** + - llvm/unittests/Frontend/OpenMP* llvm:ir: - - llvm/lib/IR/** - - llvm/include/llvm/IR/** - - llvm/docs/LangRef.rst - - llvm/unittests/IR/** + - changed-files: + - any-glob-to-any-file: + - llvm/lib/IR/** + - llvm/include/llvm/IR/** + - llvm/docs/LangRef.rst + - llvm/unittests/IR/** llvm:SandboxIR: - - llvm/lib/SandboxIR/** - - llvm/include/llvm/SandboxIR/** - - llvm/docs/SandboxIR.md - - llvm/unittests/SandboxIR/** + - changed-files: + - any-glob-to-any-file: + - llvm/lib/SandboxIR/** + - llvm/include/llvm/SandboxIR/** + - llvm/docs/SandboxIR.md + - llvm/unittests/SandboxIR/** llvm:analysis: - - llvm/lib/Analysis/** - - 
llvm/include/llvm/Analysis/** - - llvm/test/Analysis/** - - llvm/unittests/Analysis/** + - changed-files: + - any-glob-to-any-file: + - llvm/lib/Analysis/** + - llvm/include/llvm/Analysis/** + - llvm/test/Analysis/** + - llvm/unittests/Analysis/** llvm:adt: - - llvm/**/ADT/* + - changed-files: + - any-glob-to-any-file: + - llvm/**/ADT/* llvm:support: - - llvm/**/Support/** + - changed-files: + - any-glob-to-any-file: + - llvm/**/Support/** # Skip llvm/test/MC and llvm/unittests/MC, which includes target-specific directories. llvm:mc: - - llvm/include/llvm/MC/** - - llvm/lib/MC/** - - llvm/tools/llvm-mc/** + - changed-files: + - any-glob-to-any-file: + - llvm/include/llvm/MC/** + - llvm/lib/MC/** + - llvm/tools/llvm-mc/** llvm:transforms: - - llvm/lib/Transforms/** - - llvm/include/llvm/Transforms/** - - llvm/test/Transforms/** - - llvm/unittests/Transforms/** + - changed-files: + - any-glob-to-any-file: + - llvm/lib/Transforms/** + - llvm/include/llvm/Transforms/** + - llvm/test/Transforms/** + - llvm/unittests/Transforms/** llvm:instcombine: - - llvm/lib/Analysis/InstructionSimplify.cpp - - llvm/lib/Transforms/InstCombine/** - - llvm/include/llvm/Transforms/InstCombine/ - - llvm/include/llvm/Analysis/InstructionSimplify.h - - llvm/test/Transforms/InstCombine/** - - llvm/test/Transforms/InstSimplify/** + - changed-files: + - any-glob-to-any-file: + - llvm/lib/Analysis/InstructionSimplify.cpp + - llvm/lib/Transforms/InstCombine/** + - llvm/include/llvm/Transforms/InstCombine/ + - llvm/include/llvm/Analysis/InstructionSimplify.h + - llvm/test/Transforms/InstCombine/** + - llvm/test/Transforms/InstSimplify/** llvm:vectorcombine: - - llvm/lib/Transforms/Vectorize/VectorCombine.cpp - - llvm/test/Transforms/VectorCombine/** + - changed-files: + - any-glob-to-any-file: + - llvm/lib/Transforms/Vectorize/VectorCombine.cpp + - llvm/test/Transforms/VectorCombine/** clangd: - - clang-tools-extra/clangd/** + - changed-files: + - any-glob-to-any-file: + - 
clang-tools-extra/clangd/** hlsl: - - clang/test/ParserHLSL/** - - clang/test/SemaHLSL/** - - clang/test/AST/HLSL/** - - clang/test/CodeGenHLSL/** - - clang/cmake/caches/HLSL.cmake - - clang/include/clang/Basic/HLSL*.h - - clang/include/clang/Sema/HLSL*.h - - clang/docs/HLSL/** - - clang/lib/Driver/ToolChains/HLSL* - - clang/lib/Parse/ParseHLSL.cpp - - clang/lib/Sema/HLSLExternalSemaSource.cpp - - clang/lib/Sema/SemaHLSL.cpp - - clang/lib/CodeGen/CGHLSLRuntime.* - - clang/lib/CodeGen/CGHLSLBuiltins.cpp - - llvm/include/llvm/Frontend/HLSL/** - - llvm/lib/Frontend/HLSL/** + - changed-files: + - any-glob-to-any-file: + - clang/test/ParserHLSL/** + - clang/test/SemaHLSL/** + - clang/test/AST/HLSL/** + - clang/test/CodeGenHLSL/** + - clang/cmake/caches/HLSL.cmake + - clang/include/clang/Basic/HLSL*.h + - clang/include/clang/Sema/HLSL*.h + - clang/docs/HLSL/** + - clang/lib/Driver/ToolChains/HLSL* + - clang/lib/Parse/ParseHLSL.cpp + - clang/lib/Sema/HLSLExternalSemaSource.cpp + - clang/lib/Sema/SemaHLSL.cpp + - clang/lib/CodeGen/CGHLSLRuntime.* + - clang/lib/CodeGen/CGHLSLBuiltins.cpp + - llvm/include/llvm/Frontend/HLSL/** + - llvm/lib/Frontend/HLSL/** llvm:SelectionDAG: - - llvm/include/llvm/CodeGen/SelectionDAG*.h - - llvm/include/llvm/CodeGen/SDNodeProperties.td - - llvm/include/llvm/Target/TargetSelectionDAG.td - - llvm/lib/CodeGen/SelectionDAG/** - - llvm/utils/TableGen/CodeGenDAG* - - llvm/utils/TableGen/DAGISel* - - llvm/include/llvm/CodeGen/DAGCombine.h - - llvm/include/llvm/CodeGen/ISDOpcodes.h + - changed-files: + - any-glob-to-any-file: + - llvm/include/llvm/CodeGen/SelectionDAG*.h + - llvm/include/llvm/CodeGen/SDNodeProperties.td + - llvm/include/llvm/Target/TargetSelectionDAG.td + - llvm/lib/CodeGen/SelectionDAG/** + - llvm/utils/TableGen/CodeGenDAG* + - llvm/utils/TableGen/DAGISel* + - llvm/include/llvm/CodeGen/DAGCombine.h + - llvm/include/llvm/CodeGen/ISDOpcodes.h backend:DirectX: - - '**/*DirectX*' - - '**/*DXIL*' - - '**/*dxil*' - - '**/*DirectX*/**' - 
- '**/*DXIL*/**' - - '**/*dxil*/**' - - '**/*DXContainer*' - - '**/*DXContainer*/**' - - clang/lib/Sema/SemaDirectX.cpp - - clang/include/clang/Sema/SemaDirectX.h - - clang/include/clang/Basic/BuiltinsDirectX.td - - clang/lib/CodeGen/TargetBuiltins/DirectX.cpp - - clang/test/CodeGenDirectX/** - - clang/test/SemaDirectX/** + - changed-files: + - any-glob-to-any-file: + - '**/*DirectX*' + - '**/*DXIL*' + - '**/*dxil*' + - '**/*DirectX*/**' + - '**/*DXIL*/**' + - '**/*dxil*/**' + - '**/*DXContainer*' + - '**/*DXContainer*/**' + - clang/lib/Sema/SemaDirectX.cpp + - clang/include/clang/Sema/SemaDirectX.h + - clang/include/clang/Basic/BuiltinsDirectX.td + - clang/lib/CodeGen/TargetBuiltins/DirectX.cpp + - clang/test/CodeGenDirectX/** + - clang/test/SemaDirectX/** backend:SPIR-V: - - clang/lib/Driver/ToolChains/SPIRV.* - - clang/lib/Sema/SemaSPIRV.cpp - - clang/include/clang/Sema/SemaSPIRV.h - - clang/include/clang/Basic/BuiltinsSPIRV.td - - clang/test/CodeGenSPIRV/** - - clang/test/SemaSPIRV/** - - llvm/lib/Target/SPIRV/** - - llvm/test/CodeGen/SPIRV/** - - llvm/test/Frontend/HLSL/** - - llvm/docs/SPIRVUsage.rst + - changed-files: + - any-glob-to-any-file: + - clang/lib/Driver/ToolChains/SPIRV.* + - clang/lib/Sema/SemaSPIRV.cpp + - clang/include/clang/Sema/SemaSPIRV.h + - clang/include/clang/Basic/BuiltinsSPIRV.td + - clang/test/CodeGenSPIRV/** + - clang/test/SemaSPIRV/** + - llvm/lib/Target/SPIRV/** + - llvm/test/CodeGen/SPIRV/** + - llvm/test/Frontend/HLSL/** + - llvm/docs/SPIRVUsage.rst mlgo: - - llvm/lib/Analysis/ML* - - llvm/include/llvm/Analysis/ML* - - llvm/lib/Analysis/*Runner.cpp - - llvm/include/llvm/Analysis/*Runner.h - - llvm/unittests/Analysis/ML* - - llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp - - llvm/lib/Analysis/TrainingLogger.cpp - - llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h - - llvm/include/llvm/Analysis/Utils/TrainingLogger.h - - llvm/test/Analysis/FunctionPropertiesAnalysis/* - - 
llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp - - llvm/test/Transforms/inline/ML/** - - llvm/lib/CodeGen/ML* - - llvm/unittests/CodeGen/ML* - - llvm/test/CodeGen/MLRegAlloc/** - - llvm/utils/mlgo-utils/** - - llvm/docs/MLGO.rst - - llvm/include/llvm/Analysis/IR2Vec.h - - llvm/lib/Analysis/IR2Vec.cpp - - llvm/lib/Analysis/models/** - - llvm/include/llvm/CodeGen/MIR2Vec.h - - llvm/lib/CodeGen/MIR2Vec.cpp - - llvm/test/Analysis/IR2Vec/** - - llvm/test/CodeGen/MIR2Vec/** - - llvm/unittests/Analysis/IR2VecTest.cpp - - llvm/unittests/CodeGen/MIR2VecTest.cpp - - llvm/tools/llvm-ir2vec/** - - llvm/docs/CommandGuide/llvm-ir2vec.rst + - changed-files: + - any-glob-to-any-file: + - llvm/lib/Analysis/ML* + - llvm/include/llvm/Analysis/ML* + - llvm/lib/Analysis/*Runner.cpp + - llvm/include/llvm/Analysis/*Runner.h + - llvm/unittests/Analysis/ML* + - llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp + - llvm/lib/Analysis/TrainingLogger.cpp + - llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h + - llvm/include/llvm/Analysis/Utils/TrainingLogger.h + - llvm/test/Analysis/FunctionPropertiesAnalysis/* + - llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp + - llvm/test/Transforms/inline/ML/** + - llvm/lib/CodeGen/ML* + - llvm/unittests/CodeGen/ML* + - llvm/test/CodeGen/MLRegAlloc/** + - llvm/utils/mlgo-utils/** + - llvm/docs/MLGO.rst + - llvm/include/llvm/Analysis/IR2Vec.h + - llvm/lib/Analysis/IR2Vec.cpp + - llvm/lib/Analysis/models/** + - llvm/include/llvm/CodeGen/MIR2Vec.h + - llvm/lib/CodeGen/MIR2Vec.cpp + - llvm/test/Analysis/IR2Vec/** + - llvm/test/CodeGen/MIR2Vec/** + - llvm/unittests/Analysis/IR2VecTest.cpp + - llvm/unittests/CodeGen/MIR2VecTest.cpp + - llvm/tools/llvm-ir2vec/** + - llvm/docs/CommandGuide/llvm-ir2vec.rst tools:llvm-exegesis: - - llvm/tools/llvm-exegesis/** - - llvm/test/tools/llvm-exegesis/** - - llvm/unittests/tools/llvm-exegesis/** + - changed-files: + - any-glob-to-any-file: + - llvm/tools/llvm-exegesis/** + - 
llvm/test/tools/llvm-exegesis/** + - llvm/unittests/tools/llvm-exegesis/** tools:llvm-reduce: - - llvm/tools/llvm-reduce/** + - changed-files: + - any-glob-to-any-file: + - llvm/tools/llvm-reduce/** platform:windows: - - lld/COFF/** - - clang/lib/Driver/MSVC.cpp - - clang/lib/Driver/MinGW.cpp - - llvm/lib/DebugInfo/CodeView/** - - llvm/lib/DebugInfo/PDB/** - - llvm/lib/WindowsDriver/** - - llvm/lib/Support/Windows/** - - llvm/lib/BinaryFormat/COFF.cpp + - changed-files: + - any-glob-to-any-file: + - lld/COFF/** + - clang/lib/Driver/MSVC.cpp + - clang/lib/Driver/MinGW.cpp + - llvm/lib/DebugInfo/CodeView/** + - llvm/lib/DebugInfo/PDB/** + - llvm/lib/WindowsDriver/** + - llvm/lib/Support/Windows/** + - llvm/lib/BinaryFormat/COFF.cpp llvm:regalloc: - - llvm/**/CodeGen/CalcSpillWeights* - - llvm/**/CodeGen/InlineSpiller* - - llvm/**/CodeGen/InterferenceCache* - - llvm/**/CodeGen/LiveInterval* - - llvm/**/CodeGen/LiveRange* - - llvm/**/CodeGen/LiveReg* - - llvm/**/CodeGen/LiveVariables* - - llvm/**/CodeGen/MachineCopyPropagation* - - llvm/**/CodeGen/PHIElimination* - - llvm/**/CodeGen/ProcessImplicitDefs.cpp - - llvm/**/CodeGen/Register* - - llvm/**/CodeGen/RegUsage* - - llvm/**/CodeGen/RenameIndependentSubregs.cpp - - llvm/**/CodeGen/SlotIndexes.h - - llvm/**/CodeGen/SpillPlacement* - - llvm/**/CodeGen/SplitKit* - - llvm/**/CodeGen/VirtRegMap.h - - llvm/include/PBQP/** - - llvm/include/PBQPRAConstraint.h - - llvm/include/llvm/CodeGen/Spiller.h - - llvm/**/*RegAlloc + - changed-files: + - any-glob-to-any-file: + - llvm/**/CodeGen/CalcSpillWeights* + - llvm/**/CodeGen/InlineSpiller* + - llvm/**/CodeGen/InterferenceCache* + - llvm/**/CodeGen/LiveInterval* + - llvm/**/CodeGen/LiveRange* + - llvm/**/CodeGen/LiveReg* + - llvm/**/CodeGen/LiveVariables* + - llvm/**/CodeGen/MachineCopyPropagation* + - llvm/**/CodeGen/PHIElimination* + - llvm/**/CodeGen/ProcessImplicitDefs.cpp + - llvm/**/CodeGen/Register* + - llvm/**/CodeGen/RegUsage* + - 
llvm/**/CodeGen/RenameIndependentSubregs.cpp + - llvm/**/CodeGen/SlotIndexes.h + - llvm/**/CodeGen/SpillPlacement* + - llvm/**/CodeGen/SplitKit* + - llvm/**/CodeGen/VirtRegMap.h + - llvm/include/PBQP/** + - llvm/include/PBQPRAConstraint.h + - llvm/include/llvm/CodeGen/Spiller.h + - llvm/**/*RegAlloc lldb: - - lldb/** + - changed-files: + - any-glob-to-any-file: + - lldb/** lldb-dap: - - lldb/tools/lldb-dap/** + - changed-files: + - any-glob-to-any-file: + - lldb/tools/lldb-dap/** backend:AMDGPU: - - '**/*amdgpu*' - - '**/*AMDGPU*' - - '**/*amdgpu*/**' - - '**/*AMDGPU*/**' + - changed-files: + - any-glob-to-any-file: + - '**/*amdgpu*' + - '**/*AMDGPU*' + - '**/*amdgpu*/**' + - '**/*AMDGPU*/**' backend:NVPTX: - - 'llvm/**/*nvvm*' - - 'llvm/**/*NVVM*' - - 'llvm/**/*nvptx*' - - 'llvm/**/*NVPTX*' - - 'llvm/**/*nvvm*/**' - - 'llvm/**/*NVVM*/**' - - 'llvm/**/*nvptx*/**' - - 'llvm/**/*NVPTX*/**' + - changed-files: + - any-glob-to-any-file: + - 'llvm/**/*nvvm*' + - 'llvm/**/*NVVM*' + - 'llvm/**/*nvptx*' + - 'llvm/**/*NVPTX*' + - 'llvm/**/*nvvm*/**' + - 'llvm/**/*NVVM*/**' + - 'llvm/**/*nvptx*/**' + - 'llvm/**/*NVPTX*/**' backend:MIPS: - - '**/*mips*' - - '**/*Mips*' - - '**/*mips*/**' - - '**/*Mips*/**' + - changed-files: + - any-glob-to-any-file: + - '**/*mips*' + - '**/*Mips*' + - '**/*mips*/**' + - '**/*Mips*/**' backend:RISC-V: - - '**/*riscv*' - - '**/*RISCV*' - - '**/*riscv*/**' - - '**/*RISCV*/**' + - changed-files: + - any-glob-to-any-file: + - '**/*riscv*' + - '**/*RISCV*' + - '**/*riscv*/**' + - '**/*RISCV*/**' backend:Xtensa: - - '**/*xtensa*' - - '**/*Xtensa*' - - '**/*xtensa*/**' - - '**/*Xtensa*/**' + - changed-files: + - any-glob-to-any-file: + - '**/*xtensa*' + - '**/*Xtensa*' + - '**/*xtensa*/**' + - '**/*Xtensa*/**' lld:coff: - - lld/**/COFF/** - - lld/Common/** + - changed-files: + - any-glob-to-any-file: + - lld/**/COFF/** + - lld/Common/** lld:elf: - - lld/**/ELF/** - - lld/Common/** + - changed-files: + - any-glob-to-any-file: + - lld/**/ELF/** + - 
lld/Common/** lld:macho: - - lld/**/MachO/** - - lld/Common/** + - changed-files: + - any-glob-to-any-file: + - lld/**/MachO/** + - lld/Common/** lld:wasm: - - lld/**/wasm/** - - lld/Common/** + - changed-files: + - any-glob-to-any-file: + - lld/**/wasm/** + - lld/Common/** backend:ARC: - - llvm/lib/Target/ARC/** - - clang/lib/Basic/Targets/ARC.h - - clang/lib/Basic/Targets/ARC.cpp - - clang/lib/CodeGen/Targets/ARC.cpp + - changed-files: + - any-glob-to-any-file: + - llvm/lib/Target/ARC/** + - clang/lib/Basic/Targets/ARC.h + - clang/lib/Basic/Targets/ARC.cpp + - clang/lib/CodeGen/Targets/ARC.cpp backend:ARM: - - llvm/include/llvm/IR/IntrinsicsARM.td - - llvm/test/MC/ARM/** - - llvm/lib/Target/ARM/** - - llvm/test/CodeGen/ARM/** - - clang/lib/Basic/Targets/ARM* - - clang/lib/Driver/ToolChains/Arch/ARM.* - - clang/lib/CodeGen/Targets/ARM.cpp - - clang/include/clang/Basic/BuiltinsARM* - - llvm/test/MC/DisasemblerARM/** - - clang/include/clang/Sema/SemaARM.h - - clang/lib/Sema/SemaARM.cpp + - changed-files: + - any-glob-to-any-file: + - llvm/include/llvm/IR/IntrinsicsARM.td + - llvm/test/MC/ARM/** + - llvm/lib/Target/ARM/** + - llvm/test/CodeGen/ARM/** + - clang/lib/Basic/Targets/ARM* + - clang/lib/Driver/ToolChains/Arch/ARM.* + - clang/lib/CodeGen/Targets/ARM.cpp + - clang/include/clang/Basic/BuiltinsARM* + - llvm/test/MC/Disassembler/ARM/** + - clang/include/clang/Sema/SemaARM.h + - clang/lib/Sema/SemaARM.cpp backend:AArch64: - - llvm/include/llvm/IR/IntrinsicsAArch64.td - - llvm/test/MC/AArch64/** - - 
llvm/lib/Target/AArch64/** + - llvm/test/CodeGen/AArch64/** + - clang/lib/Basic/Targets/AArch64* + - clang/lib/Driver/ToolChains/Arch/AArch64.* + - clang/lib/CodeGen/Targets/AArch64.cpp + - clang/include/clang/Basic/BuiltinsAArch64* + - llvm/test/MC/Disassembler/AArch64/** + - clang/include/clang/Sema/SemaARM.h + - clang/lib/Sema/SemaARM.cpp backend:CSKY: - - llvm/lib/Target/CSKY/** - - llvm/include/llvm/TargetParser/CSKYTargetParser.def - - llvm/include/llvm/TargetParser/CSKYTargetParser.h - - llvm/include/llvm/BinaryFormat/ELFRelocs/CSKY.def - - llvm/lib/TargetParser/CSKYTargetParser.cpp - - llvm/lib/Support/CSKYAttributes.cpp - - llvm/lib/Support/CSKYAttributeParser.cpp - - clang/lib/Basic/Targets/CSKY.h - - clang/lib/Basic/Targets/CSKY.cpp - - clang/lib/CodeGen/Targets/CSKY.cpp - - clang/lib/Driver/ToolChains/CSKY* + - changed-files: + - any-glob-to-any-file: + - llvm/lib/Target/CSKY/** + - llvm/include/llvm/TargetParser/CSKYTargetParser.def + - llvm/include/llvm/TargetParser/CSKYTargetParser.h + - llvm/include/llvm/BinaryFormat/ELFRelocs/CSKY.def + - llvm/lib/TargetParser/CSKYTargetParser.cpp + - llvm/lib/Support/CSKYAttributes.cpp + - llvm/lib/Support/CSKYAttributeParser.cpp + - clang/lib/Basic/Targets/CSKY.h + - clang/lib/Basic/Targets/CSKY.cpp + - clang/lib/CodeGen/Targets/CSKY.cpp + - clang/lib/Driver/ToolChains/CSKY* backend:Hexagon: - - clang/include/clang/Basic/BuiltinsHexagon*.def - - clang/include/clang/Sema/SemaHexagon.h - - clang/lib/Basic/Targets/Hexagon.* - - clang/lib/CodeGen/Targets/Hexagon.cpp - - clang/lib/Driver/ToolChains/Hexagon.* - - clang/lib/Sema/SemaHexagon.cpp - - lld/ELF/Arch/Hexagon.cpp - - lldb/source/Plugins/ABI/Hexagon/** - - lldb/source/Plugins/DynamicLoader/Hexagon-DYLD/** - - llvm/include/llvm/BinaryFormat/ELFRelocs/Hexagon.def - - llvm/include/llvm/IR/IntrinsicsHexagon* - - llvm/include/llvm/Support/Hexagon* - - llvm/lib/Support/Hexagon* - - llvm/lib/Target/Hexagon/** - - llvm/test/CodeGen/Hexagon/** - - 
llvm/test/CodeGen/*/Hexagon/** - - llvm/test/DebugInfo/*/Hexagon/** - - llvm/test/Transforms/*/Hexagon - - llvm/test/MC/Disassembler/Hexagon/** - - llvm/test/MC/Hexagon/** - - llvm/test/tools/llvm-objdump/ELF/Hexagon/** + - changed-files: + - any-glob-to-any-file: + - clang/include/clang/Basic/BuiltinsHexagon*.def + - clang/include/clang/Sema/SemaHexagon.h + - clang/lib/Basic/Targets/Hexagon.* + - clang/lib/CodeGen/Targets/Hexagon.cpp + - clang/lib/Driver/ToolChains/Hexagon.* + - clang/lib/Sema/SemaHexagon.cpp + - lld/ELF/Arch/Hexagon.cpp + - lldb/source/Plugins/ABI/Hexagon/** + - lldb/source/Plugins/DynamicLoader/Hexagon-DYLD/** + - llvm/include/llvm/BinaryFormat/ELFRelocs/Hexagon.def + - llvm/include/llvm/IR/IntrinsicsHexagon* + - llvm/include/llvm/Support/Hexagon* + - llvm/lib/Support/Hexagon* + - llvm/lib/Target/Hexagon/** + - llvm/test/CodeGen/Hexagon/** + - llvm/test/CodeGen/*/Hexagon/** + - llvm/test/DebugInfo/*/Hexagon/** + - llvm/test/Transforms/*/Hexagon + - llvm/test/MC/Disassembler/Hexagon/** + - llvm/test/MC/Hexagon/** + - llvm/test/tools/llvm-objdump/ELF/Hexagon/** backend:Lanai: - - llvm/lib/Target/Lanai/** - - clang/lib/Basic/Targets/Lanai.h - - clang/lib/Basic/Targets/Lanai.cpp - - clang/lib/CodeGen/Targets/Lanai.cpp - - clang/lib/Driver/ToolChains/Lanai* + - changed-files: + - any-glob-to-any-file: + - llvm/lib/Target/Lanai/** + - clang/lib/Basic/Targets/Lanai.h + - clang/lib/Basic/Targets/Lanai.cpp + - clang/lib/CodeGen/Targets/Lanai.cpp + - clang/lib/Driver/ToolChains/Lanai* backend:loongarch: - - llvm/include/llvm/IR/IntrinsicsLoongArch.td - - llvm/test/MC/LoongArch/** - - llvm/lib/Target/LoongArch/** - - llvm/test/CodeGen/LoongArch/** - - clang/lib/Basic/Targets/LoongArch* - - clang/lib/Driver/ToolChains/Arch/LoongArch.* - - clang/lib/CodeGen/Targets/LoongArch.cpp - - clang/include/clang/Basic/BuiltinsLoongArch* - - clang/include/clang/Sema/SemaLoongArch.h - - clang/lib/Sema/SemaLoongArch.cpp + - changed-files: + - any-glob-to-any-file: + - 
llvm/include/llvm/IR/IntrinsicsLoongArch.td + - llvm/test/MC/LoongArch/** + - llvm/lib/Target/LoongArch/** + - llvm/test/CodeGen/LoongArch/** + - clang/lib/Basic/Targets/LoongArch* + - clang/lib/Driver/ToolChains/Arch/LoongArch.* + - clang/lib/CodeGen/Targets/LoongArch.cpp + - clang/include/clang/Basic/BuiltinsLoongArch* + - clang/include/clang/Sema/SemaLoongArch.h + - clang/lib/Sema/SemaLoongArch.cpp backend:MSP430: - - llvm/include/llvm/IR/IntrinsicsMSP430.td - - llvm/test/MC/MSP430/** - - llvm/lib/Target/MSP430/** - - llvm/test/CodeGen/MSP430/** - - clang/lib/Basic/Targets/MSP430* - - clang/lib/Driver/ToolChains/Arch/MSP430.* - - clang/lib/CodeGen/Targets/MSP430.cpp - - clang/include/clang/Basic/BuiltinsMSP430* - - llvm/test/MC/Disassembler/MSP430/** + - changed-files: + - any-glob-to-any-file: + - llvm/include/llvm/IR/IntrinsicsMSP430.td + - llvm/test/MC/MSP430/** + - llvm/lib/Target/MSP430/** + - llvm/test/CodeGen/MSP430/** + - clang/lib/Basic/Targets/MSP430* + - clang/lib/Driver/ToolChains/Arch/MSP430.* + - clang/lib/CodeGen/Targets/MSP430.cpp + - clang/include/clang/Basic/BuiltinsMSP430* + - llvm/test/MC/Disassembler/MSP430/** backend:Sparc: - - llvm/include/llvm/IR/IntrinsicsSparc.td - - llvm/test/MC/Sparc/** - - llvm/lib/Target/Sparc/** - - llvm/test/CodeGen/Sparc/** - - clang/lib/Basic/Targets/Sparc* - - clang/lib/Driver/ToolChains/Arch/Sparc.* - - clang/lib/CodeGen/Targets/Sparc.cpp - - clang/include/clang/Basic/BuiltinsSparc* - - llvm/test/MC/Disassembler/Sparc/** + - changed-files: + - any-glob-to-any-file: + - llvm/include/llvm/IR/IntrinsicsSparc.td + - llvm/test/MC/Sparc/** + - llvm/lib/Target/Sparc/** + - llvm/test/CodeGen/Sparc/** + - clang/lib/Basic/Targets/Sparc* + - clang/lib/Driver/ToolChains/Arch/Sparc.* + - clang/lib/CodeGen/Targets/Sparc.cpp + - clang/include/clang/Basic/BuiltinsSparc* + - llvm/test/MC/Disassembler/Sparc/** backend:WebAssembly: - - llvm/lib/Target/WebAssembly/** - - llvm/test/CodeGen/WebAssembly/** - - 
clang/lib/Basic/Targets/WebAssembly* - - clang/include/clang/Basic/BuiltinsWebAssembly.def - - clang/include/clang/Basic/WebAssemblyReferenceTypes.def - - clang/lib/CodeGen/Targets/WebAssembly* - - llvm/include/llvm/IR/IntinsicsWebAssembly.td - - llvm/include/llvm/Object/Wasm* - - llvm/lib/CodeGen/AsmPrinter/Wasm* - - llvm/lib/CodeGen/Wasm* - - llvm/lib/MC/MCParser/Wasm* - - llvm/lib/MC/Wasm* - - llvm/lib/ObjCopy/wasm/** - - llvm/lib/Object/Wasm* - - clang/lib/Driver/Toolchains/WebAssembly* - - clang/lib/Headers/wasm_simd128.h - - clang/test/CodeGen/WebAssembly/** - - clang/test/SemaCXX/*wasm* - - clang/test/Sema/*wasm* - - llvm/include/llvm/BinaryFormat/Wasm.h - - llvm/unittests/Target/WebAssembly/** - - llvm/test/DebugInfo/WebAssembly/** - - llvm/test/MC/WebAssembly/** - - clang/include/clang/Sema/SemaWasm.h - - clang/lib/Sema/SemaLoongWasm.cpp + - changed-files: + - any-glob-to-any-file: + - llvm/lib/Target/WebAssembly/** + - llvm/test/CodeGen/WebAssembly/** + - clang/lib/Basic/Targets/WebAssembly* + - clang/include/clang/Basic/BuiltinsWebAssembly.def + - clang/include/clang/Basic/WebAssemblyReferenceTypes.def + - clang/lib/CodeGen/Targets/WebAssembly* + - llvm/include/llvm/IR/IntrinsicsWebAssembly.td + - llvm/include/llvm/Object/Wasm* + - llvm/lib/CodeGen/AsmPrinter/Wasm* + - llvm/lib/CodeGen/Wasm* + - llvm/lib/MC/MCParser/Wasm* + - llvm/lib/MC/Wasm* + - llvm/lib/ObjCopy/wasm/** + - llvm/lib/Object/Wasm* + - clang/lib/Driver/ToolChains/WebAssembly* + - clang/lib/Headers/wasm_simd128.h + - clang/test/CodeGen/WebAssembly/** + - clang/test/SemaCXX/*wasm* + - clang/test/Sema/*wasm* + - llvm/include/llvm/BinaryFormat/Wasm.h + - llvm/unittests/Target/WebAssembly/** + - llvm/test/DebugInfo/WebAssembly/** + - llvm/test/MC/WebAssembly/** + - clang/include/clang/Sema/SemaWasm.h + - clang/lib/Sema/SemaWasm.cpp backend:X86: - - llvm/include/llvm/IR/IntrinsicsX86.td - - llvm/lib/Target/X86/** - - llvm/test/CodeGen/X86/** - - llvm/test/MC/X86/** - - 
llvm/test/MC/Disassembler/X86/** - - llvm/test/Analysis/CostModel/X86/** - - llvm/test/tools/llvm-mca/X86/** - - clang/lib/Basic/Targets/X86/** - - clang/lib/Driver/ToolChains/Arch/X86.* - - clang/lib/CodeGen/Targets/X86.* - - clang/lib/Headers/** - - clang/test/CodeGen/X86/** - - clang/include/clang/Basic/BuiltinsX86* - - llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h - - llvm/include/llvm/TargetParser/X86* - - llvm/lib/TargetParser/X86* - - llvm/utils/TableGen/X86* - - clang/include/clang/Sema/SemaX86.h - - clang/lib/Sema/SemaX86.cpp + - changed-files: + - any-glob-to-any-file: + - llvm/include/llvm/IR/IntrinsicsX86.td + - llvm/lib/Target/X86/** + - llvm/test/CodeGen/X86/** + - llvm/test/MC/X86/** + - llvm/test/MC/Disassembler/X86/** + - llvm/test/Analysis/CostModel/X86/** + - llvm/test/tools/llvm-mca/X86/** + - clang/lib/Basic/Targets/X86/** + - clang/lib/Driver/ToolChains/Arch/X86.* + - clang/lib/CodeGen/Targets/X86.* + - clang/lib/Headers/** + - clang/test/CodeGen/X86/** + - clang/include/clang/Basic/BuiltinsX86* + - llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h + - llvm/include/llvm/TargetParser/X86* + - llvm/lib/TargetParser/X86* + - llvm/utils/TableGen/X86* + - clang/include/clang/Sema/SemaX86.h + - clang/lib/Sema/SemaX86.cpp backend:PowerPC: - - llvm/include/llvm/BinaryFormat/ELFRelocs/PowerPC* - - llvm/include/llvm/BinaryFormat/XCOFF.h - - llvm/include/llvm/IR/IntrinsicsPowerPC.td - - llvm/lib/CodeGen/AsmPrinter/AIXException.cpp - - llvm/lib/Target/PowerPC/** - - llvm/test/Analysis/**/PowerPC/** - - llvm/test/CodeGen/PowerPC/** - - llvm/test/CodeGen/MIR/PowerPC/** - - llvm/test/DebugInfo/XCOFF/** - - llvm/test/DebugInfo/PowerPC/** - - llvm/test/LTO/PowerPC/** - - llvm/test/MC/Disassembler/PowerPC/** - - llvm/test/MC/PowerPC/** - - llvm/test/MC/XCOFF/** - - llvm/test/Transforms/**/PowerPC/** - - clang/include/clang/Basic/BuiltinsPPC.* - - clang/lib/Basic/Targets/PPC.* - - clang/lib/CodeGen/Targets/PPC.cpp - - 
clang/lib/Driver/ToolChains/PPC* - - clang/lib/Driver/ToolChains/AIX* - - clang/lib/Driver/ToolChains/Arch/PPC.* - - clang/test/CodeGen/PowerPC/** - - clang/include/clang/Sema/SemaPPC.h - - clang/lib/Sema/SemaPPC.cpp + - changed-files: + - any-glob-to-any-file: + - llvm/include/llvm/BinaryFormat/ELFRelocs/PowerPC* + - llvm/include/llvm/BinaryFormat/XCOFF.h + - llvm/include/llvm/IR/IntrinsicsPowerPC.td + - llvm/lib/CodeGen/AsmPrinter/AIXException.cpp + - llvm/lib/Target/PowerPC/** + - llvm/test/Analysis/**/PowerPC/** + - llvm/test/CodeGen/PowerPC/** + - llvm/test/CodeGen/MIR/PowerPC/** + - llvm/test/DebugInfo/XCOFF/** + - llvm/test/DebugInfo/PowerPC/** + - llvm/test/LTO/PowerPC/** + - llvm/test/MC/Disassembler/PowerPC/** + - llvm/test/MC/PowerPC/** + - llvm/test/MC/XCOFF/** + - llvm/test/Transforms/**/PowerPC/** + - clang/include/clang/Basic/BuiltinsPPC.* + - clang/lib/Basic/Targets/PPC.* + - clang/lib/CodeGen/Targets/PPC.cpp + - clang/lib/Driver/ToolChains/PPC* + - clang/lib/Driver/ToolChains/AIX* + - clang/lib/Driver/ToolChains/Arch/PPC.* + - clang/test/CodeGen/PowerPC/** + - clang/include/clang/Sema/SemaPPC.h + - clang/lib/Sema/SemaPPC.cpp backend:SystemZ: - - llvm/include/llvm/BinaryFormat/ELFRelocs/SystemZ* - - llvm/include/llvm/BinaryFormat/GOFF.h - - llvm/include/llvm/IR/IntrinsicsSystemZ.td - - llvm/lib/Target/SystemZ/** - - llvm/test/Analysis/**/SystemZ/** - - llvm/test/CodeGen/SystemZ/** - - llvm/test/DebugInfo/SystemZ/** - - llvm/test/ExecutionEngine/**/SystemZ/** - - llvm/test/MC/Disassembler/SystemZ/** - - llvm/test/MC/GOFF/** - - llvm/test/MC/SystemZ/** - - llvm/test/Transforms/**/SystemZ/** - - clang/include/clang/Basic/BuiltinsSystemZ.* - - clang/lib/Basic/Targets/SystemZ.* - - clang/lib/CodeGen/Targets/SystemZ.cpp - - clang/lib/Driver/ToolChains/ZOS* - - clang/lib/Driver/ToolChains/Arch/SystemZ.* - - clang/test/CodeGen/SystemZ/** - - clang/include/clang/Sema/SemaSystemZ.h - - clang/lib/Sema/SemaSystemZ.cpp + - changed-files: + - 
any-glob-to-any-file: + - llvm/include/llvm/BinaryFormat/ELFRelocs/SystemZ* + - llvm/include/llvm/BinaryFormat/GOFF.h + - llvm/include/llvm/IR/IntrinsicsSystemZ.td + - llvm/lib/Target/SystemZ/** + - llvm/test/Analysis/**/SystemZ/** + - llvm/test/CodeGen/SystemZ/** + - llvm/test/DebugInfo/SystemZ/** + - llvm/test/ExecutionEngine/**/SystemZ/** + - llvm/test/MC/Disassembler/SystemZ/** + - llvm/test/MC/GOFF/** + - llvm/test/MC/SystemZ/** + - llvm/test/Transforms/**/SystemZ/** + - clang/include/clang/Basic/BuiltinsSystemZ.* + - clang/lib/Basic/Targets/SystemZ.* + - clang/lib/CodeGen/Targets/SystemZ.cpp + - clang/lib/Driver/ToolChains/ZOS* + - clang/lib/Driver/ToolChains/Arch/SystemZ.* + - clang/test/CodeGen/SystemZ/** + - clang/include/clang/Sema/SemaSystemZ.h + - clang/lib/Sema/SemaSystemZ.cpp third-party:unittests: - - third-party/unittests/** + - changed-files: + - any-glob-to-any-file: + - third-party/unittests/** third-party:benchmark: - - third-party/benchmark/** + - changed-files: + - any-glob-to-any-file: + - third-party/benchmark/** llvm:binary-utilities: - - llvm/docs/CommandGuide/llvm-* - - llvm/include/llvm/BinaryFormat/** - - llvm/include/llvm/DebugInfo/Symbolize/** - - llvm/include/llvm/ObjCopy/** - - llvm/include/llvm/Object/** - - llvm/lib/BinaryFormat/** - - llvm/lib/DebugInfo/Symbolize/** - - llvm/lib/ObjCopy/** - - llvm/lib/Object/** - - llvm/test/Object/** - - llvm/test/tools/llvm-ar/** - - llvm/test/tools/llvm-cxxfilt/** - - llvm/test/tools/llvm-nm/** - - llvm/test/tools/llvm-objcopy/** - - llvm/test/tools/llvm-objdump/** - - llvm/test/tools/llvm-readobj/** - - llvm/test/tools/llvm-size/** - - llvm/test/tools/llvm-strings/** - - llvm/test/tools/llvm-symbolizer/** - - llvm/tools/llvm-ar/** - - llvm/tools/llvm-cxxfilt/** - - llvm/tools/llvm-nm/** - - llvm/tools/llvm-objcopy/** - - llvm/tools/llvm-objdump/** - - llvm/tools/llvm-readobj/** - - llvm/tools/llvm-size/** - - llvm/tools/llvm-strings/** - - llvm/tools/llvm-symbolizer/** + - changed-files: + - 
any-glob-to-any-file: + - llvm/docs/CommandGuide/llvm-* + - llvm/include/llvm/BinaryFormat/** + - llvm/include/llvm/DebugInfo/Symbolize/** + - llvm/include/llvm/ObjCopy/** + - llvm/include/llvm/Object/** + - llvm/lib/BinaryFormat/** + - llvm/lib/DebugInfo/Symbolize/** + - llvm/lib/ObjCopy/** + - llvm/lib/Object/** + - llvm/test/Object/** + - llvm/test/tools/llvm-ar/** + - llvm/test/tools/llvm-cxxfilt/** + - llvm/test/tools/llvm-nm/** + - llvm/test/tools/llvm-objcopy/** + - llvm/test/tools/llvm-objdump/** + - llvm/test/tools/llvm-readobj/** + - llvm/test/tools/llvm-size/** + - llvm/test/tools/llvm-strings/** + - llvm/test/tools/llvm-symbolizer/** + - llvm/tools/llvm-ar/** + - llvm/tools/llvm-cxxfilt/** + - llvm/tools/llvm-nm/** + - llvm/tools/llvm-objcopy/** + - llvm/tools/llvm-objdump/** + - llvm/tools/llvm-readobj/** + - llvm/tools/llvm-size/** + - llvm/tools/llvm-strings/** + - llvm/tools/llvm-symbolizer/** clang:openmp: - - clang/include/clang/Basic/OpenMP* - - clang/include/clang/AST/OpenMPClause.h - - clang/include/clang/AST/DeclOpenMP.h - - clang/include/clang/AST/ExprOpenMP.h - - clang/include/clang/AST/StmtOpenMP.h - - clang/lib/AST/DeclOpenMP.cpp - - clang/lib/AST/OpenMPClause.cpp - - clang/lib/AST/StmtOpenMP.cpp - - clang/lib/Headers/openmp_wrappers/** - - clang/lib/Parse/ParseOpenMP.cpp - - clang/lib/Basic/OpenMPKinds.cpp - - clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp - - clang/lib/Driver/ToolChains/AMDGPUOpenMP.h - - clang/lib/CodeGen/CgStmtOpenMP.cpp - - clang/lib/CodeGen/CGOpenMP* - - clang/lib/Sema/SemaOpenMP.cpp - - clang/test/OpenMP/** - - clang/test/AST/ast-dump-openmp-* - - llvm/lib/Frontend/OpenMP/** - - llvm/lib/Transforms/IPO/OpenMPOpt.cpp - - llvm/include/llvm/Frontend/OpenMP/** - - llvm/include/llvm/Transforms/IPO/OpenMPOpt.h - - llvm/unittests/Frontend/OpenMP* - - llvm/test/Transforms/OpenMP/** + - changed-files: + - any-glob-to-any-file: + - clang/include/clang/Basic/OpenMP* + - clang/include/clang/AST/OpenMPClause.h + - 
clang/include/clang/AST/DeclOpenMP.h + - clang/include/clang/AST/ExprOpenMP.h + - clang/include/clang/AST/StmtOpenMP.h + - clang/lib/AST/DeclOpenMP.cpp + - clang/lib/AST/OpenMPClause.cpp + - clang/lib/AST/StmtOpenMP.cpp + - clang/lib/Headers/openmp_wrappers/** + - clang/lib/Parse/ParseOpenMP.cpp + - clang/lib/Basic/OpenMPKinds.cpp + - clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp + - clang/lib/Driver/ToolChains/AMDGPUOpenMP.h + - clang/lib/CodeGen/CGStmtOpenMP.cpp + - clang/lib/CodeGen/CGOpenMP* + - clang/lib/Sema/SemaOpenMP.cpp + - clang/test/OpenMP/** + - clang/test/AST/ast-dump-openmp-* + - llvm/lib/Frontend/OpenMP/** + - llvm/lib/Transforms/IPO/OpenMPOpt.cpp + - llvm/include/llvm/Frontend/OpenMP/** + - llvm/include/llvm/Transforms/IPO/OpenMPOpt.h + - llvm/unittests/Frontend/OpenMP* + - llvm/test/Transforms/OpenMP/** clang:temporal-safety: - - clang/include/clang/Analysis/Analyses/LifetimeSafety/** - - clang/lib/Analysis/LifetimeSafety/** - - clang/unittests/Analysis/LifetimeSafety* - - clang/test/Sema/*lifetime-safety* - - clang/test/Sema/*lifetime-analysis* - - clang/test/Analysis/LifetimeSafety/** + - changed-files: + - any-glob-to-any-file: + - clang/include/clang/Analysis/Analyses/LifetimeSafety/** + - clang/lib/Analysis/LifetimeSafety/** + - clang/unittests/Analysis/LifetimeSafety* + - clang/test/Sema/*lifetime-safety* + - clang/test/Sema/*lifetime-analysis* + - clang/test/Analysis/LifetimeSafety/** clang:as-a-library: - - clang/tools/libclang/** - - clang/bindings/** - - clang/include/clang-c/** - - clang/test/LibClang/** - - clang/unittest/libclang/** + - changed-files: + - any-glob-to-any-file: + - clang/tools/libclang/** + - clang/bindings/** + - clang/include/clang-c/** + - clang/test/LibClang/** + - clang/unittest/libclang/** openmp:libomp: - - any: ['openmp/**', '!openmp/libomptarget/**'] + - changed-files: + - any-glob-to-any-file: + - 'openmp/**' openmp:libomptarget: - - any: ['openmp/**', '!openmp/runtime/**'] + - changed-files: + - 
all-globs-to-all-files: + - openmp/** + - '!openmp/runtime/**' bazel: - - utils/bazel/** + - changed-files: + - any-glob-to-any-file: + - utils/bazel/** offload: - - offload/** + - changed-files: + - any-glob-to-any-file: + - offload/** tablegen: - - llvm/include/TableGen/** - - llvm/lib/TableGen/** - - llvm/utils/TableGen/** + - changed-files: + - any-glob-to-any-file: + - llvm/include/TableGen/** + - llvm/lib/TableGen/** + - llvm/utils/TableGen/** infrastructure: - - .ci/** + - changed-files: + - any-glob-to-any-file: + - .ci/** diff --git a/.github/workflows/new-prs.yml b/.github/workflows/new-prs.yml index e1f2e754c1a3d..f5826728d2c7b 100644 --- a/.github/workflows/new-prs.yml +++ b/.github/workflows/new-prs.yml @@ -26,7 +26,6 @@ jobs: # See https://docs.github.com/en/webhooks/webhook-events-and-payloads?actionType=opened#pull_request # for all the possible values. if: >- - (github.repository == 'llvm/llvm-project') && (github.event.action == 'opened') && (github.event.pull_request.author_association != 'COLLABORATOR') && (github.event.pull_request.author_association != 'CONTRIBUTOR') && @@ -67,9 +66,7 @@ jobs: github.event.pull_request.draft == false && github.event.pull_request.commits < 10 steps: - - uses: actions/labeler@ac9175f8a1f3625fd0d4fb234536d26811351594 # v4.3.0 + - uses: actions/labeler@634933edcd8ababfe52f92936142cc22ac488b1b # v6.0.1 with: configuration-path: .github/new-prs-labeler.yml - # workaround for https://github.com/actions/labeler/issues/112 - sync-labels: '' repo-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }} diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index 8b6656834cc06..64f371e9f8db8 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -188,9 +188,6 @@ jobs: with: ref: ${{ needs.prepare.outputs.ref }} - - name: Install Ninja - uses: llvm/actions/install-ninja@5dd955034a6742a2e21d82bf165fcb1050ae7b49 # main - - name: Set Build Prefix id: 
setup-stage shell: bash diff --git a/bolt/test/lit.local.cfg b/bolt/test/lit.local.cfg index 8a61d11f5825f..78cc73e53b680 100644 --- a/bolt/test/lit.local.cfg +++ b/bolt/test/lit.local.cfg @@ -5,7 +5,7 @@ if not "linux" in host_triple: host_triple = host_triple.split("-")[0] + "-unknown-linux-gnu" common_linker_flags = "-fuse-ld=lld -Wl,--unresolved-symbols=ignore-all -Wl,--build-id=none -pie" -flags = f"--target={host_triple} -fPIE {common_linker_flags}" +flags = f"--target={host_triple} -fPIE {common_linker_flags} -mllvm -x86-asm-syntax=att" config.substitutions.insert(0, ("%cflags", f"%cflags {flags}")) config.substitutions.insert(0, ("%cxxflags", f"%cxxflags {flags}")) diff --git a/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp b/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp index 47e859d21e451..c438889e22ab7 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp @@ -248,6 +248,12 @@ void AvoidCStyleCastsCheck::check(const MatchFinder::MatchResult &Result) { } break; } + if (DestType->isVoidPointerType() && SourceType->isPointerType() && + !SourceType->getPointeeType()->isPointerType()) { + ReplaceWithNamedCast("reinterpret_cast"); + return; + } + [[fallthrough]]; case clang::CK_IntegralCast: // Convert integral and no-op casts between builtin types and enums to diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 743397e3ec6ce..dc8abd88899a4 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -426,7 +426,7 @@ Changes in existing checks - Improved :doc:`google-readability-casting ` check by adding fix-it - notes for downcasts. + notes for downcasts and casts to void pointer. 
- Improved :doc:`google-readability-todo ` check to accept the new TODO diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index 66fe6010a73dc..8bb112f3d1832 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -462,9 +462,9 @@ Check aliases :doc:`cert-mem57-cpp `, :doc:`bugprone-default-operator-new-on-overaligned-type `, :doc:`cert-msc24-c `, :doc:`bugprone-unsafe-functions `, :doc:`cert-msc30-c `, :doc:`misc-predictable-rand `, - :doc:`cert-msc50-cpp `, :doc:`misc-predictable-rand `, :doc:`cert-msc32-c `, :doc:`bugprone-random-generator-seed `, :doc:`cert-msc33-c `, :doc:`bugprone-unsafe-functions `, + :doc:`cert-msc50-cpp `, :doc:`misc-predictable-rand `, :doc:`cert-msc51-cpp `, :doc:`bugprone-random-generator-seed `, :doc:`cert-msc54-cpp `, :doc:`bugprone-signal-handler `, :doc:`cert-oop11-cpp `, :doc:`performance-move-constructor-init `, diff --git a/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp b/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp index f9feb8854249b..d8e8c5017a9b2 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp @@ -108,6 +108,10 @@ void f(int a, double b, const char *cpc, const void *cpv, X *pX) { // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: {{.*}}; use static_cast {{.*}} // CHECK-FIXES: Y &rB = static_cast(*pX); + void *vp = (void *) pX; + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: {{.*}}; use reinterpret_cast + // CHECK-FIXES: void *vp = reinterpret_cast(pX); + const char *pc3 = (const char*)cpv; // CHECK-MESSAGES: :[[@LINE-1]]:21: warning: {{.*}}; use static_cast [ // CHECK-FIXES: const char *pc3 = static_cast(cpv); diff --git a/clang/docs/AllocToken.rst b/clang/docs/AllocToken.rst index 1a740e5e22c29..3f319e8be6421 100644 --- 
a/clang/docs/AllocToken.rst +++ b/clang/docs/AllocToken.rst @@ -52,8 +52,8 @@ change or removal. These may (experimentally) be selected with ``-Xclang The following command-line options affect generated token IDs: * ``-falloc-token-max=`` - Configures the maximum number of tokens. No max by default (tokens bounded - by ``SIZE_MAX``). + Configures the maximum number of token IDs. By default the number of tokens + is bounded by ``SIZE_MAX``. Querying Token IDs with ``__builtin_infer_alloc_token`` ======================================================= @@ -129,7 +129,7 @@ Fast ABI -------- An alternative ABI can be enabled with ``-fsanitize-alloc-token-fast-abi``, -which encodes the token ID hint in the allocation function name. +which encodes the token ID in the allocation function name. .. code-block:: c diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h index b9cad5340c940..b5f7f8746186a 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Facts.h @@ -42,12 +42,12 @@ class Fact { /// it. Otherwise, the source's loan set is merged into the destination's /// loan set. OriginFlow, - /// An origin escapes the function by flowing into the return value. - ReturnOfOrigin, /// An origin is used (eg. appears as l-value expression like DeclRefExpr). Use, /// A marker for a specific point in the code, for testing. TestPoint, + /// An origin that escapes the function scope (e.g., via return). 
+ OriginEscapes, }; private: @@ -136,16 +136,19 @@ class OriginFlowFact : public Fact { const OriginManager &OM) const override; }; -class ReturnOfOriginFact : public Fact { +class OriginEscapesFact : public Fact { OriginID OID; + const Expr *EscapeExpr; public: static bool classof(const Fact *F) { - return F->getKind() == Kind::ReturnOfOrigin; + return F->getKind() == Kind::OriginEscapes; } - ReturnOfOriginFact(OriginID OID) : Fact(Kind::ReturnOfOrigin), OID(OID) {} - OriginID getReturnedOriginID() const { return OID; } + OriginEscapesFact(OriginID OID, const Expr *EscapeExpr) + : Fact(Kind::OriginEscapes), OID(OID), EscapeExpr(EscapeExpr) {} + OriginID getEscapedOriginID() const { return OID; } + const Expr *getEscapeExpr() const { return EscapeExpr; }; void dump(llvm::raw_ostream &OS, const LoanManager &, const OriginManager &OM) const override; }; @@ -225,6 +228,9 @@ class FactManager { /// user-defined locations in the code. /// \note This is intended for testing only. llvm::StringMap getTestPoints() const; + /// Retrieves all the facts in the block containing Program Point P. + /// \note This is intended for testing only. + llvm::ArrayRef getBlockContaining(ProgramPoint P) const; unsigned getNumFacts() const { return NextFactID.Value; } diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/FactsGenerator.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/FactsGenerator.h index 4c8ab3f859a49..8ea37259c570b 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/FactsGenerator.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/FactsGenerator.h @@ -94,6 +94,10 @@ class FactsGenerator : public ConstStmtVisitor { FactManager &FactMgr; AnalysisDeclContext &AC; llvm::SmallVector CurrentBlockFacts; + // Collect origins that escape the function in this block (OriginEscapesFact), + // appended at the end of CurrentBlockFacts to ensure they appear after + // ExpireFact entries. 
+ llvm::SmallVector EscapesInCurrentBlock; // To distinguish between reads and writes for use-after-free checks, this map // stores the `UseFact` for each `DeclRefExpr`. We initially identify all // `DeclRefExpr`s as "read" uses. When an assignment is processed, the use diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h index 91ffbb169f947..b34a7f18b5809 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h @@ -42,6 +42,11 @@ class LifetimeSafetyReporter { virtual void reportUseAfterFree(const Expr *IssueExpr, const Expr *UseExpr, SourceLocation FreeLoc, Confidence Confidence) {} + + virtual void reportUseAfterReturn(const Expr *IssueExpr, + const Expr *EscapeExpr, + SourceLocation ExpiryLoc, + Confidence Confidence) {} }; /// The main entry point for the analysis. diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LiveOrigins.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LiveOrigins.h index c4f5f0e9ae46c..8ad17db83499d 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LiveOrigins.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LiveOrigins.h @@ -31,6 +31,9 @@ namespace clang::lifetimes::internal { +using CausingFactType = + ::llvm::PointerUnion; + enum class LivenessKind : uint8_t { Dead, // Not alive Maybe, // Live on some path but not all paths (may-be-live) @@ -43,7 +46,7 @@ struct LivenessInfo { /// multiple uses along different paths, this will point to the use appearing /// earlier in the translation unit. /// This is 'null' when the origin is not live. - const UseFact *CausingUseFact; + CausingFactType CausingFact; /// The kind of liveness of the origin. 
/// `Must`: The origin is live on all control-flow paths from the current @@ -56,17 +59,16 @@ struct LivenessInfo { /// while `Maybe`-be-alive suggests a potential one on some paths. LivenessKind Kind; - LivenessInfo() : CausingUseFact(nullptr), Kind(LivenessKind::Dead) {} - LivenessInfo(const UseFact *UF, LivenessKind K) - : CausingUseFact(UF), Kind(K) {} + LivenessInfo() : CausingFact(nullptr), Kind(LivenessKind::Dead) {} + LivenessInfo(CausingFactType CF, LivenessKind K) : CausingFact(CF), Kind(K) {} bool operator==(const LivenessInfo &Other) const { - return CausingUseFact == Other.CausingUseFact && Kind == Other.Kind; + return CausingFact == Other.CausingFact && Kind == Other.Kind; } bool operator!=(const LivenessInfo &Other) const { return !(*this == Other); } void Profile(llvm::FoldingSetNodeID &IDBuilder) const { - IDBuilder.AddPointer(CausingUseFact); + IDBuilder.AddPointer(CausingFact.getOpaqueValue()); IDBuilder.Add(Kind); } }; diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td index ad448766e665f..6fbd2222ab289 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.td +++ b/clang/include/clang/Basic/BuiltinsNVPTX.td @@ -579,6 +579,10 @@ def __nvvm_ff2bf16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float) def __nvvm_ff2bf16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>; def __nvvm_ff2bf16x2_rz : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>; def __nvvm_ff2bf16x2_rz_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>; +def __nvvm_ff2bf16x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX81>; +def __nvvm_ff2bf16x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX81>; +def __nvvm_ff2bf16x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX81>; +def __nvvm_ff2bf16x2_rz_relu_satfinite : 
NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX81>; def __nvvm_ff2bf16x2_rs : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; @@ -596,6 +600,10 @@ def __nvvm_ff2f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)" def __nvvm_ff2f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>; def __nvvm_ff2f16x2_rz : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>; def __nvvm_ff2f16x2_rz_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>; +def __nvvm_ff2f16x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX81>; +def __nvvm_ff2f16x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX81>; +def __nvvm_ff2f16x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX81>; +def __nvvm_ff2f16x2_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX81>; def __nvvm_ff2f16x2_rs : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; @@ -613,6 +621,19 @@ def __nvvm_f2bf16_rn : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; def __nvvm_f2bf16_rn_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; def __nvvm_f2bf16_rz : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; def __nvvm_f2bf16_rz_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; +def __nvvm_f2bf16_rn_satfinite : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX81>; +def __nvvm_f2bf16_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX81>; +def __nvvm_f2bf16_rz_satfinite : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX81>; +def __nvvm_f2bf16_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX81>; + +def __nvvm_f2f16_rn : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX70>; +def __nvvm_f2f16_rn_relu : 
NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX70>; +def __nvvm_f2f16_rz : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX70>; +def __nvvm_f2f16_rz_relu : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX70>; +def __nvvm_f2f16_rn_satfinite : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX81>; +def __nvvm_f2f16_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX81>; +def __nvvm_f2f16_rz_satfinite : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX81>; +def __nvvm_f2f16_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX81>; def __nvvm_f2tf32_rna : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX70>; def __nvvm_f2tf32_rna_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX81>; diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 7a14c6ec21a1a..3742746def75f 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -512,7 +512,7 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in def vperm2f128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">; } -let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def vpermilpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Constant int)">; def vpermilps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">; } @@ -528,6 +528,8 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">; def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">; def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">; + def vpermilpd256 : X86Builtin<"_Vector<4, 
double>(_Vector<4, double>, _Constant int)">; + def vpermilps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int)">; foreach Op = ["hadd", "hsub"] in { def Op#pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>)">; @@ -536,8 +538,6 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid } let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpermilpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">; - def vpermilps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int)">; def sqrtpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">; def sqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">; def rsqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">; @@ -2375,10 +2375,12 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128> def vcvttss2si32 : X86Builtin<"int(_Vector<4, float>, _Constant int)">; def vcvttss2usi32 : X86Builtin<"unsigned int(_Vector<4, float>, _Constant int)">; } - -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def vpermilpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Constant int)">; def vpermilps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Constant int)">; +} + +let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { def vpermilvarpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>)">; def vpermilvarps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>)">; } diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 3efed335720b0..f4aa20d0a25e4 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ 
b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10740,8 +10740,19 @@ def warn_lifetime_safety_loan_expires_permissive : Warning< def warn_lifetime_safety_loan_expires_strict : Warning< "object whose reference is captured may not live long enough">, InGroup, DefaultIgnore; + +def warn_lifetime_safety_return_stack_addr_permissive + : Warning<"address of stack memory is returned later">, + InGroup, + DefaultIgnore; +def warn_lifetime_safety_return_stack_addr_strict + : Warning<"address of stack memory may be returned later">, + InGroup, + DefaultIgnore; + def note_lifetime_safety_used_here : Note<"later used here">; def note_lifetime_safety_destroyed_here : Note<"destroyed here">; +def note_lifetime_safety_returned_here : Note<"returned here">; // For non-floating point, expressions of the form x == x or x != x // should result in a warning, since these always evaluate to a constant. diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 8aa89d8c8c807..3f042f8ddb5a1 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -566,8 +566,8 @@ class LangOptions : public LangOptionsBase { bool AtomicFineGrainedMemory = false; bool AtomicIgnoreDenormalMode = false; - /// Maximum number of allocation tokens (0 = no max), nullopt if none set (use - /// target default). + /// Maximum number of allocation tokens (0 = target SIZE_MAX), nullopt if none + /// set (use target SIZE_MAX). std::optional AllocTokenMax; /// The allocation token mode. 
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td index d346ad1efb588..97440e0cd4df0 100644 --- a/clang/include/clang/Options/Options.td +++ b/clang/include/clang/Options/Options.td @@ -2764,7 +2764,7 @@ defm sanitize_alloc_token_extended : BoolOption<"f", "sanitize-alloc-token-exten def falloc_token_max_EQ : Joined<["-"], "falloc-token-max=">, Group, Visibility<[ClangOption, CC1Option]>, MetaVarName<"">, - HelpText<"Limit to maximum N allocation tokens (0 = no max)">; + HelpText<"Limit to maximum N allocation tokens (0 = target SIZE_MAX)">; def falloc_token_mode_EQ : Joined<["-"], "falloc-token-mode=">, Group, Visibility<[CC1Option]>, diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 5a96320e12b6f..c63c2ce83c76f 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -1317,8 +1317,9 @@ static bool interp__builtin_infer_alloc_token(InterpState &S, CodePtr OpPC, uint64_t BitWidth = ASTCtx.getTypeSize(ASTCtx.getSizeType()); auto Mode = ASTCtx.getLangOpts().AllocTokenMode.value_or(llvm::DefaultAllocTokenMode); + auto MaxTokensOpt = ASTCtx.getLangOpts().AllocTokenMax; uint64_t MaxTokens = - ASTCtx.getLangOpts().AllocTokenMax.value_or(~0ULL >> (64 - BitWidth)); + MaxTokensOpt.value_or(0) ? *MaxTokensOpt : (~0ULL >> (64 - BitWidth)); // We do not read any of the arguments; discard them. 
for (int I = Call->getNumArgs() - 1; I >= 0; --I) @@ -4619,6 +4620,9 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case X86::BI__builtin_ia32_pshufd: case X86::BI__builtin_ia32_pshufd256: case X86::BI__builtin_ia32_pshufd512: + case X86::BI__builtin_ia32_vpermilps: + case X86::BI__builtin_ia32_vpermilps256: + case X86::BI__builtin_ia32_vpermilps512: return interp__builtin_ia32_shuffle_generic( S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { unsigned LaneBase = (DstIdx / 4) * 4; @@ -4627,6 +4631,22 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return std::make_pair(0, static_cast(LaneBase + Sel)); }); + case X86::BI__builtin_ia32_vpermilpd: + case X86::BI__builtin_ia32_vpermilpd256: + case X86::BI__builtin_ia32_vpermilpd512: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned Control) { + unsigned NumElemPerLane = 2; + unsigned BitsPerElem = 1; + unsigned MaskBits = 8; + unsigned IndexMask = 0x1; + unsigned Lane = DstIdx / NumElemPerLane; + unsigned LaneOffset = Lane * NumElemPerLane; + unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits; + unsigned Index = (Control >> BitIndex) & IndexMask; + return std::make_pair(0, static_cast(LaneOffset + Index)); + }); + case X86::BI__builtin_ia32_kandqi: case X86::BI__builtin_ia32_kandhi: case X86::BI__builtin_ia32_kandsi: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 74f6e3acb6b39..a9cff7f88d6f2 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -13023,7 +13023,10 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { case X86::BI__builtin_ia32_pshufd: case X86::BI__builtin_ia32_pshufd256: - case X86::BI__builtin_ia32_pshufd512: { + case X86::BI__builtin_ia32_pshufd512: + case X86::BI__builtin_ia32_vpermilps: + case X86::BI__builtin_ia32_vpermilps256: + case X86::BI__builtin_ia32_vpermilps512: { APValue R; if (!evalShuffleGeneric( 
Info, E, R, @@ -13040,6 +13043,25 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(R, E); } + case X86::BI__builtin_ia32_vpermilpd: + case X86::BI__builtin_ia32_vpermilpd256: + case X86::BI__builtin_ia32_vpermilpd512: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, [](unsigned DstIdx, unsigned Control) { + unsigned NumElemPerLane = 2; + unsigned BitsPerElem = 1; + unsigned MaskBits = 8; + unsigned IndexMask = 0x1; + unsigned Lane = DstIdx / NumElemPerLane; + unsigned LaneOffset = Lane * NumElemPerLane; + unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits; + unsigned Index = (Control >> BitIndex) & IndexMask; + return std::make_pair(0, static_cast(LaneOffset + Index)); + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_phminposuw128: { APValue Source; if (!Evaluate(Source, Info, E->getArg(0))) @@ -15559,8 +15581,9 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, auto Mode = Info.getLangOpts().AllocTokenMode.value_or(llvm::DefaultAllocTokenMode); uint64_t BitWidth = Info.Ctx.getTypeSize(Info.Ctx.getSizeType()); + auto MaxTokensOpt = Info.getLangOpts().AllocTokenMax; uint64_t MaxTokens = - Info.getLangOpts().AllocTokenMax.value_or(~0ULL >> (64 - BitWidth)); + MaxTokensOpt.value_or(0) ? *MaxTokensOpt : (~0ULL >> (64 - BitWidth)); auto MaybeToken = llvm::getAllocToken(Mode, *ATMD, MaxTokens); if (!MaybeToken) return Error(E, diag::note_constexpr_infer_alloc_token_stateful_mode); diff --git a/clang/lib/Analysis/LifetimeSafety/Checker.cpp b/clang/lib/Analysis/LifetimeSafety/Checker.cpp index c443c3a5d4f9b..1f7c282dadac2 100644 --- a/clang/lib/Analysis/LifetimeSafety/Checker.cpp +++ b/clang/lib/Analysis/LifetimeSafety/Checker.cpp @@ -43,7 +43,7 @@ namespace { /// Struct to store the complete context for a potential lifetime violation. struct PendingWarning { SourceLocation ExpiryLoc; // Where the loan expired. - const Expr *UseExpr; // Where the origin holding this loan was used. 
+ llvm::PointerUnion CausingFact; Confidence ConfidenceLevel; }; @@ -68,7 +68,7 @@ class LifetimeChecker { issuePendingWarnings(); } - /// Checks for use-after-free errors when a loan expires. + /// Checks for use-after-free & use-after-return errors when a loan expires. /// /// This method examines all live origins at the expiry point and determines /// if any of them hold the expiring loan. If so, it creates a pending @@ -83,7 +83,11 @@ class LifetimeChecker { LoanID ExpiredLoan = EF->getLoanID(); LivenessMap Origins = LiveOrigins.getLiveOriginsAt(EF); Confidence CurConfidence = Confidence::None; - const UseFact *BadUse = nullptr; + // The UseFact or OriginEscapesFact most indicative of a lifetime error, + // prioritized by earlier source location. + llvm::PointerUnion + BestCausingFact = nullptr; + for (auto &[OID, LiveInfo] : Origins) { LoanSet HeldLoans = LoanPropagation.getLoans(OID, EF); if (!HeldLoans.contains(ExpiredLoan)) @@ -92,17 +96,17 @@ class LifetimeChecker { Confidence NewConfidence = livenessKindToConfidence(LiveInfo.Kind); if (CurConfidence < NewConfidence) { CurConfidence = NewConfidence; - BadUse = LiveInfo.CausingUseFact; + BestCausingFact = LiveInfo.CausingFact; } } - if (!BadUse) + if (!BestCausingFact) return; // We have a use-after-free. 
Confidence LastConf = FinalWarningsMap.lookup(ExpiredLoan).ConfidenceLevel; if (LastConf >= CurConfidence) return; FinalWarningsMap[ExpiredLoan] = {/*ExpiryLoc=*/EF->getExpiryLoc(), - /*UseExpr=*/BadUse->getUseExpr(), + /*BestCausingFact=*/BestCausingFact, /*ConfidenceLevel=*/CurConfidence}; } @@ -112,8 +116,20 @@ class LifetimeChecker { for (const auto &[LID, Warning] : FinalWarningsMap) { const Loan &L = FactMgr.getLoanMgr().getLoan(LID); const Expr *IssueExpr = L.IssueExpr; - Reporter->reportUseAfterFree(IssueExpr, Warning.UseExpr, - Warning.ExpiryLoc, Warning.ConfidenceLevel); + llvm::PointerUnion + CausingFact = Warning.CausingFact; + Confidence Confidence = Warning.ConfidenceLevel; + SourceLocation ExpiryLoc = Warning.ExpiryLoc; + + if (const auto *UF = CausingFact.dyn_cast()) + Reporter->reportUseAfterFree(IssueExpr, UF->getUseExpr(), ExpiryLoc, + Confidence); + else if (const auto *OEF = + CausingFact.dyn_cast()) + Reporter->reportUseAfterReturn(IssueExpr, OEF->getEscapeExpr(), + ExpiryLoc, Confidence); + else + llvm_unreachable("Unhandled CausingFact type"); } } }; diff --git a/clang/lib/Analysis/LifetimeSafety/Dataflow.h b/clang/lib/Analysis/LifetimeSafety/Dataflow.h index de821bb17eb6b..05c20d6385368 100644 --- a/clang/lib/Analysis/LifetimeSafety/Dataflow.h +++ b/clang/lib/Analysis/LifetimeSafety/Dataflow.h @@ -170,8 +170,8 @@ class DataflowAnalysis { return D->transfer(In, *F->getAs()); case Fact::Kind::OriginFlow: return D->transfer(In, *F->getAs()); - case Fact::Kind::ReturnOfOrigin: - return D->transfer(In, *F->getAs()); + case Fact::Kind::OriginEscapes: + return D->transfer(In, *F->getAs()); case Fact::Kind::Use: return D->transfer(In, *F->getAs()); case Fact::Kind::TestPoint: @@ -184,7 +184,7 @@ class DataflowAnalysis { Lattice transfer(Lattice In, const IssueFact &) { return In; } Lattice transfer(Lattice In, const ExpireFact &) { return In; } Lattice transfer(Lattice In, const OriginFlowFact &) { return In; } - Lattice transfer(Lattice In, const 
ReturnOfOriginFact &) { return In; } + Lattice transfer(Lattice In, const OriginEscapesFact &) { return In; } Lattice transfer(Lattice In, const UseFact &) { return In; } Lattice transfer(Lattice In, const TestPointFact &) { return In; } }; diff --git a/clang/lib/Analysis/LifetimeSafety/Facts.cpp b/clang/lib/Analysis/LifetimeSafety/Facts.cpp index 190c038f46401..0ae7111c489e8 100644 --- a/clang/lib/Analysis/LifetimeSafety/Facts.cpp +++ b/clang/lib/Analysis/LifetimeSafety/Facts.cpp @@ -43,10 +43,10 @@ void OriginFlowFact::dump(llvm::raw_ostream &OS, const LoanManager &, OS << ")\n"; } -void ReturnOfOriginFact::dump(llvm::raw_ostream &OS, const LoanManager &, - const OriginManager &OM) const { - OS << "ReturnOfOrigin ("; - OM.dump(getReturnedOriginID(), OS); +void OriginEscapesFact::dump(llvm::raw_ostream &OS, const LoanManager &, + const OriginManager &OM) const { + OS << "OriginEscapes ("; + OM.dump(getEscapedOriginID(), OS); OS << ")\n"; } @@ -95,4 +95,14 @@ void FactManager::dump(const CFG &Cfg, AnalysisDeclContext &AC) const { } } +llvm::ArrayRef +FactManager::getBlockContaining(ProgramPoint P) const { + for (const auto &BlockToFactsVec : BlockToFacts) { + for (const Fact *F : BlockToFactsVec) + if (F == P) + return BlockToFactsVec; + } + return {}; +} + } // namespace clang::lifetimes::internal diff --git a/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp b/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp index 381ff99aae420..cb9a202b08968 100644 --- a/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp +++ b/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp @@ -58,6 +58,7 @@ void FactsGenerator::run() { // initializations and destructions are processed in the correct sequence. 
for (const CFGBlock *Block : *AC.getAnalysis()) { CurrentBlockFacts.clear(); + EscapesInCurrentBlock.clear(); for (unsigned I = 0; I < Block->size(); ++I) { const CFGElement &Element = Block->Elements[I]; if (std::optional CS = Element.getAs()) @@ -66,6 +67,8 @@ void FactsGenerator::run() { Element.getAs()) handleDestructor(*DtorOpt); } + CurrentBlockFacts.append(EscapesInCurrentBlock.begin(), + EscapesInCurrentBlock.end()); FactMgr.addBlockFacts(Block, CurrentBlockFacts); } } @@ -166,7 +169,8 @@ void FactsGenerator::VisitReturnStmt(const ReturnStmt *RS) { if (const Expr *RetExpr = RS->getRetValue()) { if (hasOrigin(RetExpr)) { OriginID OID = FactMgr.getOriginMgr().getOrCreate(*RetExpr); - CurrentBlockFacts.push_back(FactMgr.createFact(OID)); + EscapesInCurrentBlock.push_back( + FactMgr.createFact(OID, RetExpr)); } } } diff --git a/clang/lib/Analysis/LifetimeSafety/LiveOrigins.cpp b/clang/lib/Analysis/LifetimeSafety/LiveOrigins.cpp index 59f594e50fb46..57338122b4440 100644 --- a/clang/lib/Analysis/LifetimeSafety/LiveOrigins.cpp +++ b/clang/lib/Analysis/LifetimeSafety/LiveOrigins.cpp @@ -53,6 +53,14 @@ struct Lattice { } }; +static SourceLocation GetFactLoc(CausingFactType F) { + if (const auto *UF = F.dyn_cast()) + return UF->getUseExpr()->getExprLoc(); + if (const auto *OEF = F.dyn_cast()) + return OEF->getEscapeExpr()->getExprLoc(); + llvm_unreachable("unhandled causing fact in PointerUnion"); +} + /// The analysis that tracks which origins are live, with granular information /// about the causing use fact and confidence level. This is a backward /// analysis. @@ -74,11 +82,14 @@ class AnalysisImpl /// one. Lattice join(Lattice L1, Lattice L2) const { LivenessMap Merged = L1.LiveOrigins; - // Take the earliest UseFact to make the join hermetic and commutative. - auto CombineUseFact = [](const UseFact &A, - const UseFact &B) -> const UseFact * { - return A.getUseExpr()->getExprLoc() < B.getUseExpr()->getExprLoc() ? 
&A - : &B; + // Take the earliest Fact to make the join hermetic and commutative. + auto CombineCausingFact = [](CausingFactType A, + CausingFactType B) -> CausingFactType { + if (!A) + return B; + if (!B) + return A; + return GetFactLoc(A) < GetFactLoc(B) ? A : B; }; auto CombineLivenessKind = [](LivenessKind K1, LivenessKind K2) -> LivenessKind { @@ -93,12 +104,11 @@ class AnalysisImpl const LivenessInfo *L2) -> LivenessInfo { assert((L1 || L2) && "unexpectedly merging 2 empty sets"); if (!L1) - return LivenessInfo(L2->CausingUseFact, LivenessKind::Maybe); + return LivenessInfo(L2->CausingFact, LivenessKind::Maybe); if (!L2) - return LivenessInfo(L1->CausingUseFact, LivenessKind::Maybe); - return LivenessInfo( - CombineUseFact(*L1->CausingUseFact, *L2->CausingUseFact), - CombineLivenessKind(L1->Kind, L2->Kind)); + return LivenessInfo(L1->CausingFact, LivenessKind::Maybe); + return LivenessInfo(CombineCausingFact(L1->CausingFact, L2->CausingFact), + CombineLivenessKind(L1->Kind, L2->Kind)); }; return Lattice(utils::join( L1.LiveOrigins, L2.LiveOrigins, Factory, CombineLivenessInfo, @@ -120,6 +130,14 @@ class AnalysisImpl LivenessInfo(&UF, LivenessKind::Must))); } + /// An escaping origin (e.g., via return) makes the origin live with definite + /// confidence, as it dominates this program point. + Lattice transfer(Lattice In, const OriginEscapesFact &OEF) { + OriginID OID = OEF.getEscapedOriginID(); + return Lattice(Factory.add(In.LiveOrigins, OID, + LivenessInfo(&OEF, LivenessKind::Must))); + } + /// Issuing a new loan to an origin kills its liveness. 
Lattice transfer(Lattice In, const IssueFact &IF) { return Lattice(Factory.remove(In.LiveOrigins, IF.getOriginID())); diff --git a/clang/lib/Analysis/LifetimeSafety/LoanPropagation.cpp b/clang/lib/Analysis/LifetimeSafety/LoanPropagation.cpp index 0e6c194123df8..23ce1b78dfde2 100644 --- a/clang/lib/Analysis/LifetimeSafety/LoanPropagation.cpp +++ b/clang/lib/Analysis/LifetimeSafety/LoanPropagation.cpp @@ -58,12 +58,10 @@ static llvm::BitVector computePersistentOrigins(const FactManager &FactMgr, CheckOrigin(OF->getSrcOriginID()); break; } - case Fact::Kind::ReturnOfOrigin: - CheckOrigin(F->getAs()->getReturnedOriginID()); - break; case Fact::Kind::Use: CheckOrigin(F->getAs()->getUsedOrigin()); break; + case Fact::Kind::OriginEscapes: case Fact::Kind::Expire: case Fact::Kind::TestPoint: break; diff --git a/clang/lib/Basic/Targets/Sparc.cpp b/clang/lib/Basic/Targets/Sparc.cpp index d1a891092b0f5..d47eecb3cf058 100644 --- a/clang/lib/Basic/Targets/Sparc.cpp +++ b/clang/lib/Basic/Targets/Sparc.cpp @@ -19,6 +19,7 @@ using namespace clang; using namespace clang::targets; const char *const SparcTargetInfo::GCCRegNames[] = { + // clang-format off // Integer registers "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", "r21", @@ -30,6 +31,10 @@ const char *const SparcTargetInfo::GCCRegNames[] = { "f22", "f23", "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31", "f32", "f34", "f36", "f38", "f40", "f42", "f44", "f46", "f48", "f50", "f52", "f54", "f56", "f58", "f60", "f62", + + // Condition code registers + "icc", "fcc0", "fcc1", "fcc2", "fcc3", + // clang-format on }; ArrayRef SparcTargetInfo::getGCCRegNames() const { diff --git a/clang/lib/Interpreter/IncrementalExecutor.cpp b/clang/lib/Interpreter/IncrementalExecutor.cpp index 74a489f4b3ac9..11ab2cfaac17a 100644 --- a/clang/lib/Interpreter/IncrementalExecutor.cpp +++ b/clang/lib/Interpreter/IncrementalExecutor.cpp @@ -19,7 +19,6 @@ 
#include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" #include "llvm/ExecutionEngine/Orc/Debugging/DebuggerSupport.h" -#include "llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h" #include "llvm/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" @@ -47,8 +46,7 @@ // Force linking some of the runtimes that helps attaching to a debugger. LLVM_ATTRIBUTE_USED void linkComponents() { - llvm::errs() << (void *)&llvm_orc_registerJITLoaderGDBWrapper - << (void *)&llvm_orc_registerJITLoaderGDBAllocAction; + llvm::errs() << (void *)&llvm_orc_registerJITLoaderGDBAllocAction; } namespace clang { diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index d58eac19033ed..a9993dcb83f28 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -2861,6 +2861,18 @@ class LifetimeSafetyReporterImpl : public LifetimeSafetyReporter { << UseExpr->getEndLoc(); } + void reportUseAfterReturn(const Expr *IssueExpr, const Expr *EscapeExpr, + SourceLocation ExpiryLoc, Confidence C) override { + S.Diag(IssueExpr->getExprLoc(), + C == Confidence::Definite + ? 
diag::warn_lifetime_safety_return_stack_addr_permissive + : diag::warn_lifetime_safety_return_stack_addr_strict) + << IssueExpr->getEndLoc(); + + S.Diag(EscapeExpr->getExprLoc(), diag::note_lifetime_safety_returned_here) + << EscapeExpr->getEndLoc(); + } + private: Sema &S; }; diff --git a/clang/test/CodeGen/Sparc/inline-asm-gcc-regs.c b/clang/test/CodeGen/Sparc/inline-asm-gcc-regs.c new file mode 100644 index 0000000000000..cf83b9afda4b2 --- /dev/null +++ b/clang/test/CodeGen/Sparc/inline-asm-gcc-regs.c @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 -triple sparc-unknown-unknown -emit-llvm %s -o - | FileCheck %s + +// CHECK-LABEL: @icc +// CHECK: call void asm sideeffect "nop", "~{icc}"() +void icc() { + __asm__ __volatile__("nop" ::: "icc"); +} + +// CHECK-LABEL: @fcc +// CHECK: call void asm sideeffect "nop", "~{fcc0},~{fcc1},~{fcc2},~{fcc3}"() +void fcc() { + __asm__ __volatile__("nop" ::: "fcc0", "fcc1", "fcc2", "fcc3"); +} diff --git a/clang/test/CodeGen/sparc-arguments.c b/clang/test/CodeGen/Sparc/sparc-arguments.c similarity index 100% rename from clang/test/CodeGen/sparc-arguments.c rename to clang/test/CodeGen/Sparc/sparc-arguments.c diff --git a/clang/test/CodeGen/sparc-vaarg.c b/clang/test/CodeGen/Sparc/sparc-vaarg.c similarity index 100% rename from clang/test/CodeGen/sparc-vaarg.c rename to clang/test/CodeGen/Sparc/sparc-vaarg.c diff --git a/clang/test/CodeGen/sparcv8-abi.c b/clang/test/CodeGen/Sparc/sparcv8-abi.c similarity index 100% rename from clang/test/CodeGen/sparcv8-abi.c rename to clang/test/CodeGen/Sparc/sparcv8-abi.c diff --git a/clang/test/CodeGen/sparcv8-inline-asm.c b/clang/test/CodeGen/Sparc/sparcv8-inline-asm.c similarity index 100% rename from clang/test/CodeGen/sparcv8-inline-asm.c rename to clang/test/CodeGen/Sparc/sparcv8-inline-asm.c diff --git a/clang/test/CodeGen/sparcv9-abi.c b/clang/test/CodeGen/Sparc/sparcv9-abi.c similarity index 100% rename from clang/test/CodeGen/sparcv9-abi.c rename to clang/test/CodeGen/Sparc/sparcv9-abi.c diff 
--git a/clang/test/CodeGen/sparcv9-class-return.cpp b/clang/test/CodeGen/Sparc/sparcv9-class-return.cpp similarity index 100% rename from clang/test/CodeGen/sparcv9-class-return.cpp rename to clang/test/CodeGen/Sparc/sparcv9-class-return.cpp diff --git a/clang/test/CodeGen/sparcv9-dwarf.c b/clang/test/CodeGen/Sparc/sparcv9-dwarf.c similarity index 100% rename from clang/test/CodeGen/sparcv9-dwarf.c rename to clang/test/CodeGen/Sparc/sparcv9-dwarf.c diff --git a/clang/test/CodeGen/sparcv9-inline-asm.c b/clang/test/CodeGen/Sparc/sparcv9-inline-asm.c similarity index 100% rename from clang/test/CodeGen/sparcv9-inline-asm.c rename to clang/test/CodeGen/Sparc/sparcv9-inline-asm.c diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c index 46bc28b85d8db..f8931e7e55410 100644 --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -1400,18 +1400,21 @@ __m128d test_mm_permute_pd(__m128d A) { // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> return _mm_permute_pd(A, 1); } +TEST_CONSTEXPR(match_m128d(_mm_permute_pd(((__m128d){1.0, 2.0}), 1), 2.0, 1.0)); __m256d test_mm256_permute_pd(__m256d A) { // CHECK-LABEL: test_mm256_permute_pd // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <4 x i32> return _mm256_permute_pd(A, 5); } +TEST_CONSTEXPR(match_m256d(_mm256_permute_pd(((__m256d){1.0f, 2.0f, 3.0f, 4.0f}), 5), 2.0f, 1.0f, 4.0f, 3.0f)); __m128 test_mm_permute_ps(__m128 A) { // CHECK-LABEL: test_mm_permute_ps // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> return _mm_permute_ps(A, 0x1b); } +TEST_CONSTEXPR(match_m128(_mm_permute_ps(((__m128){1.0, 2.0, 3.0, 4.0}), 0x1b), 4.0, 3.0, 2.0, 1.0)); // Test case for PR12401 __m128 test2_mm_permute_ps(__m128 a) { @@ -1419,12 +1422,14 @@ __m128 test2_mm_permute_ps(__m128 a) { // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> return _mm_permute_ps(a, 0xe6); } 
+TEST_CONSTEXPR(match_m128(_mm_permute_ps(((__m128){1.0, 2.0, 3.0, 4.0}), 0xe6), 3.0, 2.0, 3.0, 4.0)); __m256 test_mm256_permute_ps(__m256 A) { // CHECK-LABEL: test_mm256_permute_ps // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <8 x i32> return _mm256_permute_ps(A, 0x1b); } +TEST_CONSTEXPR(match_m256(_mm256_permute_ps(((__m256){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}), 0x1b), 4.0, 3.0, 2.0, 1.0, 8.0, 7.0, 6.0, 5.0)); __m256d test_mm256_permute2f128_pd(__m256d A, __m256d B) { // CHECK-LABEL: test_mm256_permute2f128_pd diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index eb25aa538e9a3..e4a9d9cb3781d 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -5516,6 +5516,7 @@ __m512d test_mm512_permute_pd(__m512d __X) { // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <8 x i32> return _mm512_permute_pd(__X, 2); } +TEST_CONSTEXPR(match_m512d(_mm512_permute_pd(((__m512d){0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}), 2), 0.0, 1.0, 2.0, 2.0, 4.0, 4.0, 6.0, 6.0)); __m512d test_mm512_mask_permute_pd(__m512d __W, __mmask8 __U, __m512d __X) { // CHECK-LABEL: test_mm512_mask_permute_pd @@ -5523,6 +5524,13 @@ __m512d test_mm512_mask_permute_pd(__m512d __W, __mmask8 __U, __m512d __X) { // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_permute_pd(__W, __U, __X, 2); } +TEST_CONSTEXPR(match_m512d(_mm512_mask_permute_pd( + ((__m512d){0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}), + (__mmask8)0b01010100, + ((__m512d){8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}), + 2), + 0.0, 1.0, 10.0, 3.0, 12.0, 5.0, 14.0, 7.0 +)); __m512d test_mm512_maskz_permute_pd(__mmask8 __U, __m512d __X) { // CHECK-LABEL: test_mm512_maskz_permute_pd @@ -5530,12 +5538,23 @@ __m512d test_mm512_maskz_permute_pd(__mmask8 __U, __m512d __X) { // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return 
_mm512_maskz_permute_pd(__U, __X, 2); } +TEST_CONSTEXPR(match_m512d(_mm512_maskz_permute_pd( + (__mmask8)0b01010100, + ((__m512d){0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}), + 2), + 0.0, 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0 +)); __m512 test_mm512_permute_ps(__m512 __X) { // CHECK-LABEL: test_mm512_permute_ps // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <16 x i32> return _mm512_permute_ps(__X, 2); } +TEST_CONSTEXPR(match_m512(_mm512_permute_ps( + ((__m512){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}), + 2), + 2, 0, 0, 0, 6, 4, 4, 4, 10, 8, 8, 8, 14, 12, 12, 12 +)); __m512 test_mm512_mask_permute_ps(__m512 __W, __mmask16 __U, __m512 __X) { // CHECK-LABEL: test_mm512_mask_permute_ps @@ -5543,6 +5562,13 @@ __m512 test_mm512_mask_permute_ps(__m512 __W, __mmask16 __U, __m512 __X) { // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_permute_ps(__W, __U, __X, 2); } +TEST_CONSTEXPR(match_m512(_mm512_mask_permute_ps( + ((__m512){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}), + (__mmask16)0b1010101010101010, + ((__m512){16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}), + 2), + 0, 16, 2, 16, 4, 20, 6, 20, 8, 24, 10, 24, 12, 28, 14, 28 +)); __m512 test_mm512_maskz_permute_ps(__mmask16 __U, __m512 __X) { // CHECK-LABEL: test_mm512_maskz_permute_ps @@ -5550,6 +5576,12 @@ __m512 test_mm512_maskz_permute_ps(__mmask16 __U, __m512 __X) { // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_permute_ps(__U, __X, 2); } +TEST_CONSTEXPR(match_m512(_mm512_maskz_permute_ps( + (__mmask16)0b1010101010101010, + ((__m512){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}), + 2), + 0, 0, 0, 0, 0, 4, 0, 4, 0, 8, 0, 8, 0, 12, 0, 12 +)); __m512d test_mm512_permutevar_pd(__m512d __A, __m512i __C) { // CHECK-LABEL: test_mm512_permutevar_pd diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 
e05b1ddf7b69a..69adc75c80f1c 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -8022,6 +8022,13 @@ __m128d test_mm_mask_permute_pd(__m128d __W, __mmask8 __U, __m128d __X) { // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_permute_pd(__W, __U, __X, 1); } +TEST_CONSTEXPR(match_m128d(_mm_mask_permute_pd( + ((__m128d){0.0, 1.0}), + (__mmask8)0b10, + ((__m128d){2.0, 3.0}), + 1), + 0.0, 2.0 +)); __m128d test_mm_maskz_permute_pd(__mmask8 __U, __m128d __X) { // CHECK-LABEL: test_mm_maskz_permute_pd @@ -8029,6 +8036,12 @@ __m128d test_mm_maskz_permute_pd(__mmask8 __U, __m128d __X) { // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_permute_pd(__U, __X, 1); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_permute_pd( + (__mmask8)0b10, + ((__m128d){1.0, 2.0}), + 1), + 0.0, 1.0 +)); __m256d test_mm256_mask_permute_pd(__m256d __W, __mmask8 __U, __m256d __X) { // CHECK-LABEL: test_mm256_mask_permute_pd @@ -8036,6 +8049,13 @@ __m256d test_mm256_mask_permute_pd(__m256d __W, __mmask8 __U, __m256d __X) { // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_permute_pd(__W, __U, __X, 5); } +TEST_CONSTEXPR(match_m256d(_mm256_mask_permute_pd( + ((__m256d){0.0, 1.0, 2.0, 3.0}), + (__mmask8)0b1010, + ((__m256d){4.0, 5.0, 6.0, 7.0}), + 5), + 0.0, 4.0, 2.0, 6.0 +)); __m256d test_mm256_maskz_permute_pd(__mmask8 __U, __m256d __X) { // CHECK-LABEL: test_mm256_maskz_permute_pd @@ -8043,6 +8063,12 @@ __m256d test_mm256_maskz_permute_pd(__mmask8 __U, __m256d __X) { // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_permute_pd(__U, __X, 5); } +TEST_CONSTEXPR(match_m256d(_mm256_maskz_permute_pd( + (__mmask8)0b1010, + ((__m256d){4.0, 5.0, 6.0, 7.0}), + 5), + 0.0, 4.0, 0.0, 6.0 +)); __m128 test_mm_mask_permute_ps(__m128 __W, __mmask8 __U, __m128 __X) { // CHECK-LABEL: 
test_mm_mask_permute_ps @@ -8050,6 +8076,13 @@ __m128 test_mm_mask_permute_ps(__m128 __W, __mmask8 __U, __m128 __X) { // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_permute_ps(__W, __U, __X, 0x1b); } +TEST_CONSTEXPR(match_m128(_mm_mask_permute_ps( + ((__m128){0.0, 1.0, 2.0, 3.0}), + (__mmask8)0b1010, + ((__m128){4.0, 5.0, 6.0, 7.0}), + 0x1b), + 0, 6.0, 2.0, 4.0 +)); __m128 test_mm_maskz_permute_ps(__mmask8 __U, __m128 __X) { // CHECK-LABEL: test_mm_maskz_permute_ps @@ -8057,6 +8090,13 @@ __m128 test_mm_maskz_permute_ps(__mmask8 __U, __m128 __X) { // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_permute_ps(__U, __X, 0x1b); } +TEST_CONSTEXPR(match_m128(_mm_maskz_permute_ps( + (__mmask8)0b1010, + ((__m128){4.0, 5.0, 6.0, 7.0}), + 0x1b), + 0.0, 6.0, 0.0, 4.0 +)); + __m256 test_mm256_mask_permute_ps(__m256 __W, __mmask8 __U, __m256 __X) { // CHECK-LABEL: test_mm256_mask_permute_ps @@ -8064,6 +8104,13 @@ __m256 test_mm256_mask_permute_ps(__m256 __W, __mmask8 __U, __m256 __X) { // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_permute_ps(__W, __U, __X, 0x1b); } +TEST_CONSTEXPR(match_m256(_mm256_mask_permute_ps( + ((__m256){0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}), + (__mmask8)0b10101010, + ((__m256){8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}), + 0x1b), + 0.0, 10.0, 2.0, 8.0, 4.0, 14.0, 6.0, 12.0 +)); __m256 test_mm256_maskz_permute_ps(__mmask8 __U, __m256 __X) { // CHECK-LABEL: test_mm256_maskz_permute_ps @@ -8071,6 +8118,12 @@ __m256 test_mm256_maskz_permute_ps(__mmask8 __U, __m256 __X) { // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_permute_ps(__U, __X, 0x1b); } +TEST_CONSTEXPR(match_m256(_mm256_maskz_permute_ps( + (__mmask8)0b10101010, + ((__m256){8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}), + 0x1b), + 0.0, 10.0, 0.0, 8.0, 0.0, 14.0, 0.0, 12.0 +)); __m128d 
test_mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C) { // CHECK-LABEL: test_mm_mask_permutevar_pd diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index c0ed799970122..75f2588f4837b 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -1007,6 +1007,16 @@ __device__ void nvvm_cvt_sm80() { __nvvm_ff2bf16x2_rz(1, 1); // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu(float 1.000000e+00, float 1.000000e+00) __nvvm_ff2bf16x2_rz_relu(1, 1); + #if PTX >= 81 + // CHECK_PTX81_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff2bf16x2_rn_satfinite(1, 1); + // CHECK_PTX81_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff2bf16x2_rn_relu_satfinite(1, 1); + // CHECK_PTX81_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff2bf16x2_rz_satfinite(1, 1); + // CHECK_PTX81_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff2bf16x2_rz_relu_satfinite(1, 1); + #endif // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn(float 1.000000e+00, float 1.000000e+00) __nvvm_ff2f16x2_rn(1, 1); @@ -1016,6 +1026,16 @@ __device__ void nvvm_cvt_sm80() { __nvvm_ff2f16x2_rz(1, 1); // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu(float 1.000000e+00, float 1.000000e+00) __nvvm_ff2f16x2_rz_relu(1, 1); + #if PTX >= 81 + // CHECK_PTX81_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff2f16x2_rn_satfinite(1, 1); + // CHECK_PTX81_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff2f16x2_rn_relu_satfinite(1, 1); + // CHECK_PTX81_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.satfinite(float 1.000000e+00, float 
1.000000e+00) + __nvvm_ff2f16x2_rz_satfinite(1, 1); + // CHECK_PTX81_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff2f16x2_rz_relu_satfinite(1, 1); + #endif // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rn(float 1.000000e+00) __nvvm_f2bf16_rn(1); @@ -1025,6 +1045,35 @@ __device__ void nvvm_cvt_sm80() { __nvvm_f2bf16_rz(1); // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rz.relu(float 1.000000e+00) __nvvm_f2bf16_rz_relu(1); + #if PTX >= 81 + // CHECK_PTX81_SM80: call bfloat @llvm.nvvm.f2bf16.rn.satfinite(float 1.000000e+00) + __nvvm_f2bf16_rn_satfinite(1); + // CHECK_PTX81_SM80: call bfloat @llvm.nvvm.f2bf16.rn.relu.satfinite(float 1.000000e+00) + __nvvm_f2bf16_rn_relu_satfinite(1); + // CHECK_PTX81_SM80: call bfloat @llvm.nvvm.f2bf16.rz.satfinite(float 1.000000e+00) + __nvvm_f2bf16_rz_satfinite(1); + // CHECK_PTX81_SM80: call bfloat @llvm.nvvm.f2bf16.rz.relu.satfinite(float 1.000000e+00) + __nvvm_f2bf16_rz_relu_satfinite(1); + #endif + + // CHECK_PTX70_SM80: call half @llvm.nvvm.f2f16.rn(float 1.000000e+00) + __nvvm_f2f16_rn(1); + // CHECK_PTX70_SM80: call half @llvm.nvvm.f2f16.rn.relu(float 1.000000e+00) + __nvvm_f2f16_rn_relu(1); + // CHECK_PTX70_SM80: call half @llvm.nvvm.f2f16.rz(float 1.000000e+00) + __nvvm_f2f16_rz(1); + // CHECK_PTX70_SM80: call half @llvm.nvvm.f2f16.rz.relu(float 1.000000e+00) + __nvvm_f2f16_rz_relu(1); + #if PTX >= 81 + // CHECK_PTX81_SM80: call half @llvm.nvvm.f2f16.rn.satfinite(float 1.000000e+00) + __nvvm_f2f16_rn_satfinite(1); + // CHECK_PTX81_SM80: call half @llvm.nvvm.f2f16.rn.relu.satfinite(float 1.000000e+00) + __nvvm_f2f16_rn_relu_satfinite(1); + // CHECK_PTX81_SM80: call half @llvm.nvvm.f2f16.rz.satfinite(float 1.000000e+00) + __nvvm_f2f16_rz_satfinite(1); + // CHECK_PTX81_SM80: call half @llvm.nvvm.f2f16.rz.relu.satfinite(float 1.000000e+00) + __nvvm_f2f16_rz_relu_satfinite(1); + #endif // CHECK_PTX70_SM80: call i32 @llvm.nvvm.f2tf32.rna(float 1.000000e+00) 
__nvvm_f2tf32_rna(1); diff --git a/clang/test/Sema/warn-lifetime-safety-dataflow.cpp b/clang/test/Sema/warn-lifetime-safety-dataflow.cpp index e9515b5d61006..11d3b836db3e7 100644 --- a/clang/test/Sema/warn-lifetime-safety-dataflow.cpp +++ b/clang/test/Sema/warn-lifetime-safety-dataflow.cpp @@ -18,8 +18,8 @@ MyObj* return_local_addr() { return p; // CHECK: Use ([[O_P]] (Decl: p), Read) // CHECK: OriginFlow (Dest: [[O_RET_VAL:[0-9]+]] (Expr: ImplicitCastExpr), Src: [[O_P]] (Decl: p)) -// CHECK: ReturnOfOrigin ([[O_RET_VAL]] (Expr: ImplicitCastExpr)) // CHECK: Expire ([[L_X]] (Path: x)) +// CHECK: OriginEscapes ([[O_RET_VAL]] (Expr: ImplicitCastExpr)) } @@ -49,8 +49,8 @@ MyObj* assign_and_return_local_addr() { return ptr2; // CHECK: Use ([[O_PTR2]] (Decl: ptr2), Read) // CHECK: OriginFlow (Dest: [[O_RET_VAL:[0-9]+]] (Expr: ImplicitCastExpr), Src: [[O_PTR2]] (Decl: ptr2)) -// CHECK: ReturnOfOrigin ([[O_RET_VAL]] (Expr: ImplicitCastExpr)) // CHECK: Expire ([[L_Y]] (Path: y)) +// CHECK: OriginEscapes ([[O_RET_VAL]] (Expr: ImplicitCastExpr)) } // Return of Non-Pointer Type diff --git a/clang/test/Sema/warn-lifetime-safety.cpp b/clang/test/Sema/warn-lifetime-safety.cpp index b9368db550805..2803e73b5aee2 100644 --- a/clang/test/Sema/warn-lifetime-safety.cpp +++ b/clang/test/Sema/warn-lifetime-safety.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -fexperimental-lifetime-safety -Wexperimental-lifetime-safety -verify %s +// RUN: %clang_cc1 -fsyntax-only -fexperimental-lifetime-safety -Wexperimental-lifetime-safety -Wno-dangling -verify %s struct MyObj { int id; @@ -396,6 +396,131 @@ void loan_from_previous_iteration(MyObj safe, bool condition) { } // expected-note {{destroyed here}} } +//===----------------------------------------------------------------------===// +// Basic Definite Use-After-Return (Return-Stack-Address) (-W...permissive) +// These are cases where the pointer is guaranteed to be dangling at the use site. 
+//===----------------------------------------------------------------------===// + +MyObj* simple_return_stack_address() { + MyObj s; + MyObj* p = &s; // expected-warning {{address of stack memory is returned later}} + return p; // expected-note {{returned here}} +} + +MyObj* direct_return() { + MyObj s; + return &s; // expected-warning {{address of stack memory is returned later}} + // expected-note@-1 {{returned here}} +} + +const MyObj* conditional_assign_unconditional_return(const MyObj& safe, bool c) { + MyObj s; + const MyObj* p = &safe; + if (c) { + p = &s; // expected-warning {{address of stack memory is returned later}} + } + return p; // expected-note {{returned here}} +} + +View conditional_assign_both_branches(const MyObj& safe, bool c) { + MyObj s; + View p; + if (c) { + p = s; // expected-warning {{address of stack memory is returned later}} + } + else { + p = safe; + } + return p; // expected-note {{returned here}} + +} + +View reassign_safe_to_local(const MyObj& safe) { + MyObj local; + View p = safe; + p = local; // expected-warning {{address of stack memory is returned later}} + return p; // expected-note {{returned here}} +} + +View pointer_chain_to_local() { + MyObj local; + View p1 = local; // expected-warning {{address of stack memory is returned later}} + View p2 = p1; + return p2; // expected-note {{returned here}} +} + +View multiple_assign_multiple_return(const MyObj& safe, bool c1, bool c2) { + MyObj local1; + MyObj local2; + View p; + if (c1) { + p = local1; // expected-warning {{address of stack memory is returned later}} + return p; // expected-note {{returned here}} + } + else if (c2) { + p = local2; // expected-warning {{address of stack memory is returned later}} + return p; // expected-note {{returned here}} + } + p = safe; + return p; +} + +View multiple_assign_single_return(const MyObj& safe, bool c1, bool c2) { + MyObj local1; + MyObj local2; + View p; + if (c1) { + p = local1; // expected-warning {{address of stack memory is 
returned later}} + } + else if (c2) { + p = local2; // expected-warning {{address of stack memory is returned later}} + } + else { + p = safe; + } + return p; // expected-note 2 {{returned here}} +} + +View direct_return_of_local() { + MyObj stack; + return stack; // expected-warning {{address of stack memory is returned later}} + // expected-note@-1 {{returned here}} +} + +MyObj& reference_return_of_local() { + MyObj stack; + return stack; // expected-warning {{address of stack memory is returned later}} + // expected-note@-1 {{returned here}} +} + +//===----------------------------------------------------------------------===// +// Use-After-Scope & Use-After-Return (Return-Stack-Address) Combined +// These are cases where the diagnostic kind is determined by location +//===----------------------------------------------------------------------===// + +MyObj* uaf_before_uar() { + MyObj* p; + { + MyObj local_obj; + p = &local_obj; // expected-warning {{object whose reference is captured does not live long enough}} + } // expected-note {{destroyed here}} + return p; // expected-note {{later used here}} +} + +View uar_before_uaf(const MyObj& safe, bool c) { + View p; + { + MyObj local_obj; + p = local_obj; // expected-warning {{address of stack memory is returned later}} + if (c) { + return p; // expected-note {{returned here}} + } + } + p.use(); + p = safe; + return p; +} + //===----------------------------------------------------------------------===// // No-Error Cases //===----------------------------------------------------------------------===// @@ -434,12 +559,19 @@ void no_error_loan_from_current_iteration(bool cond) { } } +View safe_return(const MyObj& safe) { + MyObj local; + View p = local; + p = safe; // p has been reassigned + return p; // This is safe +} //===----------------------------------------------------------------------===// // Lifetimebound Attribute Tests //===----------------------------------------------------------------------===// View 
Identity(View v [[clang::lifetimebound]]); +const MyObj& IdentityRef(const MyObj& obj [[clang::lifetimebound]]); MyObj* Identity(MyObj* v [[clang::lifetimebound]]); View Choose(bool cond, View a [[clang::lifetimebound]], View b [[clang::lifetimebound]]); MyObj* GetPointer(const MyObj& obj [[clang::lifetimebound]]); @@ -584,6 +716,28 @@ void lifetimebound_ctor() { (void)v; } +View lifetimebound_return_of_local() { + MyObj stack; + return Identity(stack); // expected-warning {{address of stack memory is returned later}} + // expected-note@-1 {{returned here}} +} + +const MyObj& lifetimebound_return_ref_to_local() { + MyObj stack; + return IdentityRef(stack); // expected-warning {{address of stack memory is returned later}} + // expected-note@-1 {{returned here}} +} + +// FIXME: Fails to diagnose UAR when a reference to a by-value param escapes via the return value. +View lifetimebound_return_of_by_value_param(MyObj stack_param) { + return Identity(stack_param); +} + +// FIXME: Fails to diagnose UAF when a reference to a by-value param escapes via an out-param. +void uaf_from_by_value_param_failing(MyObj param, View* out_p) { + *out_p = Identity(param); +} + // Conditional operator. 
void conditional_operator_one_unsafe_branch(bool cond) { MyObj safe; diff --git a/clang/test/SemaCXX/alloc-token.cpp b/clang/test/SemaCXX/alloc-token.cpp index be7acb7d42ef2..518ad7d94eb96 100644 --- a/clang/test/SemaCXX/alloc-token.cpp +++ b/clang/test/SemaCXX/alloc-token.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++23 -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++23 -fsyntax-only -verify %s -falloc-token-max=0 // RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++23 -fsyntax-only -verify %s -fexperimental-new-constant-interpreter // RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++23 -fsyntax-only -verify %s -falloc-token-mode=typehash -DMODE_TYPEHASH // RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++23 -fsyntax-only -verify %s -falloc-token-max=2 -DTOKEN_MAX=2 diff --git a/clang/unittests/Analysis/LifetimeSafetyTest.cpp b/clang/unittests/Analysis/LifetimeSafetyTest.cpp index 601308c53f9a9..558a22af72572 100644 --- a/clang/unittests/Analysis/LifetimeSafetyTest.cpp +++ b/clang/unittests/Analysis/LifetimeSafetyTest.cpp @@ -20,6 +20,7 @@ namespace clang::lifetimes::internal { namespace { using namespace ast_matchers; +using ::testing::Not; using ::testing::SizeIs; using ::testing::UnorderedElementsAreArray; @@ -122,6 +123,40 @@ class LifetimeTestHelper { return LID; } + // Gets the set of loans that are live at the given program point. A loan is + // considered live at point P if there is a live origin which contains this + // loan. 
+ std::optional getLiveLoansAtPoint(ProgramPoint P) const { + const auto &LiveOriginsAnalysis = Runner.getAnalysis().getLiveOrigins(); + const auto &LoanPropagation = Runner.getAnalysis().getLoanPropagation(); + + LivenessMap LiveOriginsMap = LiveOriginsAnalysis.getLiveOriginsAt(P); + + LoanSet::Factory F; + LoanSet Result = F.getEmptySet(); + + for (const auto &[OID, LI] : LiveOriginsMap) { + LoanSet Loans = LoanPropagation.getLoans(OID, P); + Result = clang::lifetimes::internal::utils::join(Result, Loans, F); + } + + if (Result.isEmpty()) + return std::nullopt; + + return Result; + } + + const ExpireFact * + getExpireFactFromAllFacts(const llvm::ArrayRef &FactsInBlock, + const LoanID &loanID) { + for (const Fact *F : FactsInBlock) { + if (auto const *CurrentEF = F->getAs()) + if (CurrentEF->getLoanID() == loanID) + return CurrentEF; + } + return nullptr; + } + std::optional getLoansAtPoint(OriginID OID, llvm::StringRef Annotation) { ProgramPoint PP = Runner.getProgramPoint(Annotation); @@ -141,6 +176,14 @@ class LifetimeTestHelper { return Result; } + ProgramPoint getProgramPoint(llvm::StringRef Annotation) { + return Runner.getProgramPoint(Annotation); + } + + llvm::ArrayRef getBlockContaining(ProgramPoint P) { + return Runner.getAnalysis().getFactManager().getBlockContaining(P); + } + private: template DeclT *findDecl(llvm::StringRef Name) { auto &Ctx = Runner.getASTContext(); @@ -304,6 +347,43 @@ MATCHER_P2(AreLiveAtImpl, Annotation, ConfFilter, "") { return true; } +MATCHER_P2(HasLiveLoanAtExpiryImpl, HelperPtr, Annotation, "") { + llvm::StringRef VarName = arg; + LifetimeTestHelper &Helper = *HelperPtr; + + std::vector Loans = Helper.getLoansForVar(VarName); + if (Loans.empty()) { + *result_listener << "No loans found for variable" << VarName.str(); + return false; + } + + ProgramPoint PP = Helper.getProgramPoint(Annotation); + llvm::ArrayRef AllFactsInBlock = Helper.getBlockContaining(PP); + + bool NoExpireFactLive = false; + for (const LoanID CurrentLoanID 
: Loans) { + const ExpireFact *EF = + Helper.getExpireFactFromAllFacts(AllFactsInBlock, CurrentLoanID); + if (!EF) { + NoExpireFactLive = true; + continue; + } + std::optional LiveLoans = Helper.getLiveLoansAtPoint(EF); + if (!LiveLoans.has_value()) { + *result_listener << "No Live Loans At Expiry Location."; + continue; + } + if (LiveLoans->contains({CurrentLoanID})) + return true; + } + if (NoExpireFactLive) { + *result_listener << "No Expire Fact for loan of " << VarName.str(); + return false; + } + *result_listener << "No loans of " << VarName.str() << " are live"; + return false; +} + MATCHER_P(MustBeLiveAt, Annotation, "") { return ExplainMatchResult(AreLiveAtImpl(Annotation, LivenessKindFilter::Must), arg, result_listener); @@ -353,6 +433,10 @@ class LifetimeAnalysisTest : public ::testing::Test { return HasLoansToImpl(std::vector(LoanVars), Annotation); } + auto HasLiveLoanAtExpiry(const char *Annotation) { + return HasLiveLoanAtExpiryImpl(Helper.get(), Annotation); + } + std::unique_ptr Runner; std::unique_ptr Helper; }; @@ -1223,5 +1307,204 @@ TEST_F(LifetimeAnalysisTest, LivenessOutsideLoop) { EXPECT_THAT(Origins({"p"}), MaybeLiveAt("p1")); } +TEST_F(LifetimeAnalysisTest, SimpleReturnStackAddress) { + SetupTest(R"( + MyObj* target() { + MyObj s; + MyObj* p = &s; + POINT(p1); + return p; + } + )"); + EXPECT_THAT("s", HasLiveLoanAtExpiry("p1")); +} + +TEST_F(LifetimeAnalysisTest, DirectReturn) { + SetupTest(R"( + MyObj* target() { + MyObj s; + POINT(P); + return &s; + } + )"); + EXPECT_THAT("s", HasLiveLoanAtExpiry("P")); +} + +TEST_F(LifetimeAnalysisTest, ConditionalAssignUnconditionalReturn) { + SetupTest(R"( + MyObj* target(bool c) { + MyObj s1; + MyObj* p = nullptr; + if (c) { + p = &s1; + } + POINT(P); + return p; + } + )"); + EXPECT_THAT("s1", HasLiveLoanAtExpiry("P")); +} + +TEST_F(LifetimeAnalysisTest, MultipleAssignments) { + SetupTest(R"( + MyObj* target() { + MyObj s; + MyObj* p1 = &s; + MyObj* p2 = &s; + POINT(P); + return p2; + } + )"); + // 
Test if atleast one loan to "s" is live; + EXPECT_THAT("s", HasLiveLoanAtExpiry("P")); +} + +TEST_F(LifetimeAnalysisTest, ConditionalAssignBothBranches) { + SetupTest(R"( + MyObj* target(bool c) { + MyObj s1; + static MyObj s2; + MyObj* p = nullptr; + if (c) { + p = &s1; + } else { + p = &s2; + } + POINT(P); + return p; + } + )"); + EXPECT_THAT("s1", HasLiveLoanAtExpiry("P")); +} + +TEST_F(LifetimeAnalysisTest, ReassignFromSafeToLocalThenReturn) { + SetupTest(R"( + MyObj* target() { + static MyObj safe_obj; + MyObj local_obj; + MyObj* p = &safe_obj; + + p = &local_obj; + POINT(P); + return p; + } + )"); + EXPECT_THAT("local_obj", HasLiveLoanAtExpiry("P")); +} + +TEST_F(LifetimeAnalysisTest, PointerChainToLocal) { + SetupTest(R"( + MyObj* target() { + MyObj local_obj; + MyObj* p1 = &local_obj; + MyObj* p2 = p1; + POINT(P); + return p2; + } + )"); + EXPECT_THAT("local_obj", HasLiveLoanAtExpiry("P")); +} + +TEST_F(LifetimeAnalysisTest, MultipleAssignmentMultipleReturn) { + SetupTest(R"( + MyObj* target(bool c1, bool c2) { + static MyObj global_obj; + MyObj local_obj1; + MyObj local_obj2; + MyObj* p = nullptr; + if(c1){ + p = &local_obj1; + POINT(C1); + return p; + } + else if(c2){ + p = &local_obj2; + POINT(C2); + return p; + } + p = &global_obj; + POINT(C3); + return p; + } + )"); + + EXPECT_THAT("local_obj1", HasLiveLoanAtExpiry("C1")); + EXPECT_THAT("local_obj2", HasLiveLoanAtExpiry("C2")); + + EXPECT_THAT("local_obj1", Not(HasLiveLoanAtExpiry("C3"))); + EXPECT_THAT("local_obj2", Not(HasLiveLoanAtExpiry("C3"))); +} + +TEST_F(LifetimeAnalysisTest, MultipleAssignmentsSingleReturn) { + SetupTest(R"( + MyObj* target(bool c1, bool c2) { + static MyObj global_obj; + MyObj local_obj1; + MyObj local_obj2; + MyObj* p = nullptr; + if(c1){ + p = &local_obj1; + } + else if(c2){ + p = &local_obj2; + } + else{ + p = &global_obj; + } + POINT(P); + return p; + } + )"); + EXPECT_THAT("local_obj1", HasLiveLoanAtExpiry("P")); + EXPECT_THAT("local_obj2", HasLiveLoanAtExpiry("P")); +} 
+ +TEST_F(LifetimeAnalysisTest, UseAfterScopeThenReturn) { + SetupTest(R"( + MyObj* target() { + MyObj* p; + { + MyObj local_obj; + p = &local_obj; + POINT(p1); + } + POINT(p2); + return p; + } + )"); + EXPECT_THAT(Origin("p"), HasLoansTo({"local_obj"}, "p2")); + EXPECT_THAT(Origins({"p"}), MustBeLiveAt("p2")); + + EXPECT_THAT(Origin("p"), HasLoansTo({"local_obj"}, "p1")); + EXPECT_THAT(Origins({"p"}), MustBeLiveAt("p1")); + + EXPECT_THAT("local_obj", HasLiveLoanAtExpiry("p2")); +} + +TEST_F(LifetimeAnalysisTest, ReturnBeforeUseAfterScope) { + SetupTest(R"( + MyObj* target(bool c) { + MyObj* p; + static MyObj global_obj; + { + MyObj local_obj; + p = &local_obj; + if(c){ + POINT(p1); + return p; + } + } + POINT(p2); + return &global_obj; + } + )"); + EXPECT_THAT("local_obj", HasLiveLoanAtExpiry("p1")); + + EXPECT_THAT(NoOrigins(), AreLiveAt("p2")); + + EXPECT_THAT(Origin("p"), HasLoansTo({"local_obj"}, "p1")); + EXPECT_THAT(Origins({"p"}), MustBeLiveAt("p1")); +} + } // anonymous namespace } // namespace clang::lifetimes::internal diff --git a/compiler-rt/cmake/base-config-ix.cmake b/compiler-rt/cmake/base-config-ix.cmake index d92bc0e71fa1a..37dfa5534dfef 100644 --- a/compiler-rt/cmake/base-config-ix.cmake +++ b/compiler-rt/cmake/base-config-ix.cmake @@ -59,9 +59,9 @@ if (LLVM_TREE_AVAILABLE) set(_host_executable_suffix ${CMAKE_EXECUTABLE_SUFFIX}) endif() set(COMPILER_RT_TEST_COMPILER - ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang${_host_executable_suffix}) + ${LLVM_TOOLS_BINARY_DIR}/clang${_host_executable_suffix}) set(COMPILER_RT_TEST_CXX_COMPILER - ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++${_host_executable_suffix}) + ${LLVM_TOOLS_BINARY_DIR}/clang++${_host_executable_suffix}) else() # Take output dir and install path from the user. 
set(COMPILER_RT_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR} CACHE PATH diff --git a/compiler-rt/lib/sanitizer_common/symbolizer/CMakeLists.txt b/compiler-rt/lib/sanitizer_common/symbolizer/CMakeLists.txt index ab37454acbe19..bb8298931c390 100644 --- a/compiler-rt/lib/sanitizer_common/symbolizer/CMakeLists.txt +++ b/compiler-rt/lib/sanitizer_common/symbolizer/CMakeLists.txt @@ -13,7 +13,7 @@ foreach(arch ${SANITIZER_COMMON_SUPPORTED_ARCH}) scripts/build_symbolizer.sh WORKING_DIRECTORY ${RTSanitizerCommonSymbolizerInternalDir} COMMAND FLAGS=${TARGET_CFLAGS} - CLANG=${LLVM_RUNTIME_OUTPUT_INTDIR}/clang + CLANG=${LLVM_TOOLS_BINARY_DIR}/clang ${CMAKE_CURRENT_SOURCE_DIR}/scripts/build_symbolizer.sh ${CMAKE_CURRENT_BINARY_DIR}/RTSanitizerCommonSymbolizerInternal.${arch}.o USES_TERMINAL) diff --git a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h index 7afe97aac57e8..bf87654979cc9 100644 --- a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h +++ b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h @@ -16,7 +16,9 @@ #include "mlir/Dialect/OpenACC/OpenACC.h" namespace fir { +class AddrOfOp; class DeclareOp; +class GlobalOp; } // namespace fir namespace hlfir { @@ -53,6 +55,18 @@ struct PartialEntityAccessModel bool isCompleteView(mlir::Operation *op) const; }; +struct AddressOfGlobalModel + : public mlir::acc::AddressOfGlobalOpInterface::ExternalModel< + AddressOfGlobalModel, fir::AddrOfOp> { + mlir::SymbolRefAttr getSymbol(mlir::Operation *op) const; +}; + +struct GlobalVariableModel + : public mlir::acc::GlobalVariableOpInterface::ExternalModel< + GlobalVariableModel, fir::GlobalOp> { + bool isConstant(mlir::Operation *op) const; +}; + } // namespace fir::acc #endif // FLANG_OPTIMIZER_OPENACC_FIROPENACC_OPS_INTERFACES_H_ diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp 
b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp index c1734be5185f4..11fbaf2dc2bb8 100644 --- a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp +++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp @@ -59,4 +59,13 @@ bool PartialEntityAccessModel::isCompleteView( return !getBaseEntity(op); } +mlir::SymbolRefAttr AddressOfGlobalModel::getSymbol(mlir::Operation *op) const { + return mlir::cast(op).getSymbolAttr(); +} + +bool GlobalVariableModel::isConstant(mlir::Operation *op) const { + auto globalOp = mlir::cast(op); + return globalOp.getConstant().has_value(); +} + } // namespace fir::acc diff --git a/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp b/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp index d71c40dfac03c..5c7f9985d41ca 100644 --- a/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp +++ b/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp @@ -49,6 +49,9 @@ void registerOpenACCExtensions(mlir::DialectRegistry ®istry) { PartialEntityAccessModel>(*ctx); fir::DeclareOp::attachInterface>( *ctx); + + fir::AddrOfOp::attachInterface(*ctx); + fir::GlobalOp::attachInterface(*ctx); }); // Register HLFIR operation interfaces diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp index 02328042b92b3..ed4ba26ed8734 100644 --- a/libc/test/src/stdio/fileop_test.cpp +++ b/libc/test/src/stdio/fileop_test.cpp @@ -101,7 +101,7 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) { // This is not a readable file. ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file), - returns(EQ(0)).with_errno(NE(0))); + returns(EQ(size_t(0))).with_errno(NE(0))); ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); @@ -175,7 +175,7 @@ TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { // Trying to read more should fetch nothing. 
ASSERT_THAT( LIBC_NAMESPACE::fread(read_data, sizeof(MyStruct), WRITE_NMEMB, file), - returns(EQ(0)).with_errno(EQ(0))); + returns(EQ(size_t(0))).with_errno(EQ(0))); EXPECT_NE(LIBC_NAMESPACE::feof(file), 0); EXPECT_EQ(LIBC_NAMESPACE::ferror(file), 0); ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); diff --git a/libclc/opencl/include/clc/opencl/atomic/atom_decl_int32.inc b/libclc/opencl/include/clc/opencl/atomic/atom_decl_int32.inc index 866d8903db816..8a0ec9481c595 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atom_decl_int32.inc +++ b/libclc/opencl/include/clc/opencl/atomic/atom_decl_int32.inc @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -#include -#include - #define __CLC_DECLARE_ATOM(ADDRSPACE, TYPE) \ _CLC_OVERLOAD _CLC_DECL TYPE __CLC_FUNCTION(volatile ADDRSPACE TYPE *, TYPE); diff --git a/libclc/opencl/include/clc/opencl/atomic/atom_decl_int64.inc b/libclc/opencl/include/clc/opencl/atomic/atom_decl_int64.inc index 146de3412fc2e..45489df0609ca 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atom_decl_int64.inc +++ b/libclc/opencl/include/clc/opencl/atomic/atom_decl_int64.inc @@ -6,9 +6,6 @@ // //===----------------------------------------------------------------------===// -#include -#include - #define __CLC_DECLARE_ATOM(ADDRSPACE, TYPE) \ _CLC_OVERLOAD _CLC_DECL TYPE __CLC_FUNCTION(volatile ADDRSPACE TYPE *, TYPE); diff --git a/libclc/opencl/lib/amdgcn/SOURCES b/libclc/opencl/lib/amdgcn/SOURCES index 213f62cc73a74..0522e13f5d3db 100644 --- a/libclc/opencl/lib/amdgcn/SOURCES +++ b/libclc/opencl/lib/amdgcn/SOURCES @@ -1,4 +1,3 @@ -cl_khr_int64_extended_atomics/minmax_helpers.ll mem_fence/fence.cl synchronization/barrier.cl workitem/get_global_offset.cl diff --git a/libclc/opencl/lib/amdgcn/cl_khr_int64_extended_atomics/minmax_helpers.ll b/libclc/opencl/lib/amdgcn/cl_khr_int64_extended_atomics/minmax_helpers.ll deleted file mode 100644 index 3ed5e99be3149..0000000000000 --- 
a/libclc/opencl/lib/amdgcn/cl_khr_int64_extended_atomics/minmax_helpers.ll +++ /dev/null @@ -1,55 +0,0 @@ -;;===----------------------------------------------------------------------===;; -; -; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -; See https://llvm.org/LICENSE.txt for license information. -; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -; -;;===----------------------------------------------------------------------===;; - -define i64 @__clc__sync_fetch_and_min_global_8(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - %0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %value seq_cst - ret i64 %0 -} - -define i64 @__clc__sync_fetch_and_umin_global_8(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - %0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %value seq_cst - ret i64 %0 -} - -define i64 @__clc__sync_fetch_and_min_local_8(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - %0 = atomicrmw volatile min i64 addrspace(3)* %ptr, i64 %value seq_cst - ret i64 %0 -} - -define i64 @__clc__sync_fetch_and_umin_local_8(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - %0 = atomicrmw volatile umin i64 addrspace(3)* %ptr, i64 %value seq_cst - ret i64 %0 -} - -define i64 @__clc__sync_fetch_and_max_global_8(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - %0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %value seq_cst - ret i64 %0 -} - -define i64 @__clc__sync_fetch_and_umax_global_8(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - %0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %value seq_cst - ret i64 %0 -} - -define i64 @__clc__sync_fetch_and_max_local_8(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - %0 = atomicrmw volatile max i64 addrspace(3)* %ptr, i64 %value seq_cst - ret i64 %0 -} 
- -define i64 @__clc__sync_fetch_and_umax_local_8(i64 addrspace(3)* nocapture %ptr, i64 %value) nounwind alwaysinline { -entry: - %0 = atomicrmw volatile umax i64 addrspace(3)* %ptr, i64 %value seq_cst - ret i64 %0 -} diff --git a/libclc/opencl/lib/generic/atomic/atom_add.cl b/libclc/opencl/lib/generic/atomic/atom_add.cl index 08fb3fecd5bc9..368bbb790fd88 100644 --- a/libclc/opencl/lib/generic/atomic/atom_add.cl +++ b/libclc/opencl/lib/generic/atomic/atom_add.cl @@ -6,32 +6,35 @@ // //===----------------------------------------------------------------------===// +#include #include -#include + +// Non-volatile overloads are for backward compatibility with OpenCL 1.0. + +#define __CLC_IMPL(AS, TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE atom_add(volatile AS TYPE *p, TYPE val) { \ + return __clc_atomic_fetch_add(p, val, __ATOMIC_RELAXED, \ + __MEMORY_SCOPE_DEVICE); \ + } \ + _CLC_OVERLOAD _CLC_DEF TYPE atom_add(AS TYPE *p, TYPE val) { \ + return atom_add((volatile AS TYPE *)p, val); \ + } #ifdef cl_khr_global_int32_base_atomics -#define __CLC_ATOMIC_OP add -#define __CLC_ATOMIC_ADDRESS_SPACE global -#include "atom_int32_binary.inc" +__CLC_IMPL(global, int) +__CLC_IMPL(global, unsigned int) #endif // cl_khr_global_int32_base_atomics #ifdef cl_khr_local_int32_base_atomics -#define __CLC_ATOMIC_OP add -#define __CLC_ATOMIC_ADDRESS_SPACE local -#include "atom_int32_binary.inc" +__CLC_IMPL(local, int) +__CLC_IMPL(local, unsigned int) #endif // cl_khr_local_int32_base_atomics #ifdef cl_khr_int64_base_atomics -#define __CLC_IMPL(AS, TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE atom_add(volatile AS TYPE *p, TYPE val) { \ - return __sync_fetch_and_add_8(p, val); \ - } - __CLC_IMPL(global, long) __CLC_IMPL(global, unsigned long) __CLC_IMPL(local, long) __CLC_IMPL(local, unsigned long) -#undef __CLC_IMPL #endif // cl_khr_int64_base_atomics diff --git a/libclc/opencl/lib/generic/atomic/atom_and.cl b/libclc/opencl/lib/generic/atomic/atom_and.cl index 1dddd8e72f305..ffcc5bffaafac 100644 --- 
a/libclc/opencl/lib/generic/atomic/atom_and.cl +++ b/libclc/opencl/lib/generic/atomic/atom_and.cl @@ -6,32 +6,35 @@ // //===----------------------------------------------------------------------===// +#include #include -#include + +// Non-volatile overloads are for backward compatibility with OpenCL 1.0. + +#define __CLC_IMPL(AS, TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE atom_and(volatile AS TYPE *p, TYPE val) { \ + return __clc_atomic_fetch_and(p, val, __ATOMIC_RELAXED, \ + __MEMORY_SCOPE_DEVICE); \ + } \ + _CLC_OVERLOAD _CLC_DEF TYPE atom_and(AS TYPE *p, TYPE val) { \ + return atom_and((volatile AS TYPE *)p, val); \ + } #ifdef cl_khr_global_int32_extended_atomics -#define __CLC_ATOMIC_OP and -#define __CLC_ATOMIC_ADDRESS_SPACE global -#include "atom_int32_binary.inc" +__CLC_IMPL(global, int) +__CLC_IMPL(global, unsigned int) #endif // cl_khr_global_int32_extended_atomics #ifdef cl_khr_local_int32_extended_atomics -#define __CLC_ATOMIC_OP and -#define __CLC_ATOMIC_ADDRESS_SPACE local -#include "atom_int32_binary.inc" +__CLC_IMPL(local, int) +__CLC_IMPL(local, unsigned int) #endif // cl_khr_local_int32_extended_atomics #ifdef cl_khr_int64_extended_atomics -#define __CLC_IMPL(AS, TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE atom_and(volatile AS TYPE *p, TYPE val) { \ - return __sync_fetch_and_and_8(p, val); \ - } - __CLC_IMPL(global, long) __CLC_IMPL(global, unsigned long) __CLC_IMPL(local, long) __CLC_IMPL(local, unsigned long) -#undef __CLC_IMPL #endif // cl_khr_int64_extended_atomics diff --git a/libclc/opencl/lib/generic/atomic/atom_cmpxchg.cl b/libclc/opencl/lib/generic/atomic/atom_cmpxchg.cl index 5ae6aa30a8358..2e72ec529c45e 100644 --- a/libclc/opencl/lib/generic/atomic/atom_cmpxchg.cl +++ b/libclc/opencl/lib/generic/atomic/atom_cmpxchg.cl @@ -6,13 +6,20 @@ // //===----------------------------------------------------------------------===// +#include #include -#include + +// Non-volatile overloads are for backward compatibility with OpenCL 1.0. 
#define __CLC_IMPL(AS, TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_cmpxchg(volatile AS TYPE *p, TYPE cmp, \ TYPE val) { \ - return atomic_cmpxchg(p, cmp, val); \ + return __clc_atomic_compare_exchange(p, cmp, val, __ATOMIC_RELAXED, \ + __ATOMIC_RELAXED, \ + __MEMORY_SCOPE_DEVICE); \ + } \ + _CLC_OVERLOAD _CLC_DEF TYPE atom_cmpxchg(AS TYPE *p, TYPE cmp, TYPE val) { \ + return atom_cmpxchg((volatile AS TYPE *)p, cmp, val); \ } #ifdef cl_khr_global_int32_base_atomics @@ -24,20 +31,11 @@ __CLC_IMPL(local, int) __CLC_IMPL(local, unsigned int) #endif // cl_khr_local_int32_base_atomics -#undef __CLC_IMPL - #ifdef cl_khr_int64_base_atomics -#define __CLC_IMPL(AS, TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE atom_cmpxchg(volatile AS TYPE *p, TYPE cmp, \ - TYPE val) { \ - return __sync_val_compare_and_swap_8(p, cmp, val); \ - } - __CLC_IMPL(global, long) __CLC_IMPL(global, unsigned long) __CLC_IMPL(local, long) __CLC_IMPL(local, unsigned long) -#undef __CLC_IMPL #endif // cl_khr_int64_base_atomics diff --git a/libclc/opencl/lib/generic/atomic/atom_dec.cl b/libclc/opencl/lib/generic/atomic/atom_dec.cl index af811042d307a..a1c7e58ef9e03 100644 --- a/libclc/opencl/lib/generic/atomic/atom_dec.cl +++ b/libclc/opencl/lib/generic/atomic/atom_dec.cl @@ -6,13 +6,17 @@ // //===----------------------------------------------------------------------===// +#include #include -#include -#include + +// Non-volatile overloads are for backward compatibility with OpenCL 1.0. 
#define __CLC_IMPL(AS, TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_dec(volatile AS TYPE *p) { \ - return atomic_dec(p); \ + return __clc_atomic_dec(p, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); \ + } \ + _CLC_OVERLOAD _CLC_DEF TYPE atom_dec(AS TYPE *p) { \ + return atom_dec((volatile AS TYPE *)p); \ } #ifdef cl_khr_global_int32_base_atomics @@ -24,19 +28,11 @@ __CLC_IMPL(local, int) __CLC_IMPL(local, unsigned int) #endif // cl_khr_local_int32_base_atomics -#undef __CLC_IMPL - #ifdef cl_khr_int64_base_atomics -#define __CLC_IMPL(AS, TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE atom_dec(volatile AS TYPE *p) { \ - return atom_sub(p, (TYPE)1); \ - } - __CLC_IMPL(global, long) __CLC_IMPL(global, unsigned long) __CLC_IMPL(local, long) __CLC_IMPL(local, unsigned long) -#undef __CLC_IMPL #endif // cl_khr_int64_base_atomics diff --git a/libclc/opencl/lib/generic/atomic/atom_inc.cl b/libclc/opencl/lib/generic/atomic/atom_inc.cl index f881b3a3caa6d..f3636d85693b8 100644 --- a/libclc/opencl/lib/generic/atomic/atom_inc.cl +++ b/libclc/opencl/lib/generic/atomic/atom_inc.cl @@ -6,13 +6,17 @@ // //===----------------------------------------------------------------------===// -#include +#include #include -#include + +// Non-volatile overloads are for backward compatibility with OpenCL 1.0. 
#define __CLC_IMPL(AS, TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_inc(volatile AS TYPE *p) { \ - return atomic_inc(p); \ + return __clc_atomic_inc(p, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); \ + } \ + _CLC_OVERLOAD _CLC_DEF TYPE atom_inc(AS TYPE *p) { \ + return atom_inc((volatile AS TYPE *)p); \ } #ifdef cl_khr_global_int32_base_atomics @@ -24,19 +28,11 @@ __CLC_IMPL(local, int) __CLC_IMPL(local, unsigned int) #endif // cl_khr_local_int32_base_atomics -#undef __CLC_IMPL - #ifdef cl_khr_int64_base_atomics -#define __CLC_IMPL(AS, TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE atom_inc(volatile AS TYPE *p) { \ - return atom_add(p, (TYPE)1); \ - } - __CLC_IMPL(global, long) __CLC_IMPL(global, unsigned long) __CLC_IMPL(local, long) __CLC_IMPL(local, unsigned long) -#undef __CLC_IMPL #endif // cl_khr_int64_base_atomics diff --git a/libclc/opencl/lib/generic/atomic/atom_int32_binary.inc b/libclc/opencl/lib/generic/atomic/atom_int32_binary.inc deleted file mode 100644 index 0afc2190263fa..0000000000000 --- a/libclc/opencl/lib/generic/atomic/atom_int32_binary.inc +++ /dev/null @@ -1,23 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include -#include - -#define __CLC_ATOM_IMPL(AS, TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE __CLC_XCONCAT(atom_, __CLC_ATOMIC_OP)( \ - volatile AS TYPE * p, TYPE val) { \ - return __CLC_XCONCAT(atomic_, __CLC_ATOMIC_OP)(p, val); \ - } - -__CLC_ATOM_IMPL(__CLC_ATOMIC_ADDRESS_SPACE, int) -__CLC_ATOM_IMPL(__CLC_ATOMIC_ADDRESS_SPACE, uint) - -#undef __CLC_ATOM_IMPL -#undef __CLC_ATOMIC_OP -#undef __CLC_ATOMIC_ADDRESS_SPACE diff --git a/libclc/opencl/lib/generic/atomic/atom_max.cl b/libclc/opencl/lib/generic/atomic/atom_max.cl index 83b532ac19a1e..c2095ec36ba1e 100644 --- a/libclc/opencl/lib/generic/atomic/atom_max.cl +++ b/libclc/opencl/lib/generic/atomic/atom_max.cl @@ -6,40 +6,35 @@ // //===----------------------------------------------------------------------===// +#include #include -#include + +// Non-volatile overloads are for backward compatibility with OpenCL 1.0. 
+ +#define __CLC_IMPL(AS, TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE atom_max(volatile AS TYPE *p, TYPE val) { \ + return __clc_atomic_fetch_max(p, val, __ATOMIC_RELAXED, \ + __MEMORY_SCOPE_DEVICE); \ + } \ + _CLC_OVERLOAD _CLC_DEF TYPE atom_max(AS TYPE *p, TYPE val) { \ + return atom_max((volatile AS TYPE *)p, val); \ + } #ifdef cl_khr_global_int32_extended_atomics -#define __CLC_ATOMIC_OP max -#define __CLC_ATOMIC_ADDRESS_SPACE global -#include "atom_int32_binary.inc" +__CLC_IMPL(global, int) +__CLC_IMPL(global, unsigned int) #endif // cl_khr_global_int32_extended_atomics #ifdef cl_khr_local_int32_extended_atomics -#define __CLC_ATOMIC_OP max -#define __CLC_ATOMIC_ADDRESS_SPACE local -#include "atom_int32_binary.inc" +__CLC_IMPL(local, int) +__CLC_IMPL(local, unsigned int) #endif // cl_khr_local_int32_extended_atomics #ifdef cl_khr_int64_extended_atomics -unsigned long __clc__sync_fetch_and_max_local_8(volatile local long *, long); -unsigned long __clc__sync_fetch_and_max_global_8(volatile global long *, long); -unsigned long __clc__sync_fetch_and_umax_local_8(volatile local unsigned long *, - unsigned long); -unsigned long -__clc__sync_fetch_and_umax_global_8(volatile global unsigned long *, - unsigned long); - -#define __CLC_IMPL(AS, TYPE, OP) \ - _CLC_OVERLOAD _CLC_DEF TYPE atom_max(volatile AS TYPE *p, TYPE val) { \ - return __clc__sync_fetch_and_##OP##_##AS##_8(p, val); \ - } - -__CLC_IMPL(global, long, max) -__CLC_IMPL(global, unsigned long, umax) -__CLC_IMPL(local, long, max) -__CLC_IMPL(local, unsigned long, umax) -#undef __CLC_IMPL +__CLC_IMPL(global, long) +__CLC_IMPL(global, unsigned long) +__CLC_IMPL(local, long) +__CLC_IMPL(local, unsigned long) #endif // cl_khr_int64_extended_atomics diff --git a/libclc/opencl/lib/generic/atomic/atom_min.cl b/libclc/opencl/lib/generic/atomic/atom_min.cl index b52e34769cdd8..6360d018d1e90 100644 --- a/libclc/opencl/lib/generic/atomic/atom_min.cl +++ b/libclc/opencl/lib/generic/atomic/atom_min.cl @@ -6,40 +6,35 @@ // 
//===----------------------------------------------------------------------===// +#include #include -#include + +// Non-volatile overloads are for backward compatibility with OpenCL 1.0. + +#define __CLC_IMPL(AS, TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE atom_min(volatile AS TYPE *p, TYPE val) { \ + return __clc_atomic_fetch_min(p, val, __ATOMIC_RELAXED, \ + __MEMORY_SCOPE_DEVICE); \ + } \ + _CLC_OVERLOAD _CLC_DEF TYPE atom_min(AS TYPE *p, TYPE val) { \ + return atom_min((volatile AS TYPE *)p, val); \ + } #ifdef cl_khr_global_int32_extended_atomics -#define __CLC_ATOMIC_OP min -#define __CLC_ATOMIC_ADDRESS_SPACE global -#include "atom_int32_binary.inc" +__CLC_IMPL(global, int) +__CLC_IMPL(global, unsigned int) #endif // cl_khr_global_int32_extended_atomics #ifdef cl_khr_local_int32_extended_atomics -#define __CLC_ATOMIC_OP min -#define __CLC_ATOMIC_ADDRESS_SPACE local -#include "atom_int32_binary.inc" +__CLC_IMPL(local, int) +__CLC_IMPL(local, unsigned int) #endif // cl_khr_local_int32_extended_atomics #ifdef cl_khr_int64_extended_atomics -unsigned long __clc__sync_fetch_and_min_local_8(volatile local long *, long); -unsigned long __clc__sync_fetch_and_min_global_8(volatile global long *, long); -unsigned long __clc__sync_fetch_and_umin_local_8(volatile local unsigned long *, - unsigned long); -unsigned long -__clc__sync_fetch_and_umin_global_8(volatile global unsigned long *, - unsigned long); - -#define __CLC_IMPL(AS, TYPE, OP) \ - _CLC_OVERLOAD _CLC_DEF TYPE atom_min(volatile AS TYPE *p, TYPE val) { \ - return __clc__sync_fetch_and_##OP##_##AS##_8(p, val); \ - } - -__CLC_IMPL(global, long, min) -__CLC_IMPL(global, unsigned long, umin) -__CLC_IMPL(local, long, min) -__CLC_IMPL(local, unsigned long, umin) -#undef __CLC_IMPL +__CLC_IMPL(global, long) +__CLC_IMPL(global, unsigned long) +__CLC_IMPL(local, long) +__CLC_IMPL(local, unsigned long) #endif // cl_khr_int64_extended_atomics diff --git a/libclc/opencl/lib/generic/atomic/atom_or.cl 
b/libclc/opencl/lib/generic/atomic/atom_or.cl index fa9737f5f28ee..ad28aa436de8c 100644 --- a/libclc/opencl/lib/generic/atomic/atom_or.cl +++ b/libclc/opencl/lib/generic/atomic/atom_or.cl @@ -6,32 +6,35 @@ // //===----------------------------------------------------------------------===// +#include #include -#include + +// Non-volatile overloads are for backward compatibility with OpenCL 1.0. + +#define __CLC_IMPL(AS, TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE atom_or(volatile AS TYPE *p, TYPE val) { \ + return __clc_atomic_fetch_or(p, val, __ATOMIC_RELAXED, \ + __MEMORY_SCOPE_DEVICE); \ + } \ + _CLC_OVERLOAD _CLC_DEF TYPE atom_or(AS TYPE *p, TYPE val) { \ + return atom_or((volatile AS TYPE *)p, val); \ + } #ifdef cl_khr_global_int32_extended_atomics -#define __CLC_ATOMIC_OP or -#define __CLC_ATOMIC_ADDRESS_SPACE global -#include "atom_int32_binary.inc" +__CLC_IMPL(global, int) +__CLC_IMPL(global, unsigned int) #endif // cl_khr_global_int32_extended_atomics #ifdef cl_khr_local_int32_extended_atomics -#define __CLC_ATOMIC_OP or -#define __CLC_ATOMIC_ADDRESS_SPACE local -#include "atom_int32_binary.inc" +__CLC_IMPL(local, int) +__CLC_IMPL(local, unsigned int) #endif // cl_khr_local_int32_extended_atomics #ifdef cl_khr_int64_extended_atomics -#define __CLC_IMPL(AS, TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE atom_or(volatile AS TYPE *p, TYPE val) { \ - return __sync_fetch_and_or_8(p, val); \ - } - __CLC_IMPL(global, long) __CLC_IMPL(global, unsigned long) __CLC_IMPL(local, long) __CLC_IMPL(local, unsigned long) -#undef __CLC_IMPL #endif // cl_khr_int64_extended_atomics diff --git a/libclc/opencl/lib/generic/atomic/atom_sub.cl b/libclc/opencl/lib/generic/atomic/atom_sub.cl index 9a8acfa9116b8..9daaa1b3ce154 100644 --- a/libclc/opencl/lib/generic/atomic/atom_sub.cl +++ b/libclc/opencl/lib/generic/atomic/atom_sub.cl @@ -6,32 +6,35 @@ // //===----------------------------------------------------------------------===// +#include #include -#include + +// Non-volatile overloads are for 
backward compatibility with OpenCL 1.0. + +#define __CLC_IMPL(AS, TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE atom_sub(volatile AS TYPE *p, TYPE val) { \ + return __clc_atomic_fetch_sub(p, val, __ATOMIC_RELAXED, \ + __MEMORY_SCOPE_DEVICE); \ + } \ + _CLC_OVERLOAD _CLC_DEF TYPE atom_sub(AS TYPE *p, TYPE val) { \ + return atom_sub((volatile AS TYPE *)p, val); \ + } #ifdef cl_khr_global_int32_base_atomics -#define __CLC_ATOMIC_OP sub -#define __CLC_ATOMIC_ADDRESS_SPACE global -#include "atom_int32_binary.inc" +__CLC_IMPL(global, int) +__CLC_IMPL(global, unsigned int) #endif // cl_khr_global_int32_base_atomics #ifdef cl_khr_local_int32_base_atomics -#define __CLC_ATOMIC_OP sub -#define __CLC_ATOMIC_ADDRESS_SPACE local -#include "atom_int32_binary.inc" +__CLC_IMPL(local, int) +__CLC_IMPL(local, unsigned int) #endif // cl_khr_local_int32_base_atomics #ifdef cl_khr_int64_base_atomics -#define __CLC_IMPL(AS, TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE atom_sub(volatile AS TYPE *p, TYPE val) { \ - return __sync_fetch_and_sub_8(p, val); \ - } - __CLC_IMPL(global, long) __CLC_IMPL(global, unsigned long) __CLC_IMPL(local, long) __CLC_IMPL(local, unsigned long) -#undef __CLC_IMPL #endif // cl_khr_int64_base_atomics diff --git a/libclc/opencl/lib/generic/atomic/atom_xchg.cl b/libclc/opencl/lib/generic/atomic/atom_xchg.cl index 03f8a9c466c5f..5b75873f29760 100644 --- a/libclc/opencl/lib/generic/atomic/atom_xchg.cl +++ b/libclc/opencl/lib/generic/atomic/atom_xchg.cl @@ -6,32 +6,35 @@ // //===----------------------------------------------------------------------===// +#include #include -#include + +// Non-volatile overloads are for backward compatibility with OpenCL 1.0. 
+ +#define __CLC_IMPL(AS, TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE atom_xchg(volatile AS TYPE *p, TYPE val) { \ + return __clc_atomic_exchange(p, val, __ATOMIC_RELAXED, \ + __MEMORY_SCOPE_DEVICE); \ + } \ + _CLC_OVERLOAD _CLC_DEF TYPE atom_xchg(AS TYPE *p, TYPE val) { \ + return atom_xchg((volatile AS TYPE *)p, val); \ + } #ifdef cl_khr_global_int32_base_atomics -#define __CLC_ATOMIC_OP xchg -#define __CLC_ATOMIC_ADDRESS_SPACE global -#include "atom_int32_binary.inc" +__CLC_IMPL(global, int) +__CLC_IMPL(global, unsigned int) #endif // cl_khr_global_int32_base_atomics #ifdef cl_khr_local_int32_base_atomics -#define __CLC_ATOMIC_OP xchg -#define __CLC_ATOMIC_ADDRESS_SPACE local -#include "atom_int32_binary.inc" +__CLC_IMPL(local, int) +__CLC_IMPL(local, unsigned int) #endif // cl_khr_local_int32_base_atomics #ifdef cl_khr_int64_base_atomics -#define __CLC_IMPL(AS, TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE atom_xchg(volatile AS TYPE *p, TYPE val) { \ - return __sync_swap_8(p, val); \ - } - __CLC_IMPL(global, long) __CLC_IMPL(global, unsigned long) __CLC_IMPL(local, long) __CLC_IMPL(local, unsigned long) -#undef __CLC_IMPL #endif // cl_khr_int64_base_atomics diff --git a/libclc/opencl/lib/generic/atomic/atom_xor.cl b/libclc/opencl/lib/generic/atomic/atom_xor.cl index 392a4b794c698..21aba01267e18 100644 --- a/libclc/opencl/lib/generic/atomic/atom_xor.cl +++ b/libclc/opencl/lib/generic/atomic/atom_xor.cl @@ -6,32 +6,35 @@ // //===----------------------------------------------------------------------===// +#include #include -#include + +// Non-volatile overloads are for backward compatibility with OpenCL 1.0. 
+ +#define __CLC_IMPL(AS, TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE atom_xor(volatile AS TYPE *p, TYPE val) { \ + return __clc_atomic_fetch_xor(p, val, __ATOMIC_RELAXED, \ + __MEMORY_SCOPE_DEVICE); \ + } \ + _CLC_OVERLOAD _CLC_DEF TYPE atom_xor(AS TYPE *p, TYPE val) { \ + return atom_xor((volatile AS TYPE *)p, val); \ + } #ifdef cl_khr_global_int32_extended_atomics -#define __CLC_ATOMIC_OP xor -#define __CLC_ATOMIC_ADDRESS_SPACE global -#include "atom_int32_binary.inc" +__CLC_IMPL(global, int) +__CLC_IMPL(global, unsigned int) #endif // cl_khr_global_int32_extended_atomics #ifdef cl_khr_local_int32_extended_atomics -#define __CLC_ATOMIC_OP xor -#define __CLC_ATOMIC_ADDRESS_SPACE local -#include "atom_int32_binary.inc" +__CLC_IMPL(local, int) +__CLC_IMPL(local, unsigned int) #endif // cl_khr_local_int32_extended_atomics #ifdef cl_khr_int64_extended_atomics -#define __CLC_IMPL(AS, TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE atom_xor(volatile AS TYPE *p, TYPE val) { \ - return __sync_fetch_and_xor_8(p, val); \ - } - __CLC_IMPL(global, long) __CLC_IMPL(global, unsigned long) __CLC_IMPL(local, long) __CLC_IMPL(local, unsigned long) -#undef __CLC_IMPL #endif // cl_khr_int64_extended_atomics diff --git a/libclc/opencl/lib/generic/atomic/atomic_add.cl b/libclc/opencl/lib/generic/atomic/atomic_add.cl index d005c1dd6ac51..5501d30544e7c 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_add.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_add.cl @@ -6,11 +6,13 @@ // //===----------------------------------------------------------------------===// +#include #include #define __CLC_IMPL(TYPE, AS) \ _CLC_OVERLOAD _CLC_DEF TYPE atomic_add(volatile AS TYPE *p, TYPE val) { \ - return __sync_fetch_and_add(p, val); \ + return __clc_atomic_fetch_add(p, val, __ATOMIC_RELAXED, \ + __MEMORY_SCOPE_DEVICE); \ } __CLC_IMPL(int, global) diff --git a/libclc/opencl/lib/generic/atomic/atomic_and.cl b/libclc/opencl/lib/generic/atomic/atomic_and.cl index 12558568b0e4e..ce1adbb6f8235 100644 --- 
a/libclc/opencl/lib/generic/atomic/atomic_and.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_and.cl @@ -6,11 +6,13 @@ // //===----------------------------------------------------------------------===// +#include #include #define __CLC_IMPL(TYPE, AS) \ _CLC_OVERLOAD _CLC_DEF TYPE atomic_and(volatile AS TYPE *p, TYPE val) { \ - return __sync_fetch_and_and(p, val); \ + return __clc_atomic_fetch_and(p, val, __ATOMIC_RELAXED, \ + __MEMORY_SCOPE_DEVICE); \ } __CLC_IMPL(int, global) diff --git a/libclc/opencl/lib/generic/atomic/atomic_cmpxchg.cl b/libclc/opencl/lib/generic/atomic/atomic_cmpxchg.cl index 1045020a553fc..16a8db43e9374 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_cmpxchg.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_cmpxchg.cl @@ -6,12 +6,15 @@ // //===----------------------------------------------------------------------===// +#include #include #define __CLC_IMPL(TYPE, AS) \ _CLC_OVERLOAD _CLC_DEF TYPE atomic_cmpxchg(volatile AS TYPE *p, TYPE cmp, \ TYPE val) { \ - return __sync_val_compare_and_swap(p, cmp, val); \ + return __clc_atomic_compare_exchange(p, cmp, val, __ATOMIC_RELAXED, \ + __ATOMIC_RELAXED, \ + __MEMORY_SCOPE_DEVICE); \ } __CLC_IMPL(int, global) diff --git a/libclc/opencl/lib/generic/atomic/atomic_inc_dec.inc b/libclc/opencl/lib/generic/atomic/atomic_inc_dec.inc index 03eb5d1b33057..dd8dafb38c883 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_inc_dec.inc +++ b/libclc/opencl/lib/generic/atomic/atomic_inc_dec.inc @@ -13,7 +13,7 @@ #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_GENTYPE *Ptr) { \ - return __CLC_IMPL_FUNCTION(Ptr, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ + return __CLC_IMPL_FUNCTION(Ptr, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); \ } __CLC_DEFINE_ATOMIC(global) diff --git a/libclc/opencl/lib/generic/atomic/atomic_max.cl b/libclc/opencl/lib/generic/atomic/atomic_max.cl index aa482a8f46397..362a0ed90ca0e 100644 --- 
a/libclc/opencl/lib/generic/atomic/atomic_max.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_max.cl @@ -6,11 +6,13 @@ // //===----------------------------------------------------------------------===// +#include #include #define __CLC_IMPL(TYPE, AS, OP) \ _CLC_OVERLOAD _CLC_DEF TYPE atomic_max(volatile AS TYPE *p, TYPE val) { \ - return __sync_fetch_and_##OP(p, val); \ + return __clc_atomic_fetch_max(p, val, __ATOMIC_RELAXED, \ + __MEMORY_SCOPE_DEVICE); \ } __CLC_IMPL(int, global, max) diff --git a/libclc/opencl/lib/generic/atomic/atomic_min.cl b/libclc/opencl/lib/generic/atomic/atomic_min.cl index 7f39e94316846..1976be0014d70 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_min.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_min.cl @@ -6,11 +6,13 @@ // //===----------------------------------------------------------------------===// +#include #include #define __CLC_IMPL(TYPE, AS, OP) \ _CLC_OVERLOAD _CLC_DEF TYPE atomic_min(volatile AS TYPE *p, TYPE val) { \ - return __sync_fetch_and_##OP(p, val); \ + return __clc_atomic_fetch_min(p, val, __ATOMIC_RELAXED, \ + __MEMORY_SCOPE_DEVICE); \ } __CLC_IMPL(int, global, min) diff --git a/libclc/opencl/lib/generic/atomic/atomic_or.cl b/libclc/opencl/lib/generic/atomic/atomic_or.cl index ad14cd2178555..ef8bc00f45593 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_or.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_or.cl @@ -6,11 +6,13 @@ // //===----------------------------------------------------------------------===// +#include #include #define __CLC_IMPL(TYPE, AS) \ _CLC_OVERLOAD _CLC_DEF TYPE atomic_or(volatile AS TYPE *p, TYPE val) { \ - return __sync_fetch_and_or(p, val); \ + return __clc_atomic_fetch_or(p, val, __ATOMIC_RELAXED, \ + __MEMORY_SCOPE_DEVICE); \ } __CLC_IMPL(int, global) diff --git a/libclc/opencl/lib/generic/atomic/atomic_sub.cl b/libclc/opencl/lib/generic/atomic/atomic_sub.cl index 2e51c4c2ce02f..397737d113c0d 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_sub.cl +++ 
b/libclc/opencl/lib/generic/atomic/atomic_sub.cl @@ -6,11 +6,13 @@ // //===----------------------------------------------------------------------===// +#include #include #define __CLC_IMPL(TYPE, AS) \ _CLC_OVERLOAD _CLC_DEF TYPE atomic_sub(volatile AS TYPE *p, TYPE val) { \ - return __sync_fetch_and_sub(p, val); \ + return __clc_atomic_fetch_sub(p, val, __ATOMIC_RELAXED, \ + __MEMORY_SCOPE_DEVICE); \ } __CLC_IMPL(int, global) diff --git a/libclc/opencl/lib/generic/atomic/atomic_xchg.cl b/libclc/opencl/lib/generic/atomic/atomic_xchg.cl index 2585a5427392e..2b4bbf06d9400 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_xchg.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_xchg.cl @@ -6,24 +6,19 @@ // //===----------------------------------------------------------------------===// -#include +#include #include -_CLC_OVERLOAD _CLC_DEF float atomic_xchg(volatile global float *p, float val) { - return as_float(atomic_xchg((volatile global uint *)p, as_uint(val))); -} - -_CLC_OVERLOAD _CLC_DEF float atomic_xchg(volatile local float *p, float val) { - return as_float(atomic_xchg((volatile local uint *)p, as_uint(val))); -} - #define __CLC_IMPL(TYPE, AS) \ _CLC_OVERLOAD _CLC_DEF TYPE atomic_xchg(volatile AS TYPE *p, TYPE val) { \ - return __sync_swap_4(p, val); \ + return __clc_atomic_exchange(p, val, __ATOMIC_RELAXED, \ + __MEMORY_SCOPE_DEVICE); \ } __CLC_IMPL(int, global) __CLC_IMPL(unsigned int, global) +__CLC_IMPL(float, global) __CLC_IMPL(int, local) __CLC_IMPL(unsigned int, local) +__CLC_IMPL(float, local) #undef __CLC_IMPL diff --git a/libclc/opencl/lib/generic/atomic/atomic_xor.cl b/libclc/opencl/lib/generic/atomic/atomic_xor.cl index 0228134397464..1f200c58edbff 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_xor.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_xor.cl @@ -6,11 +6,13 @@ // //===----------------------------------------------------------------------===// +#include #include #define __CLC_IMPL(TYPE, AS) \ _CLC_OVERLOAD _CLC_DEF TYPE 
atomic_xor(volatile AS TYPE *p, TYPE val) { \ - return __sync_fetch_and_xor(p, val); \ + return __clc_atomic_fetch_xor(p, val, __ATOMIC_RELAXED, \ + __MEMORY_SCOPE_DEVICE); \ } __CLC_IMPL(int, global) diff --git a/lld/test/ELF/dso-undef-extract-lazy.s b/lld/test/ELF/dso-undef-extract-lazy.s index 40b0758957d7a..5d925453f27d7 100644 --- a/lld/test/ELF/dso-undef-extract-lazy.s +++ b/lld/test/ELF/dso-undef-extract-lazy.s @@ -25,6 +25,11 @@ # CHECK-FETCH: GLOBAL DEFAULT {{[0-9]+}} foo +## Unversioned undefined symbols also extract the archive definitions. +# RUN: yaml2obj %t/ver.yaml -o %t4.so +# RUN: ld.lld %t1.o %t4.so %t2.a -o %t.exe +# RUN: llvm-readelf --dyn-symbols %t.exe | FileCheck %s --check-prefix=CHECK-FETCH + #--- main.s .text .globl _start @@ -38,3 +43,39 @@ foo: #--- shlib.s .global foo + +#--- ver.yaml +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .gnu.version + Type: SHT_GNU_versym + Flags: [ SHF_ALLOC ] + Address: 0x0000000000200210 + AddressAlign: 0x0000000000000002 + EntSize: 0x0000000000000002 +## Test both index 0 and 1 for unversioned undefined symbols. +## https://sourceware.org/PR33577 + Entries: [ 0, 0, 1 ] + - Name: .gnu.version_r + Type: SHT_GNU_verneed + Flags: [ SHF_ALLOC ] + Address: 0x0000000000200250 + AddressAlign: 0x0000000000000004 + Dependencies: + - Version: 1 + File: dso.so.0 + Entries: + - Name: v1 + Hash: 1937 + Flags: 0 + Other: 3 +DynamicSymbols: + - Name: _start + Binding: STB_GLOBAL + - Name: foo + Binding: STB_GLOBAL diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index f2b168f6cb0e3..5f7fb00889655 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -796,6 +796,112 @@ every time. For more information, refer PTX ISA Membar/Fences ------------- +'``llvm.nvvm.fence.acquire/release.sync_restrict.*``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. 
code-block:: llvm + + declare void @llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster() + declare void @llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster() + +Overview: +""""""""" + +The `nvvm.fence.{semantics}.sync_restrict.*` restrict the class of memory +operations for which the fence instruction provides the memory ordering guarantees. +When `.sync_restrict` is restricted to `shared_cta`, then memory semantics must +be `release` and the effect of the fence operation only applies to operations +performed on objects in `shared_cta` space. Likewise, when `sync_restrict` is +restricted to `shared_cluster`, then memory semantics must be `acquire` and the +effect of the fence operation only applies to operations performed on objects in +`shared_cluster` memory space. The scope for both operations is `cluster`. For more details, +please refer the `PTX ISA `__ + +'``llvm.nvvm.fence.mbarrier_init.release.cluster``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.fence.mbarrier_init.release.cluster() + +Overview: +""""""""" + +`nvvm.fence.mbarrier_init.release.cluster` intrinsic restrict the class of +memory operations for which the fence instruction provides the memory ordering +guarantees. The `mbarrier_init` modifiers restricts the synchronizing effect to +the prior `mbarrier_init` operation executed by the same thread on mbarrier objects +in `shared_cta` memory space. For more details, please refer the `PTX ISA `__ + +'``llvm.nvvm.fence.proxy.async_generic.acquire/release.sync_restrict``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. 
code-block:: llvm + + declare void @llvm.nvvm.fence.proxy.async.generic.acquire.sync_restrict.space.cluster.scope.cluster() + declare void @llvm.nvvm.fence.proxy.async.generic.release.sync_restrict.space.cta.scope.cluster() + +Overview: +""""""""" + +`nvvm.fence.proxy.async_generic.{semantics}.sync_restrict` are used to establish +ordering between a prior memory access performed via the `async proxy__` +and a subsequent memory access performed via the generic proxy. +``nvvm.fence.proxy.async_generic.release.sync_restrict`` can form a release +sequence that synchronizes with an acquire sequence that contains the +``nvvm.fence.proxy.async_generic.acquire.sync_restrict`` proxy fence. When +`.sync_restrict` is restricted to `shared_cta`, then memory semantics must +be `release` and the effect of the fence operation only applies to operations +performed on objects in `shared_cta` space. Likewise, when `sync_restrict` is +restricted to `shared_cluster`, then memory semantics must be `acquire` and the +effect of the fence operation only applies to operations performed on objects in +`shared_cluster` memory space. The scope for both operations is `cluster`. +For more details, please refer the `PTX ISA `__ + +'``llvm.nvvm.fence.proxy.``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.fence.proxy.alias() + declare void @llvm.nvvm.fence.proxy.async() + declare void @llvm.nvvm.fence.proxy.async.global() + declare void @llvm.nvvm.fence.proxy.async.shared_cluster() + declare void @llvm.nvvm.fence.proxy.async.shared_cta() + +Overview: +""""""""" + +`nvvm.fence.proxy.{proxykind}` intrinsics represent a fence with bi-directional +proxy ordering that is established between the memory accesses done between the +`generic proxy__` +and the proxy specified by `proxykind`. 
A `bi-directional proxy` ordering between +two proxykinds establishes two `uni-directional` proxy orderings: one from the +first proxykind to the second proxykind and the other from the second proxykind +to the first proxykind. + +`alias` proxykind refers to memory accesses performed using virtually aliased +addresses to the same memory location + +`async` proxykind specifies that the memory ordering is established between the +`async proxy` and the `generic proxy`. The memory ordering is limited only to +operations performed on objects in the state space specified (`generic`, `global`, +`shared_cluster`, `shared_cta`). If no state space is specified, then the memory +ordering applies on all state spaces. For more details, please refer the +`PTX ISA `__ + '``llvm.nvvm.fence.proxy.tensormap_generic.*``' ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm-c/LLJITUtils.h b/llvm/include/llvm-c/LLJITUtils.h index 4064d5907bc8a..4d6641f40b710 100644 --- a/llvm/include/llvm-c/LLJITUtils.h +++ b/llvm/include/llvm-c/LLJITUtils.h @@ -40,7 +40,7 @@ LLVM_C_EXTERN_C_BEGIN /** * Install the plugin that submits debug objects to the executor. Executors must - * expose the llvm_orc_registerJITLoaderGDBWrapper symbol. + * expose the llvm_orc_registerJITLoaderGDBAllocAction symbol. */ LLVM_C_ABI LLVMErrorRef LLVMOrcLLJITEnableDebugSupport(LLVMOrcLLJITRef J); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 0f17312b03827..a65e4667ab76c 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -123,6 +123,32 @@ struct HardwareLoopInfo { LLVM_ABI bool canAnalyze(LoopInfo &LI); }; +/// Information for memory intrinsic cost model. +class MemIntrinsicCostAttributes { + /// Vector type of the data to be loaded or stored. + Type *DataTy = nullptr; + + /// ID of the memory intrinsic. 
+ Intrinsic::ID IID; + + /// Address space of the pointer. + unsigned AddressSpace = 0; + + /// Alignment of single element. + Align Alignment; + +public: + LLVM_ABI MemIntrinsicCostAttributes(Intrinsic::ID Id, Type *DataTy, + Align Alignment, unsigned AddressSpace) + : DataTy(DataTy), IID(Id), AddressSpace(AddressSpace), + Alignment(Alignment) {} + + Intrinsic::ID getID() const { return IID; } + Type *getDataType() const { return DataTy; } + unsigned getAddressSpace() const { return AddressSpace; } + Align getAlignment() const { return Alignment; } +}; + class IntrinsicCostAttributes { const IntrinsicInst *II = nullptr; Type *RetTy = nullptr; @@ -1556,7 +1582,7 @@ class TargetTransformInfo { /// \return The cost of masked Load and Store instructions. LLVM_ABI InstructionCost getMaskedMemoryOpCost( - unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, + const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; /// \return The cost of Gather or Scatter operation diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index aacb88d2f9684..d8e35748f53e5 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -842,8 +842,7 @@ class TargetTransformInfoImplBase { } virtual InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { return 1; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 944e1714e8f98..cb389ae74ef46 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1558,9 +1558,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { } InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type 
*DataTy, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override { + Type *DataTy = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + unsigned Opcode = MICA.getID() == Intrinsic::masked_load + ? Instruction::Load + : Instruction::Store; // TODO: Pass on AddressSpace when we have test coverage. return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false, CostKind); @@ -1617,10 +1621,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // Firstly, the cost of load/store operation. InstructionCost Cost; - if (UseMaskForCond || UseMaskForGaps) - Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment, - AddressSpace, CostKind); - else + if (UseMaskForCond || UseMaskForGaps) { + unsigned IID = Opcode == Instruction::Load ? Intrinsic::masked_load + : Intrinsic::masked_store; + Cost = thisT()->getMaskedMemoryOpCost( + {IID, VecTy, Alignment, AddressSpace}, CostKind); + } else Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind); @@ -2403,14 +2409,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { case Intrinsic::masked_store: { Type *Ty = Tys[0]; Align TyAlign = thisT()->DL.getABITypeAlign(Ty); - return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, - CostKind); + return thisT()->getMaskedMemoryOpCost({IID, Ty, TyAlign, 0}, CostKind); } case Intrinsic::masked_load: { Type *Ty = RetTy; Align TyAlign = thisT()->DL.getABITypeAlign(Ty); - return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, - CostKind); + return thisT()->getMaskedMemoryOpCost({IID, Ty, TyAlign, 0}, CostKind); } case Intrinsic::experimental_vp_strided_store: { auto *Ty = cast(ICA.getArgTypes()[0]); diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.h b/llvm/include/llvm/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.h index 
1581f7aca211e..80c2b95ca1c7b 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.h @@ -14,7 +14,6 @@ #define LLVM_EXECUTIONENGINE_ORC_DEBUGGERSUPPORTPLUGIN_H #include "llvm/ExecutionEngine/Orc/Core.h" -#include "llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h" #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" #include "llvm/Support/Compiler.h" diff --git a/llvm/include/llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h b/llvm/include/llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h deleted file mode 100644 index 3a92eee0430c5..0000000000000 --- a/llvm/include/llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h +++ /dev/null @@ -1,69 +0,0 @@ -//===- EPCDebugObjectRegistrar.h - EPC-based debug registration -*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// ExecutorProcessControl based registration of debug objects. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_EXECUTIONENGINE_ORC_EPCDEBUGOBJECTREGISTRAR_H -#define LLVM_EXECUTIONENGINE_ORC_EPCDEBUGOBJECTREGISTRAR_H - -#include "llvm/ExecutionEngine/JITSymbol.h" -#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h" -#include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/Memory.h" - -#include -#include - -namespace llvm { -namespace orc { - -class ExecutionSession; - -/// Abstract interface for registering debug objects in the executor process. 
-class DebugObjectRegistrar { -public: - virtual Error registerDebugObject(ExecutorAddrRange TargetMem, - bool AutoRegisterCode) = 0; - virtual ~DebugObjectRegistrar() = default; -}; - -/// Use ExecutorProcessControl to register debug objects locally or in a remote -/// executor process. -class LLVM_ABI EPCDebugObjectRegistrar : public DebugObjectRegistrar { -public: - EPCDebugObjectRegistrar(ExecutionSession &ES, ExecutorAddr RegisterFn) - : ES(ES), RegisterFn(RegisterFn) {} - - Error registerDebugObject(ExecutorAddrRange TargetMem, - bool AutoRegisterCode) override; - -private: - ExecutionSession &ES; - ExecutorAddr RegisterFn; -}; - -/// Create a ExecutorProcessControl-based DebugObjectRegistrar that emits debug -/// objects to the GDB JIT interface. This will use the EPC's lookupSymbols -/// method to find the registration/deregistration function addresses by name. -/// -/// If RegistrationFunctionsDylib is non-None then it will be searched to find -/// the registration functions. If it is None then the process dylib will be -/// loaded to find the registration functions. 
-LLVM_ABI Expected> -createJITLoaderGDBRegistrar( - ExecutionSession &ES, - std::optional RegistrationFunctionDylib = std::nullopt); - -} // end namespace orc -} // end namespace llvm - -#endif // LLVM_EXECUTIONENGINE_ORC_EPCDEBUGOBJECTREGISTRAR_H diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h index 5a9517b1ce7cc..bc3c6fa332a2d 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h @@ -43,9 +43,6 @@ struct jit_descriptor { }; } -extern "C" LLVM_ABI llvm::orc::shared::CWrapperFunctionResult -llvm_orc_registerJITLoaderGDBWrapper(const char *ArgData, size_t ArgSize); - extern "C" LLVM_ABI llvm::orc::shared::CWrapperFunctionResult llvm_orc_registerJITLoaderGDBAllocAction(const char *ArgData, size_t ArgSize); diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 1b485dc8ccd1e..be4f99aaaa241 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1566,14 +1566,19 @@ let TargetPrefix = "nvvm" in { foreach rnd = ["rn", "rz"] in { foreach relu = ["", "_relu"] in { - def int_nvvm_ff2bf16x2_ # rnd # relu : NVVMBuiltin, - PureIntrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty]>; - - def int_nvvm_ff2f16x2_ # rnd # relu : NVVMBuiltin, - PureIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty]>; - - def int_nvvm_f2bf16_ # rnd # relu : NVVMBuiltin, - PureIntrinsic<[llvm_bfloat_ty], [llvm_float_ty]>; + foreach satfinite = ["", "_satfinite"] in { + def int_nvvm_ff2bf16x2_ # rnd # relu # satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty]>; + + def int_nvvm_ff2f16x2_ # rnd # relu # satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty]>; + + def int_nvvm_f2bf16_ # rnd # relu # satfinite : NVVMBuiltin, + 
PureIntrinsic<[llvm_bfloat_ty], [llvm_float_ty]>; + + def int_nvvm_f2f16_ # rnd # relu # satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_half_ty], [llvm_float_ty]>; + } } } @@ -1746,32 +1751,64 @@ let TargetPrefix = "nvvm" in { def int_nvvm_barrier_cluster_wait_aligned : Intrinsic<[]>; } - // - // Membar - // - let IntrProperties = [IntrNoCallback] in { +// +// Membar / Fence +// +let IntrProperties = [IntrNoCallback] in { def int_nvvm_membar_cta : NVVMBuiltin, Intrinsic<[]>; def int_nvvm_membar_gl : NVVMBuiltin, Intrinsic<[]>; def int_nvvm_membar_sys : NVVMBuiltin, Intrinsic<[]>; def int_nvvm_fence_sc_cluster : Intrinsic<[]>; - } - // - // Proxy fence (uni-directional) - // + // Operation fence + def int_nvvm_fence_mbarrier_init_release_cluster: Intrinsic<[], [], [], + "llvm.nvvm.fence.mbarrier_init.release.cluster">; + + // Thread fence + def int_nvvm_fence_acquire_sync_restrict_space_cluster_scope_cluster : + Intrinsic<[], [], [], + "llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster">; + + def int_nvvm_fence_release_sync_restrict_space_cta_scope_cluster : + Intrinsic<[], [], [], + "llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster">; + +// +// Proxy fence (uni-directional) +// + + def int_nvvm_fence_proxy_async_generic_acquire_sync_restrict_space_cluster_scope_cluster : + Intrinsic<[], [], [], + "llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster">; + + def int_nvvm_fence_proxy_async_generic_release_sync_restrict_space_cta_scope_cluster : + Intrinsic<[], [], [], + "llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster">; + foreach scope = ["cta", "cluster", "gpu", "sys"] in { def int_nvvm_fence_proxy_tensormap_generic_release_ # scope : - Intrinsic<[], [], [IntrNoCallback], + Intrinsic<[], [], [], "llvm.nvvm.fence.proxy.tensormap_generic.release." # scope>; // The imm-arg 'size' can only be 128. 
def int_nvvm_fence_proxy_tensormap_generic_acquire_ # scope : - Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], - [IntrNoCallback, IntrArgMemOnly, ImmArg>, - Range, 128, 129>], - "llvm.nvvm.fence.proxy.tensormap_generic.acquire." # scope>; + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [], + "llvm.nvvm.fence.proxy.tensormap_generic.acquire." # scope> { + let IntrProperties = [IntrNoCallback, IntrArgMemOnly, + ImmArg>, Range, 128, 129>]; + } + } + +// +// Proxy fence (bi-directional) +// + foreach proxykind = ["alias", "async", "async.global", "async.shared_cta", + "async.shared_cluster"] in { + defvar Intr = IntrinsicName<"llvm.nvvm.fence.proxy." # proxykind>; + def Intr.record_name: Intrinsic<[], [], [], Intr.intr_name>; } +} // // Async Copy diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index db99885121ec1..6abde996e6dc8 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -1579,6 +1579,99 @@ def CONVERGENCECTRL_GLUE : StandardPseudoInstruction { } } +/// Allow a target to replace the instruction definition of a +/// StandardPseudoInstruction. A target should only define one +/// instance of this per instruction. +/// +/// This is intended to allow targets to specify the register class +/// used for pointers. It should not be used to change the fundamental +/// operand structure (e.g., this should not add or remove operands, +/// or change the operand types). 
+class TargetSpecializedStandardPseudoInstruction< + StandardPseudoInstruction base_inst> : Instruction { + + StandardPseudoInstruction Instruction = base_inst; + let OutOperandList = base_inst.OutOperandList; + let InOperandList = base_inst.InOperandList; + + // TODO: Copy everything + let usesCustomInserter = base_inst.usesCustomInserter; + let hasSideEffects = base_inst.hasSideEffects; + let mayLoad = base_inst.mayLoad; + let mayStore = base_inst.mayStore; + let isTerminator = base_inst.isTerminator; + let isBranch = base_inst.isBranch; + let isIndirectBranch = base_inst.isIndirectBranch; + let isEHScopeReturn = base_inst.isEHScopeReturn; + let isReturn = base_inst.isReturn; + let isCall = base_inst.isCall; + let hasCtrlDep = base_inst.hasCtrlDep; + let isReMaterializable = base_inst.isReMaterializable; + let isMeta = base_inst.isMeta; + let Size = base_inst.Size; + let isAsCheapAsAMove = base_inst.isAsCheapAsAMove; + let isPseudo = true; + let hasNoSchedulingInfo = true; + let isNotDuplicable = base_inst.isNotDuplicable; + let isConvergent = base_inst.isConvergent; + let hasExtraSrcRegAllocReq = base_inst.hasExtraSrcRegAllocReq; + let hasExtraDefRegAllocReq = base_inst.hasExtraDefRegAllocReq; +} + +// All pseudo instructions which need a pointer register class, which +// should be specialized by a target. +defvar PseudosWithPtrOps = [ + LOAD_STACK_GUARD, + PREALLOCATED_ARG, + PATCHABLE_EVENT_CALL, + PATCHABLE_TYPED_EVENT_CALL +]; + + +/// Replace PointerLikeRegClass operands in OperandList with new_rc. +class RemapPointerOperandList { + // Collect the set of names so we can query and rewrite them. + list op_names = !foreach(i, !range(!size(OperandList)), + !getdagname(OperandList, i)); + + // Beautiful language. This would be a lot easier if !getdagarg + // didn't require a specific type. 
We can't just collect a list of + // the operand values and reconstruct the dag, since there isn't a + // common base class for all the field kinds used in + // pseudoinstruction definitions; therefore everything must be + // maintained as a dag, so use a foldl. Additionally, ? doesn't + // evaluate as false so we get even more noise. + dag ret = + !foldl(OperandList, op_names, acc, name, + !cond( + !initialized(!getdagarg(OperandList, name)) + : !setdagarg(acc, name, new_rc), + !initialized(!getdagarg(OperandList, name)) : acc, + !initialized(!getdagarg(OperandList, name)) : acc + ) + ); +} + +/// Define an override for a pseudoinstruction which uses a pointer +/// register class, specialized to the target's pointer type. +class RemapPointerOperands : + TargetSpecializedStandardPseudoInstruction { + let OutOperandList = + RemapPointerOperandList.ret; + let InOperandList = + RemapPointerOperandList.ret; +} + +/// Helper to replace all pseudoinstructions using pointers to a +/// target register class. Most targets should use this. +multiclass RemapAllTargetPseudoPointerOperands< + RegisterClassLike default_ptr_rc> { + foreach inst = PseudosWithPtrOps in { + def : RemapPointerOperands; + } +} + // Generic opcodes used in GlobalISel. 
include "llvm/Target/GenericOpcodes.td" diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 0426ac7e62fab..45369f0ffe137 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1183,10 +1183,9 @@ InstructionCost TargetTransformInfo::getMemoryOpCost( } InstructionCost TargetTransformInfo::getMaskedMemoryOpCost( - unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, + const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { - InstructionCost Cost = TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, - AddressSpace, CostKind); + InstructionCost Cost = TTIImpl->getMaskedMemoryOpCost(MICA, CostKind); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 9dfb6af58323a..1099faca9fa46 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -459,6 +459,15 @@ void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute, addBlock(Die, Attribute, Block->BestForm(), Block); } +void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute, + const DIExpression *Expr) { + DIELoc *Loc = new (DIEValueAllocator) DIELoc; + DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); + DwarfExpr.setMemoryLocationKind(); + DwarfExpr.addExpression(Expr); + addBlock(Die, Attribute, DwarfExpr.finalize()); +} + void DwarfUnit::addSourceLine(DIE &Die, unsigned Line, unsigned Column, const DIFile *File) { if (Line == 0) @@ -842,27 +851,14 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIStringType *STy) { if (auto *VarDIE = getDIE(Var)) addDIEEntry(Buffer, dwarf::DW_AT_string_length, *VarDIE); } else if (DIExpression *Expr = STy->getStringLengthExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - // This is to describe 
the memory location of the - // length of a Fortran deferred length string, so - // lock it down as such. - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_string_length, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_string_length, Expr); } else { uint64_t Size = STy->getSizeInBits() >> 3; addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt, Size); } if (DIExpression *Expr = STy->getStringLocationExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - // This is to describe the memory location of the - // string, so lock it down as such. - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_data_location, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_data_location, Expr); } if (STy->getEncoding()) { @@ -1618,11 +1614,7 @@ void DwarfUnit::constructSubrangeDIE(DIE &DW_Subrange, const DISubrangeType *SR, if (auto *VarDIE = getDIE(BV)) addDIEEntry(DW_Subrange, Attr, *VarDIE); } else if (auto *BE = dyn_cast_if_present(Bound)) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(BE); - addBlock(DW_Subrange, Attr, DwarfExpr.finalize()); + addBlock(DW_Subrange, Attr, BE); } else if (auto *BI = dyn_cast_if_present(Bound)) { if (Attr == dwarf::DW_AT_GNU_bias) { if (BI->getSExtValue() != 0) @@ -1660,11 +1652,7 @@ void DwarfUnit::constructSubrangeDIE(DIE &Buffer, const DISubrange *SR) { if (auto *VarDIE = getDIE(BV)) addDIEEntry(DW_Subrange, Attr, *VarDIE); } else if (auto *BE = dyn_cast_if_present(Bound)) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(BE); - addBlock(DW_Subrange, Attr, DwarfExpr.finalize()); + addBlock(DW_Subrange, Attr, BE); } else if (auto *BI = 
dyn_cast_if_present(Bound)) { if (Attr == dwarf::DW_AT_count) { if (BI->getSExtValue() != -1) @@ -1710,11 +1698,7 @@ void DwarfUnit::constructGenericSubrangeDIE(DIE &Buffer, addSInt(DwGenericSubrange, Attr, dwarf::DW_FORM_sdata, BE->getElement(1)); } else { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(BE); - addBlock(DwGenericSubrange, Attr, DwarfExpr.finalize()); + addBlock(DwGenericSubrange, Attr, BE); } } }; @@ -1781,44 +1765,28 @@ void DwarfUnit::constructArrayTypeDIE(DIE &Buffer, const DICompositeType *CTy) { if (auto *VarDIE = getDIE(Var)) addDIEEntry(Buffer, dwarf::DW_AT_data_location, *VarDIE); } else if (DIExpression *Expr = CTy->getDataLocationExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_data_location, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_data_location, Expr); } if (DIVariable *Var = CTy->getAssociated()) { if (auto *VarDIE = getDIE(Var)) addDIEEntry(Buffer, dwarf::DW_AT_associated, *VarDIE); } else if (DIExpression *Expr = CTy->getAssociatedExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_associated, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_associated, Expr); } if (DIVariable *Var = CTy->getAllocated()) { if (auto *VarDIE = getDIE(Var)) addDIEEntry(Buffer, dwarf::DW_AT_allocated, *VarDIE); } else if (DIExpression *Expr = CTy->getAllocatedExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_allocated, DwarfExpr.finalize()); + 
addBlock(Buffer, dwarf::DW_AT_allocated, Expr); } if (auto *RankConst = CTy->getRankConst()) { addSInt(Buffer, dwarf::DW_AT_rank, dwarf::DW_FORM_sdata, RankConst->getSExtValue()); } else if (auto *RankExpr = CTy->getRankExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(RankExpr); - addBlock(Buffer, dwarf::DW_AT_rank, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_rank, RankExpr); } if (auto *BitStride = CTy->getBitStrideConst()) { diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h index 7841ff7fa5952..6875c415057d2 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -218,6 +218,9 @@ class DwarfUnit : public DIEUnit { void addBlock(DIE &Die, dwarf::Attribute Attribute, dwarf::Form Form, DIEBlock *Block); + /// Add an expression as block data. + void addBlock(DIE &Die, dwarf::Attribute Attribute, const DIExpression *Expr); + /// Add location information to specified debug information entry. 
void addSourceLine(DIE &Die, unsigned Line, unsigned Column, const DIFile *File); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 45a08347b1ec2..f0fbe0135353f 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -3499,6 +3499,9 @@ bool CombinerHelper::matchCombineBuildUnmerge(MachineInstr &MI, LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); LLT UnmergeSrcTy = MRI.getType(UnmergeSrc); + if (!UnmergeSrcTy.isVector()) + return false; + // Ensure we only generate legal instructions post-legalizer if (!IsPreLegalize && !isLegal({TargetOpcode::G_CONCAT_VECTORS, {DstTy, UnmergeSrcTy}})) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 71facc7e1d553..56909bb8631c1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5702,6 +5702,9 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, return false; } + case ISD::VECTOR_COMPRESS: + return false; + default: // Allow the target to implement this method for its nodes. 
if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::INTRINSIC_WO_CHAIN || diff --git a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt index 41402f7a69ccb..422b20c649c70 100644 --- a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt @@ -17,7 +17,6 @@ add_llvm_component_library(LLVMOrcJIT DebugUtils.cpp EHFrameRegistrationPlugin.cpp EPCDynamicLibrarySearchGenerator.cpp - EPCDebugObjectRegistrar.cpp EPCGenericDylibManager.cpp EPCGenericJITLinkMemoryManager.cpp EPCGenericRTDyldMemoryManager.cpp diff --git a/llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp b/llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp deleted file mode 100644 index 08bef37b06c82..0000000000000 --- a/llvm/lib/ExecutionEngine/Orc/EPCDebugObjectRegistrar.cpp +++ /dev/null @@ -1,61 +0,0 @@ -//===----- EPCDebugObjectRegistrar.cpp - EPC-based debug registration -----===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h" - -#include "llvm/ExecutionEngine/Orc/Core.h" - -namespace llvm { -namespace orc { - -Expected> createJITLoaderGDBRegistrar( - ExecutionSession &ES, - std::optional RegistrationFunctionDylib) { - auto &EPC = ES.getExecutorProcessControl(); - - if (!RegistrationFunctionDylib) { - if (auto D = EPC.getDylibMgr().loadDylib(nullptr)) - RegistrationFunctionDylib = *D; - else - return D.takeError(); - } - - SymbolStringPtr RegisterFn = - EPC.getTargetTriple().isOSBinFormatMachO() - ? 
EPC.intern("_llvm_orc_registerJITLoaderGDBWrapper") - : EPC.intern("llvm_orc_registerJITLoaderGDBWrapper"); - - SymbolLookupSet RegistrationSymbols; - RegistrationSymbols.add(RegisterFn); - - auto Result = EPC.getDylibMgr().lookupSymbols( - {{*RegistrationFunctionDylib, RegistrationSymbols}}); - if (!Result) - return Result.takeError(); - - assert(Result->size() == 1 && "Unexpected number of dylibs in result"); - assert((*Result)[0].size() == 1 && - "Unexpected number of addresses in result"); - - if (!(*Result)[0][0].has_value()) - return make_error( - "Expected a valid address in the lookup result", - inconvertibleErrorCode()); - - ExecutorAddr RegisterAddr = (*Result)[0][0]->getAddress(); - return std::make_unique(ES, RegisterAddr); -} - -Error EPCDebugObjectRegistrar::registerDebugObject(ExecutorAddrRange TargetMem, - bool AutoRegisterCode) { - return ES.callSPSWrapper( - RegisterFn, TargetMem, AutoRegisterCode); -} - -} // namespace orc -} // namespace llvm diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp index cb1b3b05cd24d..f255de093b24f 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp @@ -88,19 +88,3 @@ llvm_orc_registerJITLoaderGDBAllocAction(const char *ArgData, size_t ArgSize) { }) .release(); } - -extern "C" orc::shared::CWrapperFunctionResult -llvm_orc_registerJITLoaderGDBWrapper(const char *ArgData, size_t ArgSize) { - using namespace orc::shared; - return WrapperFunction::handle( - ArgData, ArgSize, - [](ExecutorAddrRange R, bool AutoRegisterCode) { - appendJITDebugDescriptor(R.Start.toPtr(), - R.size()); - // Run into the rendezvous breakpoint. 
- if (AutoRegisterCode) - __jit_debug_register_code(); - return Error::success(); - }) - .release(); -} diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index bf195ca210e9b..0bae00bafee3c 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4730,12 +4730,12 @@ bool AArch64TTIImpl::prefersVectorizedAddressing() const { } InstructionCost -AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, - Align Alignment, unsigned AddressSpace, +AArch64TTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { + Type *Src = MICA.getDataType(); + if (useNeonVector(Src)) - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); + return BaseT::getMaskedMemoryOpCost(MICA, CostKind); auto LT = getTypeLegalizationCost(Src); if (!LT.first.isValid()) return InstructionCost::getInvalid(); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index d189f563f99a1..6cc4987428567 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -188,8 +188,7 @@ class AArch64TTIImpl final : public BasicTTIImplBase { unsigned Opcode2) const; InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override; InstructionCost diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index fdf69b04bf676..0dd680f9a58d9 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1242,7 +1242,9 @@ 
AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .legalFor({{v16s8, v8s8}, {v8s16, v4s16}, {v4s32, v2s32}}) .bitcastIf( [=](const LegalityQuery &Query) { - return Query.Types[0].getSizeInBits() <= 128 && + return Query.Types[0].isFixedVector() && + Query.Types[1].isFixedVector() && + Query.Types[0].getSizeInBits() <= 128 && Query.Types[1].getSizeInBits() <= 64; }, [=](const LegalityQuery &Query) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp index 30a1f05a8a390..2e586ea207af5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp @@ -27,8 +27,17 @@ using namespace llvm; namespace { class BarrierLatency : public ScheduleDAGMutation { +private: + SmallSet IgnoredScopes; + public: - BarrierLatency() = default; + BarrierLatency(MachineFunction *MF) { + LLVMContext &Context = MF->getFunction().getContext(); + IgnoredScopes.insert(SyncScope::SingleThread); + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront")); + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront-one-as")); + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("singlethread-one-as")); + } void apply(ScheduleDAGInstrs *DAG) override; }; @@ -40,8 +49,11 @@ void BarrierLatency::apply(ScheduleDAGInstrs *DAG) { continue; // Update latency on barrier edges of ATOMIC_FENCE. - // We don't consider the scope of the fence or type of instruction - // involved in the barrier edge. + // Ignore scopes not expected to have any latency. 
+ SyncScope::ID SSID = static_cast(MI->getOperand(1).getImm()); + if (IgnoredScopes.contains(SSID)) + continue; + for (SDep &PredDep : SU.Preds) { if (!PredDep.isBarrier()) continue; @@ -68,6 +80,6 @@ void BarrierLatency::apply(ScheduleDAGInstrs *DAG) { } // end namespace std::unique_ptr -llvm::createAMDGPUBarrierLatencyDAGMutation() { - return std::make_unique(); +llvm::createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF) { + return std::make_unique(MF); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h index c23f0b99fe822..547cd2a11f7df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h @@ -14,7 +14,10 @@ namespace llvm { -std::unique_ptr createAMDGPUBarrierLatencyDAGMutation(); +class MachineFunction; + +std::unique_ptr +createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF); } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index ac0cb549d020b..41986fef213f4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1850,72 +1850,83 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr, isFlatScratchBaseLegal(Addr))) { int64_t COffsetVal = cast(N1)->getSExtValue(); - const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { - Addr = N0; - OffsetVal = COffsetVal; - } else { - // If the offset doesn't fit, put the low bits into the offset field and - // add the rest. - // - // For a FLAT instruction the hardware decides whether to access - // global/scratch/shared memory based on the high bits of vaddr, - // ignoring the offset field, so we have to ensure that when we add - // remainder to vaddr it still points into the same underlying object. 
- // The easiest way to do that is to make sure that we split the offset - // into two pieces that are both >= 0 or both <= 0. - - SDLoc DL(N); - uint64_t RemainderOffset; - - std::tie(OffsetVal, RemainderOffset) = - TII->splitFlatOffset(COffsetVal, AS, FlatVariant); - - SDValue AddOffsetLo = - getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); - SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); - - if (Addr.getValueType().getSizeInBits() == 32) { - SmallVector Opnds; - Opnds.push_back(N0); - Opnds.push_back(AddOffsetLo); - unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; - if (Subtarget->hasAddNoCarry()) { - AddOp = AMDGPU::V_ADD_U32_e64; - Opnds.push_back(Clamp); - } - Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); + // Adding the offset to the base address in a FLAT instruction must not + // change the memory aperture in which the address falls. Therefore we can + // only fold offsets from inbounds GEPs into FLAT instructions. + bool IsInBounds = + Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds(); + if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) { + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { + Addr = N0; + OffsetVal = COffsetVal; } else { - // TODO: Should this try to use a scalar add pseudo if the base address - // is uniform and saddr is usable? 
- SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); - SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); - - SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub0); - SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub1); - - SDValue AddOffsetHi = - getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); - - SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); - - SDNode *Add = - CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, - {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); - - SDNode *Addc = CurDAG->getMachineNode( - AMDGPU::V_ADDC_U32_e64, DL, VTs, - {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); - - SDValue RegSequenceArgs[] = { - CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32), - SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; - - Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::i64, RegSequenceArgs), - 0); + // If the offset doesn't fit, put the low bits into the offset field + // and add the rest. + // + // For a FLAT instruction the hardware decides whether to access + // global/scratch/shared memory based on the high bits of vaddr, + // ignoring the offset field, so we have to ensure that when we add + // remainder to vaddr it still points into the same underlying object. + // The easiest way to do that is to make sure that we split the offset + // into two pieces that are both >= 0 or both <= 0. 
+ + SDLoc DL(N); + uint64_t RemainderOffset; + + std::tie(OffsetVal, RemainderOffset) = + TII->splitFlatOffset(COffsetVal, AS, FlatVariant); + + SDValue AddOffsetLo = + getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); + SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + + if (Addr.getValueType().getSizeInBits() == 32) { + SmallVector Opnds; + Opnds.push_back(N0); + Opnds.push_back(AddOffsetLo); + unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; + if (Subtarget->hasAddNoCarry()) { + AddOp = AMDGPU::V_ADD_U32_e64; + Opnds.push_back(Clamp); + } + Addr = + SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); + } else { + // TODO: Should this try to use a scalar add pseudo if the base + // address is uniform and saddr is usable? + SDValue Sub0 = + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SDValue Sub1 = + CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + + SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub0); + SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub1); + + SDValue AddOffsetHi = + getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); + + SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); + + SDNode *Add = + CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, + {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); + + SDNode *Addc = CurDAG->getMachineNode( + AMDGPU::V_ADDC_U32_e64, DL, VTs, + {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); + + SDValue RegSequenceArgs[] = { + CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, + MVT::i32), + SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; + + Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::i64, RegSequenceArgs), + 0); + } } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index e36c57ad59bfd..097457f9f0deb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -530,6 +530,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}}); + addRulesForGOpcs({G_FSHR}, Standard) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}}); + addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}}); addRulesForGOpcs({G_UBFX, G_SBFX}, Standard) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 39c76ebff2e05..f3d152d47c33d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -648,7 +648,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); - DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF)); return DAG; } @@ -669,7 +669,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) { if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); - DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF)); return DAG; } @@ -1241,7 +1241,7 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const { EnableVOPD) DAG->addMutation(createVOPDPairingMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); - DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF)); return DAG; } //===----------------------------------------------------------------------===// 
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index ca98b80787fb4..a87f9f274a4d3 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1423,7 +1423,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, /// \returns true if the target has packed f32 instructions that only read 32 /// bits from a scalar operand (SGPR or literal) and replicates the bits to /// both channels. - bool hasPKF32InstsReplicatingLow32BitsOfScalarInput() const { + bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const { return getGeneration() == GFX12 && GFX1250Insts; } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index cfc0c16ea30e5..8a959125de402 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -766,29 +766,21 @@ static void appendFoldCandidate(SmallVectorImpl &FoldList, FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp)); } -// Returns true if the instruction is a packed f32 instruction that only reads -// 32 bits from a scalar operand (SGPR or literal) and replicates the bits to -// both channels. -static bool -isPKF32InstrReplicatingLow32BitsOfScalarInput(const GCNSubtarget *ST, - MachineInstr *MI) { - if (!ST->hasPKF32InstsReplicatingLow32BitsOfScalarInput()) +// Returns true if the instruction is a packed F32 instruction and the +// corresponding scalar operand reads 32 bits and replicates the bits to both +// channels. 
+static bool isPKF32InstrReplicatesLower32BitsOfScalarOperand( + const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo) { + if (!ST->hasPKF32InstsReplicatingLower32BitsOfScalarInput()) return false; - switch (MI->getOpcode()) { - case AMDGPU::V_PK_ADD_F32: - case AMDGPU::V_PK_MUL_F32: - case AMDGPU::V_PK_FMA_F32: - return true; - default: - return false; - } - llvm_unreachable("unknown instruction"); + const MCOperandInfo &OpDesc = MI->getDesc().operands()[OpNo]; + return OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32; } // Packed FP32 instructions only read 32 bits from a scalar operand (SGPR or // literal) and replicates the bits to both channels. Therefore, if the hi and // lo are not same, we can't fold it. -static bool checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput( +static bool checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand( const FoldableDef &OpToFold) { assert(OpToFold.isImm() && "Expected immediate operand"); uint64_t ImmVal = OpToFold.getEffectiveImmVal().value(); @@ -953,8 +945,8 @@ bool SIFoldOperandsImpl::tryAddToFoldList( // Special case for PK_F32 instructions if we are trying to fold an imm to // src0 or src1. 
if (OpToFold.isImm() && - isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, MI) && - !checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(OpToFold)) + isPKF32InstrReplicatesLower32BitsOfScalarOperand(ST, MI, OpNo) && + !checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(OpToFold)) return false; appendFoldCandidate(FoldList, MI, OpNo, OpToFold); @@ -1171,8 +1163,8 @@ bool SIFoldOperandsImpl::tryToFoldACImm( return false; if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) { - if (isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, UseMI) && - !checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(OpToFold)) + if (isPKF32InstrReplicatesLower32BitsOfScalarOperand(ST, UseMI, UseOpIdx) && + !checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(OpToFold)) return false; appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold); return true; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 24f58a68c345d..d12b802fe234f 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1631,20 +1631,22 @@ InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, } InstructionCost -ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, +ARMTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { + unsigned IID = MICA.getID(); + Type *Src = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + unsigned AddressSpace = MICA.getAddressSpace(); if (ST->hasMVEIntegerOps()) { - if (Opcode == Instruction::Load && + if (IID == Intrinsic::masked_load && isLegalMaskedLoad(Src, Alignment, AddressSpace)) return ST->getMVEVectorCostFactor(CostKind); - if (Opcode == Instruction::Store && + if (IID == Intrinsic::masked_store && isLegalMaskedStore(Src, Alignment, AddressSpace)) return 
ST->getMVEVectorCostFactor(CostKind); } if (!isa(Src)) - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); + return BaseT::getMaskedMemoryOpCost(MICA, CostKind); // Scalar cost, which is currently very high due to the efficiency of the // generated code. return cast(Src)->getNumElements() * 8; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 0810c5532ed91..919a6fc9fd0b0 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -275,8 +275,7 @@ class ARMTTIImpl final : public BasicTTIImplBase { const Instruction *I = nullptr) const override; InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override; InstructionCost getInterleavedMemoryOpCost( diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index e925e041eb64e..8f3f0cc8abb01 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -224,11 +224,9 @@ InstructionCost HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, } InstructionCost -HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, - Align Alignment, unsigned AddressSpace, +HexagonTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); + return BaseT::getMaskedMemoryOpCost(MICA, CostKind); } InstructionCost diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index cec2bf9656ffc..e95b5a10b76a7 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ 
b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -120,8 +120,7 @@ class HexagonTTIImpl final : public BasicTTIImplBase { TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I = nullptr) const override; InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override; InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 18b3a5ce914e8..8b129e7e5eeae 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -595,6 +595,15 @@ let hasSideEffects = false in { defm CVT_bf16 : CVT_FROM_ALL<"bf16", B16, [hasPTX<78>, hasSM<90>]>; defm CVT_f32 : CVT_FROM_ALL<"f32", B32>; defm CVT_f64 : CVT_FROM_ALL<"f64", B64>; + + multiclass CVT_FROM_FLOAT_SATFINITE { + def _f32_sf : + BasicFlagsNVPTXInst<(outs RC:$dst), + (ins B32:$src), (ins CvtMode:$mode), + "cvt${mode:base}${mode:relu}.satfinite." # ToName # ".f32">; + } + defm CVT_bf16 : CVT_FROM_FLOAT_SATFINITE<"bf16", B16>; + defm CVT_f16 : CVT_FROM_FLOAT_SATFINITE<"f16", B16>; // These cvts are different from those above: The source and dest registers // are of the same type. @@ -611,6 +620,11 @@ let hasSideEffects = false in { (ins B32:$src1, B32:$src2), (ins CvtMode:$mode), "cvt${mode:base}${mode:relu}." # FromName # ".f32">, Requires<[hasPTX<70>, hasSM<80>]>; + + def _f32_sf : + BasicFlagsNVPTXInst<(outs RC:$dst), + (ins B32:$src1, B32:$src2), (ins CvtMode:$mode), + "cvt${mode:base}${mode:relu}.satfinite." 
# FromName # ".f32">; } defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", B32>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index bcdb46eca9744..8fd014a09cc58 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -364,7 +364,42 @@ def INT_FENCE_SC_CLUSTER: NullaryInst<"fence.sc.cluster", int_nvvm_fence_sc_cluster>, Requires<[hasPTX<78>, hasSM<90>]>; +def INT_FENCE_MBARRIER_INIT_RELEASE_CLUSTER: + NullaryInst<"fence.mbarrier_init.release.cluster", + int_nvvm_fence_mbarrier_init_release_cluster>, + Requires<[hasPTX<80>, hasSM<90>]>; + +let Predicates = [hasPTX<86>, hasSM<90>] in { +def INT_FENCE_ACQUIRE_SYNC_RESTRICT_CLUSTER_CLUSTER: + NullaryInst<"fence.acquire.sync_restrict::shared::cluster.cluster", + int_nvvm_fence_acquire_sync_restrict_space_cluster_scope_cluster>; + +def INT_FENCE_RELEASE_SYNC_RESTRICT_CTA_CLUSTER: + NullaryInst<"fence.release.sync_restrict::shared::cta.cluster", + int_nvvm_fence_release_sync_restrict_space_cta_scope_cluster>; +} + // Proxy fence (uni-directional) +let Predicates = [hasPTX<86>, hasSM<90>] in { +def INT_NVVM_FENCE_PROXY_ASYNC_GENERIC_ACQUIRE_SYNC_RESTRICT_SPACE_CLUSTER_SCOPE_CLUSTER: + NullaryInst<"fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster", + int_nvvm_fence_proxy_async_generic_acquire_sync_restrict_space_cluster_scope_cluster>; + +def INT_NVVM_FENCE_PROXY_ASYNC_GENERIC_RELEASE_SYNC_RESTRICT_SPACE_CTA_SCOPE_CLUSTER: + NullaryInst<"fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster", + int_nvvm_fence_proxy_async_generic_release_sync_restrict_space_cta_scope_cluster>; +} + +// Proxy fence (bi-directional) +foreach proxykind = ["alias", "async", "async.global", "async.shared_cta", + "async.shared_cluster"] in { + defvar Preds = !if(!eq(proxykind, "alias"), [hasPTX<75>, hasSM<70>], + [hasPTX<80>, hasSM<90>]); + defvar Intr = IntrinsicName<"llvm.nvvm.fence.proxy." 
# proxykind>; + def : NullaryInst<"fence.proxy." # !subst("_", "::", proxykind), + !cast(Intr.record_name)>, Requires; +} + class FENCE_PROXY_TENSORMAP_GENERIC_RELEASE : NullaryInst<"fence.proxy.tensormap::generic.release." # Scope, Intr>, Requires<[hasPTX<83>, hasSM<90>]>; @@ -1917,7 +1952,12 @@ def : Pat<(int_nvvm_ff2bf16x2_rn f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, C def : Pat<(int_nvvm_ff2bf16x2_rn_relu f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRN_RELU)>; def : Pat<(int_nvvm_ff2bf16x2_rz f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRZ)>; def : Pat<(int_nvvm_ff2bf16x2_rz_relu f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRZ_RELU)>; - +let Predicates = [hasPTX<81>, hasSM<80>] in { + def : Pat<(int_nvvm_ff2bf16x2_rn_satfinite f32:$a, f32:$b), (CVT_bf16x2_f32_sf $a, $b, CvtRN)>; + def : Pat<(int_nvvm_ff2bf16x2_rn_relu_satfinite f32:$a, f32:$b), (CVT_bf16x2_f32_sf $a, $b, CvtRN_RELU)>; + def : Pat<(int_nvvm_ff2bf16x2_rz_satfinite f32:$a, f32:$b), (CVT_bf16x2_f32_sf $a, $b, CvtRZ)>; + def : Pat<(int_nvvm_ff2bf16x2_rz_relu_satfinite f32:$a, f32:$b), (CVT_bf16x2_f32_sf $a, $b, CvtRZ_RELU)>; +} let Predicates = [hasPTX<87>, hasSM100aOrSM103a] in { def : Pat<(int_nvvm_ff2bf16x2_rs f32:$a, f32:$b, i32:$c), (CVT_bf16x2_f32_rs $a, $b, $c, CvtRS)>; @@ -1933,6 +1973,12 @@ def : Pat<(int_nvvm_ff2f16x2_rn f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, Cvt def : Pat<(int_nvvm_ff2f16x2_rn_relu f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRN_RELU)>; def : Pat<(int_nvvm_ff2f16x2_rz f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRZ)>; def : Pat<(int_nvvm_ff2f16x2_rz_relu f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRZ_RELU)>; +let Predicates = [hasPTX<81>, hasSM<80>] in { + def : Pat<(int_nvvm_ff2f16x2_rn_satfinite f32:$a, f32:$b), (CVT_f16x2_f32_sf $a, $b, CvtRN)>; + def : Pat<(int_nvvm_ff2f16x2_rn_relu_satfinite f32:$a, f32:$b), (CVT_f16x2_f32_sf $a, $b, CvtRN_RELU)>; + def : Pat<(int_nvvm_ff2f16x2_rz_satfinite f32:$a, f32:$b), (CVT_f16x2_f32_sf $a, $b, CvtRZ)>; + def : 
Pat<(int_nvvm_ff2f16x2_rz_relu_satfinite f32:$a, f32:$b), (CVT_f16x2_f32_sf $a, $b, CvtRZ_RELU)>; +} let Predicates = [hasPTX<87>, hasSM100aOrSM103a] in { def : Pat<(int_nvvm_ff2f16x2_rs f32:$a, f32:$b, i32:$c), @@ -1948,6 +1994,23 @@ def : Pat<(int_nvvm_f2bf16_rn f32:$a), (CVT_bf16_f32 $a, CvtRN)>; def : Pat<(int_nvvm_f2bf16_rn_relu f32:$a), (CVT_bf16_f32 $a, CvtRN_RELU)>; def : Pat<(int_nvvm_f2bf16_rz f32:$a), (CVT_bf16_f32 $a, CvtRZ)>; def : Pat<(int_nvvm_f2bf16_rz_relu f32:$a), (CVT_bf16_f32 $a, CvtRZ_RELU)>; +let Predicates = [hasPTX<81>, hasSM<80>] in { + def : Pat<(int_nvvm_f2bf16_rz_satfinite f32:$a), (CVT_bf16_f32_sf $a, CvtRZ)>; + def : Pat<(int_nvvm_f2bf16_rz_relu_satfinite f32:$a), (CVT_bf16_f32_sf $a, CvtRZ_RELU)>; + def : Pat<(int_nvvm_f2bf16_rn_satfinite f32:$a), (CVT_bf16_f32_sf $a, CvtRN)>; + def : Pat<(int_nvvm_f2bf16_rn_relu_satfinite f32:$a), (CVT_bf16_f32_sf $a, CvtRN_RELU)>; +} + +def : Pat<(int_nvvm_f2f16_rn f32:$a), (CVT_f16_f32 $a, CvtRN)>; +def : Pat<(int_nvvm_f2f16_rn_relu f32:$a), (CVT_f16_f32 $a, CvtRN_RELU)>; +def : Pat<(int_nvvm_f2f16_rz f32:$a), (CVT_f16_f32 $a, CvtRZ)>; +def : Pat<(int_nvvm_f2f16_rz_relu f32:$a), (CVT_f16_f32 $a, CvtRZ_RELU)>; +let Predicates = [hasPTX<81>, hasSM<80>] in { + def : Pat<(int_nvvm_f2f16_rz_satfinite f32:$a), (CVT_f16_f32_sf $a, CvtRZ)>; + def : Pat<(int_nvvm_f2f16_rz_relu_satfinite f32:$a), (CVT_f16_f32_sf $a, CvtRZ_RELU)>; + def : Pat<(int_nvvm_f2f16_rn_satfinite f32:$a), (CVT_f16_f32_sf $a, CvtRN)>; + def : Pat<(int_nvvm_f2f16_rn_relu_satfinite f32:$a), (CVT_f16_f32_sf $a, CvtRN_RELU)>; +} def : Pat<(int_nvvm_lohi_i2d i32:$a, i32:$b), (V2I32toI64 $a, $b)>; def : Pat<(int_nvvm_d2i_lo f64:$a), (I64toI32L $a)>; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 220010c4d3d34..342532c2e52d2 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -210,8 +210,11 @@ 
PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // setbc instruction. if (!Subtarget.hasP10Vector()) { setOperationAction(ISD::SSUBO, MVT::i32, Custom); - if (isPPC64) + setOperationAction(ISD::SADDO, MVT::i32, Custom); + if (isPPC64) { setOperationAction(ISD::SSUBO, MVT::i64, Custom); + setOperationAction(ISD::SADDO, MVT::i64, Custom); + } } // Match BITREVERSE to customized fast code sequence in the td file. @@ -12514,6 +12517,37 @@ SDValue PPCTargetLowering::LowerSSUBO(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues({Sub, OverflowTrunc}, dl); } +/// Implements signed add with overflow detection using the rule: +/// (x eqv y) & (sum xor x), where the overflow bit is extracted from the sign +SDValue PPCTargetLowering::LowerSADDO(SDValue Op, SelectionDAG &DAG) const { + + SDLoc dl(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + EVT VT = Op.getNode()->getValueType(0); + + SDValue Sum = DAG.getNode(ISD::ADD, dl, VT, LHS, RHS); + + // Compute ~(x xor y) + SDValue XorXY = DAG.getNode(ISD::XOR, dl, VT, LHS, RHS); + SDValue EqvXY = DAG.getNOT(dl, XorXY, VT); + // Compute (s xor x) + SDValue SumXorX = DAG.getNode(ISD::XOR, dl, VT, Sum, LHS); + + // overflow = (x eqv y) & (s xor x) + SDValue OverflowInSign = DAG.getNode(ISD::AND, dl, VT, EqvXY, SumXorX); + + // Shift sign bit down to LSB + SDValue Overflow = + DAG.getNode(ISD::SRL, dl, VT, OverflowInSign, + DAG.getConstant(VT.getSizeInBits() - 1, dl, MVT::i32)); + // Truncate to the overflow type (i1) + SDValue OverflowTrunc = + DAG.getNode(ISD::TRUNCATE, dl, Op.getNode()->getValueType(1), Overflow); + + return DAG.getMergeValues({Sum, OverflowTrunc}, dl); +} + // Lower unsigned 3-way compare producing -1/0/1. 
SDValue PPCTargetLowering::LowerUCMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); @@ -12565,6 +12599,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::SSUBO: return LowerSSUBO(Op, DAG); + case ISD::SADDO: + return LowerSADDO(Op, DAG); case ISD::INLINEASM: case ISD::INLINEASM_BR: return LowerINLINEASM(Op, DAG); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 680b529b4e2e5..74af055ed5d30 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -705,6 +705,7 @@ namespace llvm { SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSSUBO(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSADDO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index 51e8e8574ed15..938c62e4f9f94 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -26,8 +26,16 @@ class RISCVRegisterBankInfo; class RISCVSubtarget; class RISCVTargetMachine; -FunctionPass *createRISCVCodeGenPreparePass(); -void initializeRISCVCodeGenPreparePass(PassRegistry &); +class RISCVCodeGenPreparePass : public PassInfoMixin { +private: + const RISCVTargetMachine *TM; + +public: + RISCVCodeGenPreparePass(const RISCVTargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); +}; +FunctionPass *createRISCVCodeGenPrepareLegacyPass(); +void initializeRISCVCodeGenPrepareLegacyPassPass(PassRegistry &); FunctionPass *createRISCVDeadRegisterDefinitionsPass(); void 
initializeRISCVDeadRegisterDefinitionsPass(PassRegistry &); diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp index ce349598bd9b1..ab450f9c4a61d 100644 --- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp +++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp @@ -33,20 +33,33 @@ using namespace llvm; #define PASS_NAME "RISC-V CodeGenPrepare" namespace { - -class RISCVCodeGenPrepare : public FunctionPass, - public InstVisitor { +class RISCVCodeGenPrepare : public InstVisitor { + Function &F; const DataLayout *DL; const DominatorTree *DT; const RISCVSubtarget *ST; +public: + RISCVCodeGenPrepare(Function &F, const DominatorTree *DT, + const RISCVSubtarget *ST) + : F(F), DL(&F.getDataLayout()), DT(DT), ST(ST) {} + bool run(); + bool visitInstruction(Instruction &I) { return false; } + bool visitAnd(BinaryOperator &BO); + bool visitIntrinsicInst(IntrinsicInst &I); + bool expandVPStrideLoad(IntrinsicInst &I); + bool widenVPMerge(IntrinsicInst &I); +}; +} // namespace + +namespace { +class RISCVCodeGenPrepareLegacyPass : public FunctionPass { public: static char ID; - RISCVCodeGenPrepare() : FunctionPass(ID) {} + RISCVCodeGenPrepareLegacyPass() : FunctionPass(ID) {} bool runOnFunction(Function &F) override; - StringRef getPassName() const override { return PASS_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -54,15 +67,8 @@ class RISCVCodeGenPrepare : public FunctionPass, AU.addRequired(); AU.addRequired(); } - - bool visitInstruction(Instruction &I) { return false; } - bool visitAnd(BinaryOperator &BO); - bool visitIntrinsicInst(IntrinsicInst &I); - bool expandVPStrideLoad(IntrinsicInst &I); - bool widenVPMerge(IntrinsicInst &I); }; - -} // end anonymous namespace +} // namespace // Try to optimize (i64 (and (zext/sext (i32 X), C1))) if C1 has bit 31 set, // but bits 63:32 are zero. 
If we know that bit 31 of X is 0, we can fill @@ -273,17 +279,7 @@ bool RISCVCodeGenPrepare::expandVPStrideLoad(IntrinsicInst &II) { return true; } -bool RISCVCodeGenPrepare::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - auto &TPC = getAnalysis(); - auto &TM = TPC.getTM(); - ST = &TM.getSubtarget(F); - - DL = &F.getDataLayout(); - DT = &getAnalysis().getDomTree(); - +bool RISCVCodeGenPrepare::run() { bool MadeChange = false; for (auto &BB : F) for (Instruction &I : llvm::make_early_inc_range(BB)) @@ -292,12 +288,40 @@ bool RISCVCodeGenPrepare::runOnFunction(Function &F) { return MadeChange; } -INITIALIZE_PASS_BEGIN(RISCVCodeGenPrepare, DEBUG_TYPE, PASS_NAME, false, false) +bool RISCVCodeGenPrepareLegacyPass::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + auto &TPC = getAnalysis(); + auto &TM = TPC.getTM(); + auto ST = &TM.getSubtarget(F); + auto DT = &getAnalysis().getDomTree(); + + RISCVCodeGenPrepare RVCGP(F, DT, ST); + return RVCGP.run(); +} + +INITIALIZE_PASS_BEGIN(RISCVCodeGenPrepareLegacyPass, DEBUG_TYPE, PASS_NAME, + false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_END(RISCVCodeGenPrepare, DEBUG_TYPE, PASS_NAME, false, false) +INITIALIZE_PASS_END(RISCVCodeGenPrepareLegacyPass, DEBUG_TYPE, PASS_NAME, false, + false) -char RISCVCodeGenPrepare::ID = 0; +char RISCVCodeGenPrepareLegacyPass::ID = 0; + +FunctionPass *llvm::createRISCVCodeGenPrepareLegacyPass() { + return new RISCVCodeGenPrepareLegacyPass(); +} -FunctionPass *llvm::createRISCVCodeGenPreparePass() { - return new RISCVCodeGenPrepare(); +PreservedAnalyses RISCVCodeGenPreparePass::run(Function &F, + FunctionAnalysisManager &FAM) { + DominatorTree *DT = &FAM.getResult(F); + auto ST = &TM->getSubtarget(F); + bool Changed = RISCVCodeGenPrepare(F, DT, ST).run(); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserveSet(); + return PA; } diff --git 
a/llvm/lib/Target/RISCV/RISCVPassRegistry.def b/llvm/lib/Target/RISCV/RISCVPassRegistry.def new file mode 100644 index 0000000000000..29ccf2cff1ca5 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVPassRegistry.def @@ -0,0 +1,20 @@ +//===- RISCVPassRegistry.def - Registry of RISC-V passes --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is used as the registry of passes that are part of the RISC-V +// backend. +// +//===----------------------------------------------------------------------===// + +// NOTE: NO INCLUDE GUARD DESIRED! + +#ifndef FUNCTION_PASS +#define FUNCTION_PASS(NAME, CREATE_PASS) +#endif +FUNCTION_PASS("riscv-codegenprepare", RISCVCodeGenPreparePass(this)) +#undef FUNCTION_PASS diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index 926cc9ea547a6..f86265a21d17e 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -88,6 +88,8 @@ RISCVSubtarget::initializeSubtargetDependencies(const Triple &TT, StringRef CPU, if (TuneCPU.empty()) TuneCPU = CPU; + if (TuneCPU == "generic") + TuneCPU = Is64Bit ? "generic-rv64" : "generic-rv32"; TuneInfo = RISCVTuneInfoTable::getRISCVTuneInfo(TuneCPU); // If there is no TuneInfo for this CPU, we fail back to generic. 
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 911bd7ee2876f..8f6c0af5de3b4 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -123,7 +123,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { initializeRISCVLateBranchOptPass(*PR); initializeRISCVMakeCompressibleOptPass(*PR); initializeRISCVGatherScatterLoweringPass(*PR); - initializeRISCVCodeGenPreparePass(*PR); + initializeRISCVCodeGenPrepareLegacyPassPass(*PR); initializeRISCVPostRAExpandPseudoPass(*PR); initializeRISCVMergeBaseOffsetOptPass(*PR); initializeRISCVOptWInstrsPass(*PR); @@ -461,7 +461,7 @@ void RISCVPassConfig::addIRPasses() { addPass(createRISCVGatherScatterLoweringPass()); addPass(createInterleavedAccessPass()); - addPass(createRISCVCodeGenPreparePass()); + addPass(createRISCVCodeGenPrepareLegacyPass()); } TargetPassConfig::addIRPasses(); @@ -636,6 +636,9 @@ bool RISCVPassConfig::addILPOpts() { } void RISCVTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { +#define GET_PASS_REGISTRY "RISCVPassRegistry.def" +#include "llvm/Passes/TargetPassRegistry.inc" + PB.registerLateLoopOptimizationsEPCallback([=](LoopPassManager &LPM, OptimizationLevel Level) { if (Level != OptimizationLevel::O0) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index dca6e9cffebb0..1a1a93a9cb178 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1008,13 +1008,17 @@ InstructionCost RISCVTTIImpl::getScalarizationOverhead( } InstructionCost -RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, +RISCVTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { + unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? 
Instruction::Load + : Instruction::Store; + Type *Src = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + unsigned AddressSpace = MICA.getAddressSpace(); + if (!isLegalMaskedLoadStore(Src, Alignment) || CostKind != TTI::TCK_RecipThroughput) - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); + return BaseT::getMaskedMemoryOpCost(MICA, CostKind); return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); } diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 6886e8964e29e..39c1173e2986c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -144,8 +144,7 @@ class RISCVTTIImpl final : public BasicTTIImplBase { bool shouldConsiderVectorizationRegPressure() const override { return true; } InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override; InstructionCost diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1251a3ca8dbaa..aa9ba6b0e197c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7281,7 +7281,10 @@ static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) { static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, - bool IsAfterLegalize) { + bool IsAfterLegalize, + unsigned Depth = 0) { + if (Depth >= SelectionDAG::MaxRecursionDepth) + return SDValue(); // Limit search depth. 
if ((VT.getScalarSizeInBits() % 8) != 0) return SDValue(); @@ -7455,7 +7458,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems); SDValue HalfLD = EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL, - DAG, Subtarget, IsAfterLegalize); + DAG, Subtarget, IsAfterLegalize, Depth + 1); if (HalfLD) return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), HalfLD, DAG.getVectorIdxConstant(0, DL)); @@ -7532,7 +7535,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, VT.getSizeInBits() / ScalarSize); if (TLI.isTypeLegal(BroadcastVT)) { if (SDValue RepeatLoad = EltsFromConsecutiveLoads( - RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) { + RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize, + Depth + 1)) { SDValue Broadcast = RepeatLoad; if (RepeatSize > ScalarSize) { while (Broadcast.getValueSizeInBits() < VT.getSizeInBits()) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 0b1430e373fc7..4b77bf925b2ba 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -5411,9 +5411,14 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, } InstructionCost -X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, - unsigned AddressSpace, +X86TTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { + unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? 
Instruction::Load + : Instruction::Store; + Type *SrcTy = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + unsigned AddressSpace = MICA.getAddressSpace(); + bool IsLoad = (Instruction::Load == Opcode); bool IsStore = (Instruction::Store == Opcode); @@ -6647,10 +6652,12 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( LegalVT.getVectorNumElements()); InstructionCost MemOpCost; bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps; - if (UseMaskedMemOp) - MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment, - AddressSpace, CostKind); - else + if (UseMaskedMemOp) { + unsigned IID = Opcode == Instruction::Load ? Intrinsic::masked_load + : Intrinsic::masked_store; + MemOpCost = getMaskedMemoryOpCost( + {IID, SingleMemOpTy, Alignment, AddressSpace}, CostKind); + } else MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace, CostKind); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index de5e1c297b1e4..df1393ce16ca1 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -183,8 +183,7 @@ class X86TTIImpl final : public BasicTTIImplBase { TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I = nullptr) const override; InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override; InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 94663ff928a0b..fa35eef2c00f5 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -1469,6 +1469,9 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr( 
Constant::getNullValue(F->getType())); Value *Select = Builder.CreateSelect(ICmp, JT, Constant::getNullValue(F->getType())); + + if (auto *SI = dyn_cast(Select)) + setExplicitlyUnknownBranchWeightsIfProfiled(*SI, DEBUG_TYPE); // For phi nodes, we need to update the incoming value for all operands // with the same predecessor. if (PN) diff --git a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp index 8181e4ef1d74f..cf74354cb438f 100644 --- a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp +++ b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp @@ -67,9 +67,10 @@ cl::opt ClFuncPrefix("alloc-token-prefix", cl::desc("The allocation function prefix"), cl::Hidden, cl::init("__alloc_token_")); -cl::opt ClMaxTokens("alloc-token-max", - cl::desc("Maximum number of tokens (0 = no max)"), - cl::Hidden, cl::init(0)); +cl::opt + ClMaxTokens("alloc-token-max", + cl::desc("Maximum number of tokens (0 = target SIZE_MAX)"), + cl::Hidden, cl::init(0)); cl::opt ClFastABI("alloc-token-fast-abi", diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index c680b6fca84cd..1c53c606b271b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4428,11 +4428,13 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( assert(!isa(TC) && "Trip count SCEV must be computable"); const SCEV *KnownMinTC; bool ScalableTC = match(TC, m_scev_c_Mul(m_SCEV(KnownMinTC), m_SCEVVScale())); + bool ScalableRemIter = false; // Use versions of TC and VF in which both are either scalable or fixed. 
- if (ScalableTC == MainLoopVF.isScalable()) + if (ScalableTC == MainLoopVF.isScalable()) { + ScalableRemIter = ScalableTC; RemainingIterations = SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC)); - else if (ScalableTC) { + } else if (ScalableTC) { const SCEV *EstimatedTC = SE.getMulExpr( KnownMinTC, SE.getConstant(TCType, CM.getVScaleForTuning().value_or(1))); @@ -4456,6 +4458,9 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( << MaxTripCount << "\n"); } + auto SkipVF = [&](const SCEV *VF, const SCEV *RemIter) -> bool { + return SE.isKnownPredicate(CmpInst::ICMP_UGT, VF, RemIter); + }; for (auto &NextVF : ProfitableVFs) { // Skip candidate VFs without a corresponding VPlan. if (!hasPlanWithVF(NextVF.Width)) @@ -4473,11 +4478,17 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( // If NextVF is greater than the number of remaining iterations, the // epilogue loop would be dead. Skip such factors. - if (RemainingIterations && !NextVF.Width.isScalable()) { - if (SE.isKnownPredicate( - CmpInst::ICMP_UGT, - SE.getConstant(TCType, NextVF.Width.getFixedValue()), - RemainingIterations)) + // TODO: We should also consider comparing against a scalable + // RemainingIterations when SCEV is able to evaluate non-canonical + // vscale-based expressions. + if (!ScalableRemIter) { + // Handle the case where NextVF and RemainingIterations are in different + // numerical spaces. 
+ ElementCount EC = NextVF.Width; + if (NextVF.Width.isScalable()) + EC = ElementCount::getFixed( + estimateElementCount(NextVF.Width, CM.getVScaleForTuning())); + if (SkipVF(SE.getElementCount(TCType, EC), RemainingIterations)) continue; } @@ -5251,8 +5262,10 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, const Align Alignment = getLoadStoreAlignment(I); InstructionCost Cost = 0; if (Legal->isMaskRequired(I)) { - Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, - CostKind); + unsigned IID = I->getOpcode() == Instruction::Load + ? Intrinsic::masked_load + : Intrinsic::masked_store; + Cost += TTI.getMaskedMemoryOpCost({IID, VectorTy, Alignment, AS}, CostKind); } else { TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index deb8ee2d88055..e33ff724ccdd5 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6904,9 +6904,10 @@ static bool isMaskedLoadCompress( ScalarLoadsCost; InstructionCost LoadCost = 0; if (IsMasked) { - LoadCost = - TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment, - LI->getPointerAddressSpace(), CostKind); + LoadCost = TTI.getMaskedMemoryOpCost({Intrinsic::masked_load, LoadVecTy, + CommonAlignment, + LI->getPointerAddressSpace()}, + CostKind); } else { LoadCost = TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment, @@ -7305,8 +7306,9 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( break; case LoadsState::CompressVectorize: VecLdCost += TTI.getMaskedMemoryOpCost( - Instruction::Load, SubVecTy, CommonAlignment, - LI0->getPointerAddressSpace(), CostKind) + + {Intrinsic::masked_load, SubVecTy, CommonAlignment, + LI0->getPointerAddressSpace()}, + CostKind) + VectorGEPCost + ::getShuffleCost(TTI, 
TTI::SK_PermuteSingleSrc, SubVecTy, {}, CostKind); @@ -15102,8 +15104,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, CommonAlignment, LI0->getPointerAddressSpace(), CostKind); } else if (IsMasked) { VecLdCost = TTI->getMaskedMemoryOpCost( - Instruction::Load, LoadVecTy, CommonAlignment, - LI0->getPointerAddressSpace(), CostKind); + {Intrinsic::masked_load, LoadVecTy, CommonAlignment, + LI0->getPointerAddressSpace()}, + CostKind); // TODO: include this cost into CommonCost. VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index faa353cc5a6cc..13582f8bd2d62 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2065,14 +2065,12 @@ class LLVM_ABI_FOR_TEST VPHeaderPHIRecipe : public VPSingleDefRecipe, ~VPHeaderPHIRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *B) { - return B->getVPDefID() >= VPDef::VPFirstHeaderPHISC && - B->getVPDefID() <= VPDef::VPLastHeaderPHISC; + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() >= VPDef::VPFirstHeaderPHISC && + R->getVPDefID() <= VPDef::VPLastHeaderPHISC; } static inline bool classof(const VPValue *V) { - auto *B = V->getDefiningRecipe(); - return B && B->getVPDefID() >= VPRecipeBase::VPFirstHeaderPHISC && - B->getVPDefID() <= VPRecipeBase::VPLastHeaderPHISC; + return isa(V->getDefiningRecipe()); } /// Generate the phi nodes. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 94657f5d39390..7c9302860a3b5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -493,6 +493,10 @@ void VPSingleDefRecipe::dump() const { VPDef::dump(); } void VPRecipeBase::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { printRecipe(O, Indent, SlotTracker); + if (auto DL = getDebugLoc()) { + O << ", !dbg "; + DL.print(O); + } } #endif @@ -1481,11 +1485,6 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent, printFlags(O); printOperands(O, SlotTracker); - - if (auto DL = getDebugLoc()) { - O << ", !dbg "; - DL.print(O); - } } #endif @@ -3592,8 +3591,10 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, InstructionCost Cost = 0; if (IsMasked) { + unsigned IID = isa(this) ? Intrinsic::masked_load + : Intrinsic::masked_store; Cost += - Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind); + Ctx.TTI.getMaskedMemoryOpCost({IID, Ty, Alignment, AS}, Ctx.CostKind); } else { TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo( isa(this) ? getOperand(0) @@ -3711,8 +3712,10 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); unsigned AS = cast(Ctx.Types.inferScalarType(getAddr())) ->getAddressSpace(); + // FIXME: getMaskedMemoryOpCost assumes masked_* intrinsics. + // After migrating to getMemIntrinsicInstrCost, switch this to vp_load. 
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( - Instruction::Load, Ty, Alignment, AS, Ctx.CostKind); + {Intrinsic::masked_load, Ty, Alignment, AS}, Ctx.CostKind); if (!Reverse) return Cost; @@ -3820,8 +3823,10 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); unsigned AS = cast(Ctx.Types.inferScalarType(getAddr())) ->getAddressSpace(); + // FIXME: getMaskedMemoryOpCost assumes masked_* intrinsics. + // After migrating to getMemIntrinsicInstrCost, switch this to vp_store. InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( - Instruction::Store, Ty, Alignment, AS, Ctx.CostKind); + {Intrinsic::masked_store, Ty, Alignment, AS}, Ctx.CostKind); if (!Reverse) return Cost; diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge-undef.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge-undef.mir new file mode 100644 index 0000000000000..fe1986e44051b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge-undef.mir @@ -0,0 +1,72 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -o - -mtriple=aarch64 -run-pass=aarch64-postlegalizer-combiner -verify-machineinstrs %s | FileCheck %s +--- +name: non-vector-src +legalized: true +body: | + bb.0: + liveins: $w0 + + ; CHECK-LABEL: name: non-vector-src + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[COPY]](s32) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[UV]](s8), [[UV1]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8) + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<8 x s8>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<8 x s8>) = G_SHUFFLE_VECTOR 
[[BUILD_VECTOR]](<8 x s8>), [[DEF1]], shufflemask(1, 0, 1, 0, undef, undef, undef, undef) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[SHUF]](<8 x s8>) + ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<4 x s16>), [[UV5:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT]](<8 x s16>) + ; CHECK-NEXT: $d0 = COPY [[UV4]](<4 x s16>) + ; CHECK-NEXT: RET_ReallyLR implicit $d0 + %1:_(s32) = COPY $w0 + %30:_(s8), %31:_(s8), %32:_(s8), %33:_(s8) = G_UNMERGE_VALUES %1(s32) + %14:_(s8) = G_IMPLICIT_DEF + %15:_(<8 x s8>) = G_BUILD_VECTOR %30(s8), %31(s8), %14(s8), %14(s8), %14(s8), %14(s8), %14(s8), %14(s8) + %20:_(<8 x s8>) = G_BUILD_VECTOR %14(s8), %14(s8), %14(s8), %14(s8), %14(s8), %14(s8), %14(s8), %14(s8) + %21:_(<8 x s8>) = G_SHUFFLE_VECTOR %15(<8 x s8>), %20, shufflemask(1, 0, 1, 0, undef, undef, undef, undef) + %41:_(<8 x s16>) = G_ANYEXT %21(<8 x s8>) + %50:_(<4 x s16>), %51:_(<4 x s16>) = G_UNMERGE_VALUES %41(<8 x s16>) + $d0 = COPY %50(<4 x s16>) + RET_ReallyLR implicit $d0 +... +--- +name: v2-src +legalized: true +body: | + bb.0: + liveins: $q0 + + ; CHECK-LABEL: name: v2-src + ; CHECK: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $x0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[COPY]](<2 x s32>), [[DEF]](<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit [[CONCAT_VECTORS]](<4 x s32>) + %1:_(<2 x s32>) = COPY $x0 + %30:_(s32), %31:_(s32) = G_UNMERGE_VALUES %1(<2 x s32>) + %14:_(s32) = G_IMPLICIT_DEF + %15:_(<4 x s32>) = G_BUILD_VECTOR %30(s32), %31(s32), %14(s32), %14(s32) + RET_ReallyLR implicit %15 +... 
+--- +name: v4-src +legalized: true +body: | + bb.0: + liveins: $q0 + + ; CHECK-LABEL: name: v4-src + ; CHECK: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $x0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<4 x s16>), [[DEF]](<4 x s16>) + ; CHECK-NEXT: RET_ReallyLR implicit [[CONCAT_VECTORS]](<8 x s16>) + %1:_(<4 x s16>) = COPY $x0 + %30:_(s16), %31:_(s16), %32:_(s16), %33:_(s16) = G_UNMERGE_VALUES %1(<4 x s16>) + %14:_(s16) = G_IMPLICIT_DEF + %15:_(<8 x s16>) = G_BUILD_VECTOR %30(s16), %31(s16), %32(s16), %33(s16), %14(s16), %14(s16), %14(s16), %14(s16) + RET_ReallyLR implicit %15 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr-new-regbank-select.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr-new-regbank-select.ll new file mode 100644 index 0000000000000..4255d03c158e1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr-new-regbank-select.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck %s + +define amdgpu_ps void @uniform_fshr_i32(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt, ptr %resptr) { +; CHECK-LABEL: uniform_fshr_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_alignbit_b32 v2, s0, s1, v2 +; CHECK-NEXT: v_readfirstlane_b32 s0, v2 +; CHECK-NEXT: s_add_co_i32 s0, s0, s0 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: flat_store_b32 v[0:1], v2 +; CHECK-NEXT: s_endpgm + %vres = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt) + %add = add i32 %vres, %vres + store i32 %add, ptr %resptr + ret void +} + +declare i32 @llvm.amdgcn.readfirstlane.i32(i32) + +define amdgpu_ps void 
@divergent_fshr_i32(i32 %lhs, i32 %rhs, i32 %amt, ptr %resptr) { +; CHECK-LABEL: divergent_fshr_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_alignbit_b32 v0, v0, v1, v2 +; CHECK-NEXT: flat_store_b32 v[3:4], v0 +; CHECK-NEXT: s_endpgm + %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt) + store i32 %result, ptr %resptr + ret void +} + +declare i32 @llvm.fshr.i32(i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll index 9c49aade6099f..43d8271de2a43 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll @@ -24,96 +24,82 @@ ; gep[inbounds](p, i + 3) -> gep(gep(p, i), 3) -; FIXME the offset here should not be folded: if %p points to the beginning of +; The offset here cannot be folded: if %p points to the beginning of scratch or ; scratch or LDS and %i is -1, a folded offset crashes the program. define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { -; GFX90A-SDAG-LABEL: flat_offset_maybe_oob: -; GFX90A-SDAG: ; %bb.0: -; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX90A-SDAG-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX90A-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX90A-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX90A-SDAG-NEXT: flat_load_dword v0, v[0:1] offset:12 -; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: flat_offset_maybe_oob: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 12, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: flat_offset_maybe_oob: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX10-SDAG-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX10-SDAG-NEXT: flat_load_dword v0, v[0:1] offset:12 -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: flat_offset_maybe_oob: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX10-NEXT: flat_load_dword v0, v[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-SDAG-LABEL: flat_offset_maybe_oob: ; GFX942-SDAG: ; %bb.0: ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] -; GFX942-SDAG-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 12 +; GFX942-SDAG-NEXT: flat_load_dword v0, v[0:1] ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: flat_offset_maybe_oob: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX11-SDAG-NEXT: flat_load_b32 v0, v[0:1] offset:12 -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-SDAG-LABEL: flat_offset_maybe_oob: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] -; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX12-SDAG-NEXT: flat_load_b32 v0, v[0:1] offset:12 -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-GISEL-LABEL: flat_offset_maybe_oob: -; GFX90A-GISEL: ; %bb.0: -; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX90A-GISEL-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 12, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: flat_offset_maybe_oob: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: flat_load_b32 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: flat_offset_maybe_oob: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX10-GISEL-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX10-GISEL-NEXT: flat_load_dword v0, v[0:1] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX12-LABEL: flat_offset_maybe_oob: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-NEXT: flat_load_b32 v0, v[0:1] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-GISEL-LABEL: 
flat_offset_maybe_oob: ; GFX942-GISEL: ; %bb.0: @@ -126,44 +112,6 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { ; GFX942-GISEL-NEXT: flat_load_dword v0, v[0:1] ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: flat_offset_maybe_oob: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-GISEL-NEXT: flat_load_b32 v0, v[0:1] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: flat_offset_maybe_oob: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] -; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffd -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffd -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; 
GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-GISEL-NEXT: flat_load_b32 v0, v[0:1] -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %idx = add nsw i32 %i, 3 %arrayidx = getelementptr inbounds i32, ptr %p, i32 %idx %l = load i32, ptr %arrayidx @@ -273,13 +221,735 @@ define i32 @private_offset_maybe_oob(ptr addrspace(5) %p, i32 %i) { %l = load i32, ptr addrspace(5) %arrayidx ret i32 %l } + +; If the GEP that adds the offset is inbounds, folding the offset is legal. +define i32 @flat_offset_inbounds(ptr %p, i32 %i) { +; GFX90A-LABEL: flat_offset_inbounds: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_offset_inbounds: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX10-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: flat_offset_inbounds: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] +; GFX942-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_offset_inbounds: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:12 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_offset_inbounds: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX12-NEXT: flat_load_b32 v0, v[0:1] offset:12 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %p.1 = getelementptr inbounds i32, ptr %p, i32 %i + %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3 + %l = load i32, ptr %arrayidx + ret i32 %l +} + +define void @flat_offset_inbounds_wide(ptr %p, ptr %pout, i32 %i) { +; GFX90A-SDAG-LABEL: flat_offset_inbounds_wide: +; GFX90A-SDAG: ; %bb.0: +; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX90A-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX90A-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX90A-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-SDAG-NEXT: v_add_co_u32_e32 v4, vcc, 28, v0 +; GFX90A-SDAG-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-SDAG-NEXT: flat_load_dword v10, v[4:5] +; GFX90A-SDAG-NEXT: flat_load_dwordx4 v[6:9], v[0:1] offset:12 +; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
+; GFX90A-SDAG-NEXT: flat_store_dword v[2:3], v10 offset:16 +; GFX90A-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[6:9] +; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: flat_offset_inbounds_wide: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX10-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX10-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 28 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: flat_load_dword v8, v[4:5] +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX10-SDAG-NEXT: flat_store_dword v[2:3], v8 offset:16 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-SDAG-LABEL: flat_offset_inbounds_wide: +; GFX942-SDAG: ; %bb.0: +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 28 +; GFX942-SDAG-NEXT: flat_load_dword v10, v[4:5] +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[6:9], v[0:1] offset:12 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: flat_store_dword v[2:3], v10 offset:16 +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[6:9] +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: flat_offset_inbounds_wide: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX11-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX11-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 28 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: flat_load_b32 v8, v[4:5] +; GFX11-SDAG-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX11-SDAG-NEXT: flat_store_b32 v[2:3], v8 offset:16 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: flat_offset_inbounds_wide: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX12-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 28 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: flat_load_b32 v8, v[4:5] +; GFX12-SDAG-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; 
GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x101 +; GFX12-SDAG-NEXT: flat_store_b32 v[2:3], v8 offset:16 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x1 +; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-GISEL-LABEL: flat_offset_inbounds_wide: +; GFX90A-GISEL: ; %bb.0: +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX90A-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX90A-GISEL-NEXT: flat_load_dword v8, v[0:1] offset:28 +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX90A-GISEL-NEXT: flat_store_dword v[2:3], v8 offset:16 +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: flat_offset_inbounds_wide: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX10-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX10-GISEL-NEXT: flat_load_dword v0, v[0:1] offset:28 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; GFX10-GISEL-NEXT: flat_store_dword v[2:3], v0 offset:16 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: flat_offset_inbounds_wide: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX942-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX942-GISEL-NEXT: flat_load_dword v8, v[0:1] offset:28 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX942-GISEL-NEXT: flat_store_dword v[2:3], v8 offset:16 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: flat_offset_inbounds_wide: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX11-GISEL-NEXT: flat_load_b32 v0, v[0:1] offset:28 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; GFX11-GISEL-NEXT: flat_store_b32 v[2:3], v0 offset:16 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: flat_offset_inbounds_wide: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; 
GFX12-GISEL-NEXT: s_wait_alu 0xfffd +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX12-GISEL-NEXT: flat_load_b32 v0, v[0:1] offset:28 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x101 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x1 +; GFX12-GISEL-NEXT: flat_store_b32 v[2:3], v0 offset:16 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %p.1 = getelementptr inbounds i32, ptr %p, i32 %i + %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3 + %l = load <5 x i32>, ptr %arrayidx + store <5 x i32> %l, ptr %pout + ret void +} + +define void @flat_offset_inbounds_very_wide(ptr %p, ptr %pout, i32 %i) { +; GFX90A-SDAG-MUBUF-LABEL: flat_offset_inbounds_very_wide: +; GFX90A-SDAG-MUBUF: ; %bb.0: +; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-SDAG-MUBUF-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX90A-SDAG-MUBUF-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e64 v6, s[4:5], 28, v0 +; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e64 v7, s[4:5], 0, v1, s[4:5] +; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e64 v8, s[4:5], 44, v0 +; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, 0x8c, v0 +; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v1, s[4:5] +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[10:13], v[8:9] offset:16 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[14:17], v[8:9] offset:32 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[18:21], v[8:9] offset:48 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[22:25], v[8:9] offset:64 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[26:29], v[8:9] offset:80 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 
v[30:33], v[6:7] +; GFX90A-SDAG-MUBUF-NEXT: s_nop 0 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[6:9], v[8:9] +; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[34:37], v[0:1] offset:12 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[48:51], v[4:5] +; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, 48, v2 +; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, 0x88, v2 +; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc +; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:48 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:64 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[14:17] offset:64 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:32 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[6:9] offset:32 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[10:13] +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[30:33] offset:16 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[34:37] +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dword v[4:5], v50 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx2 v[2:3], v[48:49] offset:128 +; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-SDAG-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-SDAG-FLATSCR-LABEL: flat_offset_inbounds_very_wide: +; GFX90A-SDAG-FLATSCR: ; %bb.0: +; GFX90A-SDAG-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-SDAG-FLATSCR-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX90A-SDAG-FLATSCR-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e64 v6, s[0:1], 28, v0 +; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] +; GFX90A-SDAG-FLATSCR-NEXT: 
v_add_co_u32_e64 v8, s[0:1], 44, v0 +; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, 0x8c, v0 +; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1] +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[10:13], v[8:9] offset:16 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[14:17], v[8:9] offset:32 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[18:21], v[8:9] offset:48 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[22:25], v[8:9] offset:64 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[26:29], v[8:9] offset:80 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[30:33], v[6:7] +; GFX90A-SDAG-FLATSCR-NEXT: s_nop 0 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[6:9], v[8:9] +; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[34:37], v[0:1] offset:12 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[48:51], v[4:5] +; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 48, v2 +; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, 0x88, v2 +; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc +; GFX90A-SDAG-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:48 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:64 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[14:17] offset:64 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:32 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[6:9] offset:32 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[10:13] +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[30:33] offset:16 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[34:37] +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dword v[4:5], v50 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx2 v[2:3], v[48:49] offset:128 +; GFX90A-SDAG-FLATSCR-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX90A-SDAG-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: flat_offset_inbounds_very_wide: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX10-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX10-SDAG-NEXT: v_add_co_u32 v36, vcc_lo, v0, 28 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v37, null, 0, v1, vcc_lo +; GFX10-SDAG-NEXT: s_clause 0x8 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[4:7], v[36:37] offset:80 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[8:11], v[36:37] offset:96 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[36:37] offset:48 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[16:19], v[36:37] offset:64 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[20:23], v[36:37] offset:16 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[24:27], v[36:37] offset:32 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:12 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[32:35], v[36:37] +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[36:39], v[36:37] offset:112 +; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v2, 48 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX10-SDAG-NEXT: v_add_co_u32 v48, vcc_lo, 0x88, v2 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v49, null, 0, v3, vcc_lo +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:48 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:64 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:64 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:32 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:32 +; GFX10-SDAG-NEXT: 
s_waitcnt vmcnt(3) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[24:27] +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[28:31] +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:16 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dword v[48:49], v38 +; GFX10-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[36:37] offset:128 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-SDAG-LABEL: flat_offset_inbounds_very_wide: +; GFX942-SDAG: ; %bb.0: +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] +; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x5c +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[10:11], v[0:1], 0, s[2:3] +; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x4c +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[12:13], v[0:1], 0, s[2:3] +; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x7c +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[14:15], v[0:1], 0, s[2:3] +; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x6c +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[16:17], v[0:1], 0, s[2:3] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 28 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, 60 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[8:9], v[0:1], 0, 44 +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[18:21], v[16:17] +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[22:25], v[12:13] +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[26:29], v[14:15] +; GFX942-SDAG-NEXT: ; kill: killed $vgpr12_vgpr13 +; GFX942-SDAG-NEXT: ; kill: killed $vgpr14_vgpr15 +; GFX942-SDAG-NEXT: ; kill: killed $vgpr16_vgpr17 +; GFX942-SDAG-NEXT: s_nop 0 +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[8:9] +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[30:33], v[10:11] +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[34:37], v[4:5] +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[48:51], 
v[6:7] +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[52:55], v[0:1] offset:12 +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0x8c +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-SDAG-NEXT: flat_load_dwordx4 a[0:3], v[0:1] +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0x60 +; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x70 +; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0x50 +; GFX942-SDAG-NEXT: s_mov_b64 s[6:7], 0x88 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, s[0:1] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, s[2:3] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[6:7], v[2:3], 0, s[4:5] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[8:9], v[2:3], 0, 48 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[10:11], v[2:3], 0, s[6:7] +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[18:21] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[26:29] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[22:25] offset:64 +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[6:7], v[30:33] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32 +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[8:9], v[48:51] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[52:55] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[34:37] offset:16 +; GFX942-SDAG-NEXT: flat_store_dword v[10:11], a2 +; GFX942-SDAG-NEXT: flat_store_dwordx2 v[2:3], a[0:1] offset:128 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: flat_offset_inbounds_very_wide: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, 
vcc_lo +; GFX11-SDAG-NEXT: v_add_co_u32 v36, vcc_lo, v0, 28 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v37, null, 0, v1, vcc_lo +; GFX11-SDAG-NEXT: s_clause 0x7 +; GFX11-SDAG-NEXT: flat_load_b128 v[4:7], v[36:37] offset:80 +; GFX11-SDAG-NEXT: flat_load_b128 v[8:11], v[36:37] offset:96 +; GFX11-SDAG-NEXT: flat_load_b128 v[12:15], v[36:37] offset:64 +; GFX11-SDAG-NEXT: flat_load_b128 v[16:19], v[36:37] offset:32 +; GFX11-SDAG-NEXT: flat_load_b128 v[20:23], v[36:37] offset:16 +; GFX11-SDAG-NEXT: flat_load_b128 v[24:27], v[36:37] +; GFX11-SDAG-NEXT: flat_load_b128 v[28:31], v[0:1] offset:12 +; GFX11-SDAG-NEXT: flat_load_b128 v[32:35], v[36:37] offset:112 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: flat_load_b128 v[35:38], v[36:37] offset:48 +; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v2, 48 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX11-SDAG-NEXT: v_add_co_u32 v48, vcc_lo, 0x88, v2 +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v49, null, 0, v3, vcc_lo +; GFX11-SDAG-NEXT: s_clause 0x7 +; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[4:7] offset:48 +; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[8:11] offset:64 +; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[12:15] offset:32 +; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[16:19] +; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[20:23] offset:32 +; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[24:27] offset:16 +; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[28:31] +; GFX11-SDAG-NEXT: flat_store_b32 v[48:49], v34 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[35:38] offset:64 +; GFX11-SDAG-NEXT: flat_store_b64 v[2:3], v[32:33] offset:128 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: flat_offset_inbounds_very_wide: 
+; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX12-SDAG-NEXT: v_add_co_u32 v36, vcc_lo, v0, 28 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v37, null, 0, v1, vcc_lo +; GFX12-SDAG-NEXT: s_clause 0x7 +; GFX12-SDAG-NEXT: flat_load_b128 v[4:7], v[36:37] offset:80 +; GFX12-SDAG-NEXT: flat_load_b128 v[8:11], v[36:37] offset:96 +; GFX12-SDAG-NEXT: flat_load_b128 v[12:15], v[36:37] offset:64 +; GFX12-SDAG-NEXT: flat_load_b128 v[16:19], v[36:37] offset:32 +; GFX12-SDAG-NEXT: flat_load_b128 v[20:23], v[36:37] offset:16 +; GFX12-SDAG-NEXT: flat_load_b128 v[24:27], v[36:37] +; GFX12-SDAG-NEXT: flat_load_b128 v[28:31], v[0:1] offset:12 +; GFX12-SDAG-NEXT: flat_load_b128 v[32:35], v[36:37] offset:112 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: flat_load_b128 v[35:38], v[36:37] offset:48 +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v2, 48 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX12-SDAG-NEXT: v_add_co_u32 v48, vcc_lo, 0x88, v2 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v49, null, 0, v3, vcc_lo +; GFX12-SDAG-NEXT: s_clause 0x7 +; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[4:7] offset:48 +; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[8:11] offset:64 +; 
GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[12:15] offset:32 +; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[16:19] +; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[20:23] offset:32 +; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[24:27] offset:16 +; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[28:31] +; GFX12-SDAG-NEXT: flat_store_b32 v[48:49], v34 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x8 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[35:38] offset:64 +; GFX12-SDAG-NEXT: flat_store_b64 v[2:3], v[32:33] offset:128 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-GISEL-LABEL: flat_offset_inbounds_very_wide: +; GFX90A-GISEL: ; %bb.0: +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX90A-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:28 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:44 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:60 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:76 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:92 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:108 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:124 +; GFX90A-GISEL-NEXT: flat_load_dwordx3 v[36:38], v[0:1] offset:140 +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16 +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32 +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:48 +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64 +; GFX90A-GISEL-NEXT: 
flat_store_dwordx4 v[2:3], v[24:27] offset:80 +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[28:31] offset:96 +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:112 +; GFX90A-GISEL-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128 +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: flat_offset_inbounds_very_wide: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX10-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX10-GISEL-NEXT: s_clause 0x8 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:28 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:44 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:60 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:76 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:92 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:108 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:124 +; GFX10-GISEL-NEXT: flat_load_dwordx3 v[36:38], v[0:1] offset:140 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:48 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8) +; GFX10-GISEL-NEXT: 
flat_store_dwordx4 v[2:3], v[24:27] offset:80 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[28:31] offset:96 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:112 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: flat_offset_inbounds_very_wide: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:28 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:44 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:60 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:76 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:92 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:108 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:124 +; GFX942-GISEL-NEXT: flat_load_dwordx3 v[36:38], v[0:1] offset:140 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:48 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:80 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[28:31] offset:96 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:112 +; GFX942-GISEL-NEXT: flat_store_dwordx3 v[2:3], v[36:38] 
offset:128 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: flat_offset_inbounds_very_wide: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX11-GISEL-NEXT: s_clause 0x8 +; GFX11-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX11-GISEL-NEXT: flat_load_b128 v[8:11], v[0:1] offset:28 +; GFX11-GISEL-NEXT: flat_load_b128 v[12:15], v[0:1] offset:44 +; GFX11-GISEL-NEXT: flat_load_b128 v[16:19], v[0:1] offset:60 +; GFX11-GISEL-NEXT: flat_load_b128 v[20:23], v[0:1] offset:76 +; GFX11-GISEL-NEXT: flat_load_b128 v[24:27], v[0:1] offset:92 +; GFX11-GISEL-NEXT: flat_load_b128 v[28:31], v[0:1] offset:108 +; GFX11-GISEL-NEXT: flat_load_b128 v[32:35], v[0:1] offset:124 +; GFX11-GISEL-NEXT: flat_load_b96 v[36:38], v[0:1] offset:140 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[8:11] offset:16 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[12:15] offset:32 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[16:19] offset:48 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[20:23] offset:64 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[24:27] offset:80 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[28:31] 
offset:96 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[32:35] offset:112 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b96 v[2:3], v[36:38] offset:128 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: flat_offset_inbounds_very_wide: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX12-GISEL-NEXT: s_clause 0x8 +; GFX12-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX12-GISEL-NEXT: flat_load_b128 v[8:11], v[0:1] offset:28 +; GFX12-GISEL-NEXT: flat_load_b128 v[12:15], v[0:1] offset:44 +; GFX12-GISEL-NEXT: flat_load_b128 v[16:19], v[0:1] offset:60 +; GFX12-GISEL-NEXT: flat_load_b128 v[20:23], v[0:1] offset:76 +; GFX12-GISEL-NEXT: flat_load_b128 v[24:27], v[0:1] offset:92 +; GFX12-GISEL-NEXT: flat_load_b128 v[28:31], v[0:1] offset:108 +; GFX12-GISEL-NEXT: flat_load_b128 v[32:35], v[0:1] offset:124 +; GFX12-GISEL-NEXT: flat_load_b96 v[36:38], v[0:1] offset:140 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x808 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x708 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[8:11] offset:16 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x608 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[12:15] offset:32 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x508 +; 
GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[16:19] offset:48 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x408 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[20:23] offset:64 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x308 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[24:27] offset:80 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x208 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[28:31] offset:96 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x108 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[32:35] offset:112 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x8 +; GFX12-GISEL-NEXT: flat_store_b96 v[2:3], v[36:38] offset:128 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %p.1 = getelementptr inbounds i32, ptr %p, i32 %i + %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3 + %l = load <35 x i32>, ptr %arrayidx + store <35 x i32> %l, ptr %pout + ret void +} ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10: {{.*}} ; GFX10-GISEL-FLATSCR: {{.*}} ; GFX10-MUBUF: {{.*}} ; GFX10-SDAG-FLATSCR: {{.*}} -; GFX12: {{.*}} -; GFX90A: {{.*}} ; GFX90A-GISEL-FLATSCR: {{.*}} ; GFX90A-MUBUF: {{.*}} -; GFX90A-SDAG-FLATSCR: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll index bd11b0710fadd..36df710529599 100644 --- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll @@ -13,9 +13,9 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 ; CHECK-NEXT: s_add_u32 s0, s0, s2 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0 -; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; CHECK-NEXT: s_add_u32 s0, s0, -8 +; CHECK-NEXT: s_addc_u32 s1, s1, -1 +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; 
CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol @@ -40,9 +40,9 @@ define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, doub ; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 ; CHECK-NEXT: s_add_u32 s0, s0, s2 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0 -; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; CHECK-NEXT: s_add_u32 s0, s0, -8 +; CHECK-NEXT: s_addc_u32 s1, s1, -1 +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol @@ -71,11 +71,13 @@ define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, dou ; CHECK-NEXT: s_addc_u32 s1, s1, s3 ; CHECK-NEXT: s_add_u32 s0, s0, -8 ; CHECK-NEXT: s_addc_u32 s1, s1, -1 +; CHECK-NEXT: s_add_u32 s0, s0, 1 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] offset:1 +; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index 4ad161c03f5b7..2ff69d234455f 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -12,18 +12,22 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GFX12-NEXT: s_cbranch_scc1 .LBB0_3 ; GFX12-NEXT: ; %bb.1: ; %for.body.preheader ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_movk_i32 s4, 0xff50 +; GFX12-NEXT: s_mov_b32 s5, -1 ; GFX12-NEXT: s_wait_kmcnt 
0x0 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GFX12-NEXT: .LBB0_2: ; %for.body ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[8:9], s[2:3], s[4:5] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: s_add_co_i32 s6, s6, -1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 -; GFX12-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176 ; GFX12-NEXT: s_cmp_lg_u32 s6, 0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 +; GFX12-NEXT: flat_load_b128 v[0:3], v[0:1] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b128 v[4:5], v[0:3] ; GFX12-NEXT: s_cbranch_scc1 .LBB0_2 @@ -38,17 +42,20 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB0_3 ; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader ; GFX12-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SPREFETCH-NEXT: s_movk_i32 s4, 0xff50 +; GFX12-SPREFETCH-NEXT: s_mov_b32 s5, -1 ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GFX12-SPREFETCH-NEXT: .LBB0_2: ; %for.body ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe -; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[8:9], s[2:3], s[4:5] ; GFX12-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0x0, null, 0 +; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 ; GFX12-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1 -; GFX12-SPREFETCH-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 
16 +; GFX12-SPREFETCH-NEXT: flat_load_b128 v[0:3], v[0:1] ; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 ; GFX12-SPREFETCH-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -410,10 +417,14 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX12-NEXT: .LBB4_2: ; %for.body ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xffffff50, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v3, vcc_lo ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-NEXT: flat_load_b128 v[4:7], v[4:5] ; GFX12-NEXT: s_add_co_i32 s0, s0, -1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u32 s0, 0 @@ -448,10 +459,14 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX12-SPREFETCH-NEXT: .LBB4_2: ; %for.body ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-SPREFETCH-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176 +; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SPREFETCH-NEXT: v_add_co_u32 v4, vcc_lo, 0xffffff50, v2 +; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffd +; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v3, vcc_lo ; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16 ; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffd ; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-SPREFETCH-NEXT: flat_load_b128 v[4:7], v[4:5] ; GFX12-SPREFETCH-NEXT: s_add_co_i32 s0, s0, -1 ; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe ; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s0, 
0 @@ -466,15 +481,17 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; ; GFX1250-LABEL: copy_flat_divergent: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x34 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_cmp_eq_u32 s0, 0 +; GFX1250-NEXT: s_cmp_eq_u32 s2, 0 ; GFX1250-NEXT: s_cbranch_scc1 .LBB4_3 ; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader ; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff50 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[10:11], v[0:1] ; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[8:9], v[0:1] @@ -482,13 +499,13 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3] ; GFX1250-NEXT: .LBB4_2: ; %for.body ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[2:3] ; GFX1250-NEXT: flat_prefetch_b8 v[2:3] scope:SCOPE_SE -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 16, v[2:3] -; GFX1250-NEXT: s_add_co_i32 s0, s0, -1 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-NEXT: s_add_co_i32 s2, s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-NEXT: flat_load_b128 v[4:7], v[4:5] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: 
flat_store_b128 v[0:1], v[4:7] ; GFX1250-NEXT: s_wait_xcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index c28856b16dd97..0ceba78a4eccc 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -16,62 +16,71 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: .LBB0_1: ; %load-store-loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 +; CHECK-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo +; CHECK-NEXT: s_clause 0x4 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[24:25] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[24:25] +; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:128 +; CHECK-NEXT: v_add_co_u32 v80, vcc_lo, v24, 0x60 +; CHECK-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v25, vcc_lo +; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v24, 48 +; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v25, vcc_lo +; CHECK-NEXT: v_add_co_u32 v84, vcc_lo, 0x50, v24 +; CHECK-NEXT: v_add_co_ci_u32_e64 v85, null, 0, v25, vcc_lo +; CHECK-NEXT: s_clause 0xa +; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[80:81] offset:128 +; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[80:81] offset:144 +; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[80:81] offset:96 +; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[80:81] offset:112 +; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[80:81] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[80:81] offset:80 +; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[80:81] offset:48 +; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[80:81] +; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[80:81] 
offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[84:85] +; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[96:97] offset:224 -; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[96:97] offset:240 -; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[96:97] offset:192 -; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[96:97] offset:208 -; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[96:97] offset:160 -; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[96:97] offset:176 -; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[96:97] offset:128 -; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[96:97] offset:144 -; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[96:97] offset:96 -; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[96:97] offset:112 -; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[96:97] offset:64 -; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[96:97] offset:80 -; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[96:97] offset:32 -; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[96:97] offset:48 -; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[96:97] -; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] offset:16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 +; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] ; 
CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[32:35] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: 
flat_store_dwordx4 v[100:101], v[84:87] +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[84:87] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 -; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[96:99] ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %memcpy-split @@ -81,621 +90,656 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-LABEL: memcpy_p0_p0_sz2048: ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded 
Spill +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 ; ALIGNED-NEXT: .LBB0_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo -; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:240 -; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:224 -; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[24:25] -; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:16 -; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[24:25] offset:32 -; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] offset:48 -; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[24:25] offset:64 -; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[24:25] offset:80 -; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[24:25] offset:96 -; ALIGNED-NEXT: flat_load_dwordx4 v[32:35], v[24:25] offset:112 -; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[24:25] offset:128 -; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[24:25] offset:144 -; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[24:25] offset:160 
-; ALIGNED-NEXT: flat_load_dwordx4 v[81:84], v[24:25] offset:176 -; ALIGNED-NEXT: flat_load_dwordx4 v[96:99], v[24:25] offset:192 -; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[24:25] offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v6, vcc_lo, v4, 48 +; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[4:5] offset:128 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v8, vcc_lo, 0x50, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0x60, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, 0x70, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, 0x90, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v52, vcc_lo, 0xa0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v53, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v54, vcc_lo, 0xb0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v55, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v64, vcc_lo, 0xc0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v65, null, 0, 
v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v66, vcc_lo, 0xd0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v67, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v68, vcc_lo, 0xe0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v69, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v70, vcc_lo, 0xf0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v71, null, 0, v5, vcc_lo +; ALIGNED-NEXT: s_clause 0xe +; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[4:5] +; ALIGNED-NEXT: flat_load_dwordx4 v[35:38], v[4:5] offset:16 +; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[4:5] offset:32 +; ALIGNED-NEXT: flat_load_dwordx4 v[48:51], v[4:5] offset:64 +; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[6:7] +; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] +; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[52:53] +; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[54:55] +; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[64:65] +; ALIGNED-NEXT: flat_load_dwordx4 v[55:58], v[66:67] +; ALIGNED-NEXT: flat_load_dwordx4 v[59:62], v[68:69] +; ALIGNED-NEXT: flat_load_dwordx4 v[68:71], v[70:71] +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:254 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:252 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:250 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:248 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:246 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:244 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:242 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v24 offset:240 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(22) -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:172 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_load_dword v102, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v103, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: v_add_co_u32 v22, vcc_lo, v24, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v25, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v24, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v25, vcc_lo ; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:238 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:236 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v102 offset:136 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:137 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:234 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:232 +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[22:23], v103 offset:132 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:133 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:230 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v63 offset:128 +; ALIGNED-NEXT: flat_store_byte v[20:21], v63 offset:129 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:226 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:224 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v72 offset:124 +; ALIGNED-NEXT: flat_store_byte v[24:25], v72 offset:128 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(19) +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:228 +; 
ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:222 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:220 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v101 offset:72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:73 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:218 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:216 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v100 offset:68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:69 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:214 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v99 offset:64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:65 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:210 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:208 -; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v98 offset:60 +; ALIGNED-NEXT: flat_store_byte v[24:25], v98 offset:64 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 
offset:52 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:206 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:202 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:198 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:194 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:192 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v39 offset:36 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:37 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v34 offset:40 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:41 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v33 offset:28 +; ALIGNED-NEXT: flat_store_byte v[24:25], v33 offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v32 offset:32 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:33 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_store_dword v36, 
off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:284 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:190 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:188 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:186 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:184 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:182 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:180 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:178 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:176 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v67 offset:24 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:25 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v66 offset:20 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:21 +; ALIGNED-NEXT: s_waitcnt 
vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v53 offset:16 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v52 offset:12 +; ALIGNED-NEXT: flat_store_byte v[24:25], v52 offset:16 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:300 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:174 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:172 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:170 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:168 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:166 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:164 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:162 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:160 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: 
buffer_load_dword v80, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v86 offset:8 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:9 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v82 offset:4 +; ALIGNED-NEXT: flat_store_byte v[24:25], v82 offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v81 +; ALIGNED-NEXT: flat_store_byte v[24:25], v81 offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[24:25], v80 offset:2 +; ALIGNED-NEXT: flat_store_byte v[24:25], v80 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(40) +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:158 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:154 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:152 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:150 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:148 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:146 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:144 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: 
buffer_store_dword v45, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v97 offset:248 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:249 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v87 offset:244 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:245 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v83 offset:240 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:241 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v68 offset:236 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:237 +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:124 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:142 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:138 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:136 -; 
ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:134 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:132 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:130 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:128 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v96 offset:232 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:233 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v84 offset:228 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:229 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v69 offset:224 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:225 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v54 offset:220 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:221 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 
offset:196 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:126 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:124 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:122 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:120 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:118 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:116 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v85 offset:216 +; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:217 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v70 offset:212 +; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:213 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v55 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:209 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v48 offset:204 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:205 +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_store_dword 
v45, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:110 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:108 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:106 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:102 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:100 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:96 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v71 offset:200 +; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:201 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v64 offset:196 +; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:197 +; ALIGNED-NEXT: s_waitcnt 
vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v49 offset:192 +; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:193 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v35 offset:188 +; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:189 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:94 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:92 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v65 offset:184 +; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:185 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:90 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:88 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v50 offset:180 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:181 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v36 offset:176 +; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:177 ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:80 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v29 offset:172 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:173 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:78 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:76 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:74 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:72 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:68 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:64 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_store_dword 
v115, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v51 offset:168 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:169 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v37 offset:164 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:165 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v30 offset:160 +; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:161 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v27 offset:156 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:157 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:156 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 
offset:62 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:60 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:58 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:56 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:52 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v63 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:140 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v63 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:138 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v72 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:136 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:134 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:132 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:130 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:128 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:126 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v38 offset:152 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:153 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v31 offset:148 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:149 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v28 offset:144 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:145 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v26 offset:140 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:141 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v80, 24, v80 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v19 offset:120 +; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:121 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v18 offset:116 +; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:117 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v17 offset:112 +; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:113 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v16 offset:108 +; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:109 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 -; 
ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:28 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:24 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:20 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v15 offset:104 +; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:105 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v14 offset:100 +; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:101 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v13 offset:96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:97 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v12 offset:92 +; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:93 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 
offset:216 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v64 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v67 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:76 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:74 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:70 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:68 +; ALIGNED-NEXT: 
flat_store_byte v[20:21], v99 offset:66 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:62 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v11 offset:88 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:89 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v10 offset:84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:85 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v9 offset:80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:81 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v8 offset:76 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:77 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:243 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v24 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v85 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[24:25], v114 offset:1 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v81 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v97 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v114, 24, v97 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v82 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:40 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v83 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:44 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v68 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v24 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:42 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v69 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v54 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:222 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v33 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v48 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v35 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:10 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:2 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: flat_store_byte v[20:21], v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:252 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:248 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v28 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:246 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:244 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v19 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v36 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v29 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:242 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v18 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:238 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:236 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v15 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v14 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:159 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v13 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v12 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v12 ; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:228 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v11 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:147 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:145 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:143 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:141 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:139 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:137 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:133 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:131 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:129 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:127 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:125 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:121 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:119 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:117 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:115 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:113 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:111 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:109 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v80 offset:107 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:105 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:103 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:95 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:93 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:91 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:89 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:79 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:77 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:75 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:73 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:63 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:61 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:59 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:57 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:43 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:47 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:35 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:31 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:25 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:17 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:220 +; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:218 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:216 +; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:214 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:212 +; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:210 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:206 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:204 +; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:202 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:200 +; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:198 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:196 +; 
ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:194 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:192 +; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:190 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:188 +; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:186 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:184 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:182 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:180 +; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:178 +; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:176 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:174 +; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:172 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:170 +; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:168 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:166 +; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:164 +; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:162 +; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:160 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:158 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:156 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:154 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:152 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:150 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:148 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:146 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:144 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:142 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:124 +; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:122 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:120 +; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:118 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:116 +; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:114 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:112 +; ALIGNED-NEXT: 
flat_store_byte v[20:21], v16 offset:110 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:108 +; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:106 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:104 +; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:102 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:100 +; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:98 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:94 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:92 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:90 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:88 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:86 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:82 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:78 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v7 offset:56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:57 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v6 offset:52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:53 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v5 offset:48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:49 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v4 offset:44 +; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:45 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 @@ -704,25 +748,34 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 
offset:15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:7 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:5 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:3 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:60 +; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:58 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:54 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:50 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:46 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB0_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload -; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: s_clause 0x10 ; 68-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v58, off, 
s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -1549,39 +1602,41 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 +; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[4:7] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[8:11] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[12:15] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[16:19] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:176 +; CHECK-NEXT: flat_store_dwordx4 
v[102:103], v[20:23] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] ; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] -; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_vccnz .LBB2_1 ; CHECK-NEXT: ; %bb.2: ; %memcpy-split @@ -1594,27 +1649,31 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 ; ALIGNED-NEXT: .LBB2_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo 
-; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v8, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v9, null, s5, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v84, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v85, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 -; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 -; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 -; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[4:5], off offset:160 -; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[4:5], off offset:144 -; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[4:5], off offset:128 -; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[4:5], off offset:112 -; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[4:5], off offset:96 -; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[4:5], off offset:80 -; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[4:5], off offset:64 -; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[4:5], off offset:48 -; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32 -; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 -; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[8:9], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[8:9], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[8:9], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[8:9], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[8:9], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[8:9], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[8:9], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 
v[36:39], v[8:9], off offset:128 +; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[8:9], off offset:112 +; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[8:9], off offset:96 +; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[8:9], off offset:80 +; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[8:9], off offset:64 +; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[8:9], off offset:48 +; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[8:9], off offset:32 +; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[8:9], off +; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[8:9], off offset:16 +; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v84, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v85, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v86, vcc_lo, v84, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v87, null, 0, v85, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) @@ -1622,466 +1681,465 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:172 ; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:164 ; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:250 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:254 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:252 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:248 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:246 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:244 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:242 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:240 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:248 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 
offset:249 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:245 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:240 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:241 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:236 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v100 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 8, v98 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:248 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:246 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 
v98, 24, v114 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v114 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v115 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v115 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v113 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v113 +; ALIGNED-NEXT: flat_store_byte v[86:87], v116 offset:244 +; ALIGNED-NEXT: flat_store_byte v[86:87], v117 offset:242 +; ALIGNED-NEXT: flat_store_byte v[86:87], v118 offset:240 +; ALIGNED-NEXT: flat_store_byte v[86:87], v119 offset:238 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v114 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v115 offset:232 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:233 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:229 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v113 offset:224 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:225 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v112 offset:220 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v112 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v112 ; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v82 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:234 -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v87 offset:238 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:236 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:232 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:230 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:228 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:226 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:224 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v83 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:236 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v81 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:228 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:224 ; ALIGNED-NEXT: s_waitcnt vmcnt(12) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:237 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:222 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:231 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v69 ; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:136 ; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:140 ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:132 ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:218 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:222 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:220 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:216 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:214 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:212 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:210 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:208 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v68 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:216 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:217 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:213 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:208 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:209 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:204 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v69 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:216 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, 
v68 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:214 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v68 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:220 ; ALIGNED-NEXT: s_waitcnt vmcnt(11) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:211 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v66 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:218 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v66 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:212 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:210 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v67 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:208 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:206 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:152 ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:156 ; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:148 ; ALIGNED-NEXT: 
buffer_store_dword v68, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:202 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:206 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:204 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:198 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:196 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:207 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:196 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:200 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:201 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:197 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:192 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:193 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:188 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v64 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v54 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:200 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:198 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v55 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:204 
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v52 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:202 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:196 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:194 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:192 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v50 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:190 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v50 ; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:232 ; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:228 ; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:186 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:190 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:188 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:184 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:182 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:180 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:178 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 
24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:187 -; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:179 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:184 +; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:185 +; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:181 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:176 +; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:177 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:172 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:184 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:182 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v48 +; ALIGNED-NEXT: flat_store_byte 
v[86:87], v100 offset:188 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:186 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:180 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:178 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:176 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:174 ; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 ; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:248 ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:244 ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:173 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v31 -; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v30 
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:167 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v29 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:164 +; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:165 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:168 +; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:169 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:156 +; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:157 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:160 +; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:161 +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v34 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:168 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:166 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v35 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:172 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:170 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v33 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:160 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:158 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:164 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 
offset:162 ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:154 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:158 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:156 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:152 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:150 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:148 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:146 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:144 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v28 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v28 +; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:150 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v30 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:152 +; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:153 +; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:149 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:144 +; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:145 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:140 +; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:141 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 
offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:155 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:153 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:159 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:157 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:151 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:149 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:147 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:145 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:154 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:148 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v27 +; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:146 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:144 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:142 ; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:216 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:136 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:134 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 
offset:139 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:135 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:133 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:131 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:129 +; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v27 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:136 +; ALIGNED-NEXT: flat_store_byte v[86:87], v39 offset:137 +; ALIGNED-NEXT: flat_store_byte v[86:87], v38 offset:133 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:128 +; ALIGNED-NEXT: flat_store_byte v[86:87], v37 offset:129 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:124 +; ALIGNED-NEXT: flat_store_byte v[84:85], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:136 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:134 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:140 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:138 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:132 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:130 +; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:126 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v18 ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 ; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:122 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:126 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:124 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 
offset:120 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:118 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:116 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:121 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:127 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:125 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:119 -; ALIGNED-NEXT: flat_store_byte v[96:97], v116 offset:117 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:115 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:120 +; ALIGNED-NEXT: flat_store_byte v[86:87], v35 offset:121 +; ALIGNED-NEXT: flat_store_byte v[86:87], v34 offset:117 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:112 +; ALIGNED-NEXT: flat_store_byte v[86:87], v33 offset:113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:108 +; ALIGNED-NEXT: flat_store_byte v[86:87], v32 offset:109 +; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:120 +; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:118 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:122 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:116 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:114 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:112 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:110 ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 
offset:56 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:109 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:104 +; ALIGNED-NEXT: flat_store_byte v[86:87], v31 offset:105 +; ALIGNED-NEXT: flat_store_byte v[86:87], v30 offset:101 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:96 +; ALIGNED-NEXT: flat_store_byte v[86:87], v29 offset:97 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:92 +; ALIGNED-NEXT: flat_store_byte v[86:87], v28 offset:93 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:94 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v6 ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v11 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:107 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:105 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v7 -; ALIGNED-NEXT: flat_store_byte 
v[96:97], v48 offset:99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:104 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:102 +; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:108 +; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:106 +; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:100 +; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:98 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:8 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:12 ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:90 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:94 -; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:92 -; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:88 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:84 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v4 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:91 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:89 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:95 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:93 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:87 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v100 offset:85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:83 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:81 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:88 +; ALIGNED-NEXT: flat_store_byte v[86:87], v27 offset:89 +; ALIGNED-NEXT: flat_store_byte v[86:87], v26 offset:85 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v25 offset:81 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:76 +; ALIGNED-NEXT: flat_store_byte v[86:87], v24 offset:77 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v5 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:88 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:86 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v65, 8, v4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:92 +; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:90 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:84 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:82 +; ALIGNED-NEXT: flat_store_byte v[86:87], v36 offset:80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v37 offset:78 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:24 ; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:28 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:20 ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:74 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:78 -; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:76 -; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:72 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:68 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v20 offset:64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:75 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:73 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:79 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:77 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:65 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:72 +; ALIGNED-NEXT: flat_store_byte v[86:87], v23 offset:73 +; ALIGNED-NEXT: flat_store_byte v[86:87], v22 offset:69 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:64 +; ALIGNED-NEXT: flat_store_byte v[86:87], v21 offset:65 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 
offset:60 +; ALIGNED-NEXT: flat_store_byte v[84:85], v20 offset:64 +; ALIGNED-NEXT: flat_store_byte v[86:87], v38 offset:72 +; ALIGNED-NEXT: flat_store_byte v[86:87], v39 offset:70 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:76 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:74 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:68 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:66 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:64 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:62 ; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:59 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:58 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:57 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:62 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:63 -; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:60 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:61 -; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:56 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:55 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:51 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:56 +; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:54 +; ALIGNED-NEXT: flat_store_byte v[86:87], v32 offset:60 +; ALIGNED-NEXT: flat_store_byte v[86:87], v33 offset:58 +; ALIGNED-NEXT: flat_store_byte 
v[86:87], v34 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:56 +; ALIGNED-NEXT: flat_store_byte v[86:87], v19 offset:57 +; ALIGNED-NEXT: flat_store_byte v[86:87], v18 offset:53 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v35 offset:50 +; ALIGNED-NEXT: flat_store_byte v[86:87], v17 offset:49 +; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:44 +; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:46 +; ALIGNED-NEXT: flat_store_byte v[86:87], v16 offset:45 ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:42 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:43 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:41 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:46 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:47 -; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:44 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:45 -; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:40 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:39 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:35 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v12 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:36 +; ALIGNED-NEXT: flat_store_byte v[86:87], 
v70 offset:40 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:38 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:40 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:44 +; ALIGNED-NEXT: flat_store_byte v[86:87], v15 offset:41 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:42 +; ALIGNED-NEXT: flat_store_byte v[86:87], v14 offset:37 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:36 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:32 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:34 +; ALIGNED-NEXT: flat_store_byte v[86:87], v13 offset:33 +; ALIGNED-NEXT: flat_store_byte v[86:87], v28 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:28 +; ALIGNED-NEXT: flat_store_byte v[86:87], v29 offset:30 +; ALIGNED-NEXT: flat_store_byte v[84:85], v12 offset:32 ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76 ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:27 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:25 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:28 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:24 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:23 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:21 -; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:20 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:19 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:17 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v8 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:20 +; ALIGNED-NEXT: flat_store_byte v[86:87], v30 offset:24 +; ALIGNED-NEXT: flat_store_byte v[86:87], v31 offset:22 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:24 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:28 +; ALIGNED-NEXT: flat_store_byte v[86:87], v11 offset:25 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:26 +; ALIGNED-NEXT: flat_store_byte v[86:87], v10 offset:21 +; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:20 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:16 +; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:18 +; ALIGNED-NEXT: flat_store_byte v[86:87], v9 offset:17 +; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:12 +; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:14 +; ALIGNED-NEXT: flat_store_byte v[84:85], v8 offset:16 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:9 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:15 -; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:12 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:13 -; ALIGNED-NEXT: flat_store_byte v[96:97], v6 offset:8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:7 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v5 offset:4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:3 -; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:1 -; ALIGNED-NEXT: flat_store_byte v[96:97], v4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:8 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:8 +; ALIGNED-NEXT: flat_store_byte v[86:87], v24 offset:12 +; ALIGNED-NEXT: flat_store_byte v[86:87], v7 offset:9 +; ALIGNED-NEXT: flat_store_byte v[86:87], v25 offset:10 +; ALIGNED-NEXT: flat_store_byte v[84:85], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[86:87], v26 offset:4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 +; ALIGNED-NEXT: flat_store_byte v[86:87], v27 offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v65 offset:1 +; ALIGNED-NEXT: flat_store_byte v[84:85], v4 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB2_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split ; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) @@ -3599,38 +3657,38 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72 ; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68 ; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64 -; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108 -; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116 -; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v19, 
v2, s[0:3], 0 offen offset:96 -; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:236 -; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:252 -; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:248 -; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:244 -; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:240 -; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:232 -; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:228 -; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:224 -; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:220 -; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:216 -; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:212 -; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:208 -; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:204 -; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:200 -; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:196 -; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:192 -; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:172 -; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:188 -; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:184 -; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:180 -; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:176 -; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:168 -; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:164 -; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:160 +; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:236 +; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:252 +; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:248 +; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:244 +; CHECK-NEXT: buffer_load_dword v23, 
v2, s[0:3], 0 offen offset:240 +; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:232 +; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:228 +; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:224 +; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:220 +; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:216 +; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:212 +; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:208 +; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:204 +; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:200 +; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:196 +; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:192 +; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:96 +; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:188 +; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:184 +; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:180 +; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:176 +; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:172 +; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:168 +; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:164 +; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:160 ; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:156 ; CHECK-NEXT: buffer_load_dword v70, 
v2, s[0:3], 0 offen offset:152 ; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:148 @@ -3651,29 +3709,31 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 -; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 ; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] -; CHECK-NEXT: s_waitcnt vmcnt(35) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:240 +; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo +; CHECK-NEXT: s_waitcnt vmcnt(43) +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:192 +; CHECK-NEXT: s_waitcnt vmcnt(40) +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[19:22] offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(32) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:224 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:208 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:192 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128 -; 
CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[15:18] offset:32 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[7:10] ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 @@ -3757,20 +3817,20 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:38 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:39 ; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:40 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:41 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 
0 offen offset:41 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:42 ; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:43 ; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:44 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:45 -; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:46 -; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:47 ; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:48 ; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:49 ; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:50 @@ -3779,11 +3839,11 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:61 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59 @@ -3800,64 +3860,66 @@ define void @memcpy_p0_p5_sz2048(ptr 
addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 ; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:159 ; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: s_clause 0x33 -; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: s_clause 0x35 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:164 ; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte 
v113, v2, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v78, 
v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:178 ; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:181 ; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:188 ; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:191 ; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 
offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:202 ; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:4 ; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen 
offset:7 ; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill @@ -3880,14 +3942,14 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 -; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 +; ALIGNED-NEXT: v_lshl_or_b32 v8, v20, 8, v19 ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 -; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v10, v18, 8, v17 +; ALIGNED-NEXT: v_lshl_or_b32 v11, v26, 8, v22 +; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v27 +; ALIGNED-NEXT: v_lshl_or_b32 v13, v23, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 +; ALIGNED-NEXT: v_lshl_or_b32 v14, v25, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7 @@ -3898,16 +3960,16 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, 
v35 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v38 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 @@ -3929,49 +3991,55 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; ALIGNED-NEXT: 
s_waitcnt vmcnt(62) -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x5 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; 
ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:217 ; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:220 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:223 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, 
s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill @@ -3979,83 +4047,91 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:226 ; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 
0 offen offset:229 ; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:237 ; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238 ; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239 ; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 
offset:552 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 ; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0xc +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:241 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:244 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, 
s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(42) +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v90, v90, 8, v121 +; ALIGNED-NEXT: s_waitcnt vmcnt(56) +; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(55) +; ALIGNED-NEXT: buffer_store_dword 
v109, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(54) +; ALIGNED-NEXT: buffer_store_dword v111, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(53) +; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(52) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: s_waitcnt vmcnt(51) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(39) +; ALIGNED-NEXT: s_waitcnt vmcnt(49) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(38) +; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(36) +; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(35) +; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211 ; ALIGNED-NEXT: 
v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill @@ -4281,7 +4357,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) @@ -4312,198 +4388,182 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:147 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:145 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; 
ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v86, 8, v96 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v10, 8, v12 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 -; ALIGNED-NEXT: s_clause 0x3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v14, 8, v18 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: 
buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10 -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v127, 8, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v125, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v87, 8, v97 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 -; ALIGNED-NEXT: v_lshl_or_b32 v106, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v103, 8, v114 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v100, 8, v112 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v110 +; ALIGNED-NEXT: v_lshl_or_b32 v79, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v83, 8, v84 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v82 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125 -; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v97, 8, v98 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v120, 8, v123 +; ALIGNED-NEXT: v_lshl_or_b32 v72, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v87, 8, v96 +; ALIGNED-NEXT: 
v_lshl_or_b32 v1, v92, 8, v93 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v70 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110 -; ALIGNED-NEXT: v_lshl_or_b32 v89, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v84, 8, v86 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v83 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v104, 8, v108 +; ALIGNED-NEXT: v_lshl_or_b32 v45, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v71 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v53 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v104 -; ALIGNED-NEXT: v_lshl_or_b32 v74, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v81 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 +; ALIGNED-NEXT: v_lshl_or_b32 v117, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v69 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v71 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v105 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v51, 8, v54 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88 -; ALIGNED-NEXT: v_lshl_or_b32 v46, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v54, 8, v53 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v76 +; ALIGNED-NEXT: v_lshl_or_b32 v114, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v38, 8, v49 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90 -; ALIGNED-NEXT: v_lshl_or_b32 v117, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v70 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 8, v78 +; ALIGNED-NEXT: v_lshl_or_b32 v98, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v37 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; 
ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v64 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v31 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76 -; ALIGNED-NEXT: v_lshl_or_b32 v115, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v50 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v48 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v81, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v34 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57 -; ALIGNED-NEXT: v_lshl_or_b32 v99, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v38 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v33 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v82, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v37 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v31, 8, v34 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 -; ALIGNED-NEXT: v_lshl_or_b32 v66, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v29, 8, v30 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v26, 8, v28 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44 -; ALIGNED-NEXT: v_lshl_or_b32 v51, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v24 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v21 
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v22 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 +; ALIGNED-NEXT: v_lshl_or_b32 v48, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v19, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v16, 8, v15 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 -; ALIGNED-NEXT: v_lshl_or_b32 v49, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v18, 8, v20 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v20 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118 -; ALIGNED-NEXT: v_lshl_or_b32 v27, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v19 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v116 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v11, 8, v13 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41 -; ALIGNED-NEXT: v_lshl_or_b32 v22, v73, 16, v4 -; ALIGNED-NEXT: v_lshl_or_b32 v73, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v4, v77, 16, v73 -; ALIGNED-NEXT: v_lshl_or_b32 v73, v6, 8, v8 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v7, 8, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v100, 8, v101 ; ALIGNED-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v112, 8, v115 +; ALIGNED-NEXT: v_lshl_or_b32 v107, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v113 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v99, 8, v102 +; ALIGNED-NEXT: v_lshl_or_b32 v88, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen -; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v3, v77, 16, v73 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v6, v7, 8, v8 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v89, v9, 8, v1 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v89, 16, v6 +; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v77, 8, v121 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 
-; ALIGNED-NEXT: v_lshl_or_b32 v73, v109, 8, v107 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v1, 8, v120 -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v89, v89, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, v89 +; ALIGNED-NEXT: v_lshl_or_b32 v89, v109, 8, v106 +; ALIGNED-NEXT: v_lshl_or_b32 v90, v122, 8, v111 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, v89 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:14 ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:12 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) 
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v1 -; ALIGNED-NEXT: v_mov_b32_e32 v1, v107 +; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v90, v106, 8, v90 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v107, 8, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 -; ALIGNED-NEXT: v_lshl_or_b32 v73, v120, 8, v122 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v121, 8, v109 +; ALIGNED-NEXT: v_lshl_or_b32 v89, v127, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, v89 +; ALIGNED-NEXT: v_lshl_or_b32 v89, v111, 8, v122 +; ALIGNED-NEXT: v_lshl_or_b32 v90, v121, 8, v109 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, v89 ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:18 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:17 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 -; 
ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v77 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v90 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v126, v73, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v126, v89, 8, v106 ; ALIGNED-NEXT: v_lshl_or_b32 v126, v0, 16, v126 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -4512,84 +4572,86 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v0, vcc_lo -; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:250 -; ALIGNED-NEXT: flat_store_byte v[3:4], v7 offset:251 -; ALIGNED-NEXT: flat_store_byte v[3:4], v6 offset:249 -; ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:255 -; ALIGNED-NEXT: flat_store_byte v[3:4], v11 offset:253 -; ALIGNED-NEXT: flat_store_byte v[3:4], v10 offset:254 -; ALIGNED-NEXT: flat_store_byte v[3:4], v12 offset:252 -; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:248 -; ALIGNED-NEXT: flat_store_byte v[3:4], v15 offset:242 -; ALIGNED-NEXT: flat_store_byte v[3:4], v14 offset:243 -; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:241 -; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:247 -; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:245 -; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:246 -; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:244 -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:240 -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[3:4], v21 offset:234 -; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:235 -; ALIGNED-NEXT: flat_store_byte v[3:4], v23 offset:233 -; ALIGNED-NEXT: flat_store_byte v[3:4], v26 offset:239 -; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:237 -; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:238 -; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:236 -; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:232 -; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:226 -; ALIGNED-NEXT: flat_store_byte v[3:4], v32 offset:227 -; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:225 -; ALIGNED-NEXT: flat_store_byte v[3:4], v31 offset:231 -; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:229 -; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:230 -; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:228 -; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:224 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: v_add_co_u32 v5, vcc_lo, v3, 3 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v4, vcc_lo +; ALIGNED-NEXT: flat_store_byte v[5:6], v1 offset:247 +; ALIGNED-NEXT: flat_store_byte v[5:6], v9 offset:248 +; ALIGNED-NEXT: flat_store_byte v[5:6], v7 offset:246 +; ALIGNED-NEXT: flat_store_byte v[5:6], v10 offset:252 +; ALIGNED-NEXT: flat_store_byte v[5:6], v11 offset:250 +; ALIGNED-NEXT: flat_store_byte v[5:6], v12 offset:251 +; ALIGNED-NEXT: flat_store_byte v[5:6], v13 offset:249 +; ALIGNED-NEXT: flat_store_byte v[5:6], v8 offset:245 +; ALIGNED-NEXT: flat_store_byte v[5:6], v15 
offset:239 +; ALIGNED-NEXT: flat_store_byte v[5:6], v16 offset:240 +; ALIGNED-NEXT: flat_store_byte v[5:6], v19 offset:238 +; ALIGNED-NEXT: flat_store_byte v[5:6], v14 offset:244 +; ALIGNED-NEXT: flat_store_byte v[5:6], v17 offset:242 +; ALIGNED-NEXT: flat_store_byte v[5:6], v18 offset:243 +; ALIGNED-NEXT: flat_store_byte v[5:6], v20 offset:241 +; ALIGNED-NEXT: flat_store_byte v[5:6], v21 offset:237 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte v[5:6], v22 offset:231 +; ALIGNED-NEXT: flat_store_byte v[5:6], v25 offset:232 +; ALIGNED-NEXT: flat_store_byte v[5:6], v23 offset:230 +; ALIGNED-NEXT: flat_store_byte v[5:6], v26 offset:236 +; ALIGNED-NEXT: flat_store_byte v[5:6], v27 offset:234 +; ALIGNED-NEXT: flat_store_byte v[5:6], v28 offset:235 +; ALIGNED-NEXT: flat_store_byte v[5:6], v29 offset:233 +; ALIGNED-NEXT: flat_store_byte v[5:6], v24 offset:229 +; ALIGNED-NEXT: flat_store_byte v[5:6], v31 offset:223 +; ALIGNED-NEXT: flat_store_byte v[5:6], v32 offset:224 +; ALIGNED-NEXT: flat_store_byte v[5:6], v35 offset:222 +; ALIGNED-NEXT: flat_store_byte v[5:6], v30 offset:228 +; ALIGNED-NEXT: flat_store_byte v[5:6], v33 offset:226 +; ALIGNED-NEXT: flat_store_byte v[5:6], v34 offset:227 +; ALIGNED-NEXT: flat_store_byte v[5:6], v36 offset:225 +; ALIGNED-NEXT: flat_store_byte v[5:6], v37 offset:221 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:213 -; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:215 -; ALIGNED-NEXT: flat_store_byte 
v[3:4], v39 offset:209 -; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:211 -; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:210 -; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:214 -; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:212 -; ALIGNED-NEXT: flat_store_byte v[3:4], v53 offset:218 -; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:219 -; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:217 -; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:223 -; ALIGNED-NEXT: flat_store_byte v[3:4], v55 offset:221 -; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:222 -; ALIGNED-NEXT: flat_store_byte v[3:4], v70 offset:220 -; ALIGNED-NEXT: flat_store_byte v[3:4], v80 offset:216 -; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:208 -; ALIGNED-NEXT: buffer_store_dword v74, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: flat_store_byte v[5:6], v67 offset:210 +; ALIGNED-NEXT: flat_store_byte v[5:6], v64 offset:212 +; ALIGNED-NEXT: flat_store_byte v[5:6], v38 offset:206 +; ALIGNED-NEXT: flat_store_byte v[5:6], v65 offset:208 +; ALIGNED-NEXT: flat_store_byte v[5:6], v39 offset:207 +; ALIGNED-NEXT: flat_store_byte v[5:6], v70 offset:211 +; ALIGNED-NEXT: flat_store_byte v[5:6], v80 offset:209 +; ALIGNED-NEXT: flat_store_byte v[5:6], v53 offset:215 +; ALIGNED-NEXT: flat_store_byte v[5:6], v52 offset:216 +; ALIGNED-NEXT: flat_store_byte v[5:6], v66 offset:214 +; ALIGNED-NEXT: flat_store_byte v[5:6], v51 offset:220 +; ALIGNED-NEXT: flat_store_byte v[5:6], v55 offset:218 +; ALIGNED-NEXT: flat_store_byte v[5:6], v54 offset:219 +; ALIGNED-NEXT: flat_store_byte v[5:6], v69 offset:217 +; ALIGNED-NEXT: flat_store_byte v[5:6], v71 offset:213 +; ALIGNED-NEXT: flat_store_byte v[5:6], v49 offset:205 +; 
ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v79, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:208 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:202 -; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:203 -; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:201 -; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:207 -; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:205 -; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:206 -; ALIGNED-NEXT: flat_store_byte v[3:4], v98 offset:204 -; ALIGNED-NEXT: flat_store_byte v[3:4], v86 offset:200 -; ALIGNED-NEXT: flat_store_byte v[3:4], v101 offset:194 -; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:195 -; ALIGNED-NEXT: flat_store_byte v[3:4], v113 offset:193 -; ALIGNED-NEXT: flat_store_byte v[3:4], v100 offset:199 -; ALIGNED-NEXT: flat_store_byte v[3:4], v103 offset:197 -; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:198 -; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:192 -; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: flat_store_byte v[5:6], v82 offset:199 +; ALIGNED-NEXT: flat_store_byte v[5:6], v85 offset:200 +; ALIGNED-NEXT: flat_store_byte v[5:6], v83 offset:198 +; ALIGNED-NEXT: flat_store_byte v[5:6], v86 offset:204 +; ALIGNED-NEXT: flat_store_byte v[5:6], v87 offset:202 +; ALIGNED-NEXT: flat_store_byte v[5:6], v96 offset:203 +; ALIGNED-NEXT: flat_store_byte v[5:6], v97 offset:201 +; ALIGNED-NEXT: flat_store_byte v[5:6], v84 offset:197 +; ALIGNED-NEXT: flat_store_byte v[5:6], v101 offset:191 +; ALIGNED-NEXT: flat_store_byte v[5:6], v100 offset:192 +; ALIGNED-NEXT: flat_store_byte v[5:6], v112 offset:190 +; ALIGNED-NEXT: flat_store_byte v[5:6], v99 offset:196 +; ALIGNED-NEXT: 
flat_store_byte v[5:6], v103 offset:194 +; ALIGNED-NEXT: flat_store_byte v[5:6], v102 offset:195 +; ALIGNED-NEXT: flat_store_byte v[5:6], v113 offset:193 +; ALIGNED-NEXT: flat_store_byte v[5:6], v115 offset:189 ; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -4604,22 +4666,22 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:186 -; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:187 -; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:185 -; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:191 -; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:189 -; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:190 -; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:188 -; ALIGNED-NEXT: flat_store_byte v[3:4], v41 offset:184 -; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:178 -; ALIGNED-NEXT: flat_store_byte v[3:4], v56 offset:179 -; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:177 -; ALIGNED-NEXT: flat_store_byte v[3:4], v47 offset:183 -; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:181 -; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:182 -; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:180 -; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:176 +; ALIGNED-NEXT: flat_store_byte v[5:6], v116 offset:183 +; ALIGNED-NEXT: flat_store_byte v[5:6], v40 offset:184 +; ALIGNED-NEXT: flat_store_byte v[5:6], v118 offset:182 +; ALIGNED-NEXT: flat_store_byte v[5:6], v41 offset:188 +; ALIGNED-NEXT: flat_store_byte v[5:6], v43 offset:186 +; ALIGNED-NEXT: flat_store_byte v[5:6], v42 offset:187 +; ALIGNED-NEXT: flat_store_byte v[5:6], v44 offset:185 +; ALIGNED-NEXT: flat_store_byte v[5:6], v119 offset:181 +; 
ALIGNED-NEXT: flat_store_byte v[5:6], v47 offset:175 +; ALIGNED-NEXT: flat_store_byte v[5:6], v56 offset:176 +; ALIGNED-NEXT: flat_store_byte v[5:6], v59 offset:174 +; ALIGNED-NEXT: flat_store_byte v[5:6], v46 offset:180 +; ALIGNED-NEXT: flat_store_byte v[5:6], v57 offset:178 +; ALIGNED-NEXT: flat_store_byte v[5:6], v58 offset:179 +; ALIGNED-NEXT: flat_store_byte v[5:6], v60 offset:177 +; ALIGNED-NEXT: flat_store_byte v[5:6], v61 offset:173 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload @@ -4632,22 +4694,22 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:170 -; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:171 -; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:169 -; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:175 -; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:173 -; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:174 -; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:172 -; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:168 -; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:162 -; ALIGNED-NEXT: flat_store_byte v[3:4], v93 offset:163 -; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:161 -; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:167 -; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:165 -; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:166 -; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:164 -; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:160 +; ALIGNED-NEXT: flat_store_byte v[5:6], v62 offset:167 +; ALIGNED-NEXT: flat_store_byte v[5:6], v74 offset:168 +; ALIGNED-NEXT: flat_store_byte v[5:6], v63 
offset:166 +; ALIGNED-NEXT: flat_store_byte v[5:6], v75 offset:172 +; ALIGNED-NEXT: flat_store_byte v[5:6], v77 offset:170 +; ALIGNED-NEXT: flat_store_byte v[5:6], v76 offset:171 +; ALIGNED-NEXT: flat_store_byte v[5:6], v78 offset:169 +; ALIGNED-NEXT: flat_store_byte v[5:6], v73 offset:165 +; ALIGNED-NEXT: flat_store_byte v[5:6], v93 offset:159 +; ALIGNED-NEXT: flat_store_byte v[5:6], v92 offset:160 +; ALIGNED-NEXT: flat_store_byte v[5:6], v104 offset:158 +; ALIGNED-NEXT: flat_store_byte v[5:6], v91 offset:164 +; ALIGNED-NEXT: flat_store_byte v[5:6], v95 offset:162 +; ALIGNED-NEXT: flat_store_byte v[5:6], v94 offset:163 +; ALIGNED-NEXT: flat_store_byte v[5:6], v105 offset:161 +; ALIGNED-NEXT: flat_store_byte v[5:6], v108 offset:157 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload @@ -4660,43 +4722,43 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:153 -; ALIGNED-NEXT: flat_store_byte v[3:4], v127 offset:159 +; ALIGNED-NEXT: flat_store_byte v[5:6], v110 offset:151 +; ALIGNED-NEXT: flat_store_byte v[5:6], v124 offset:152 +; ALIGNED-NEXT: flat_store_byte v[5:6], v120 offset:150 +; ALIGNED-NEXT: flat_store_byte v[5:6], v125 offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:154 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte 
Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:158 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:155 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:156 -; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:152 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:153 +; ALIGNED-NEXT: flat_store_byte v[5:6], v123 offset:149 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:146 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:143 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:147 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:144 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:145 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:142 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:151 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:148 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:149 +; 
ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:146 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:150 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:147 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:148 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:145 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:144 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:141 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 @@ -4711,49 +4773,49 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:138 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:135 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:139 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:136 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:137 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:134 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte 
Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:143 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:140 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:141 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:138 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:142 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:139 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:140 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:137 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:136 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:133 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:130 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:127 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:131 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:128 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:129 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:126 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:135 +; ALIGNED-NEXT: 
flat_store_byte v[5:6], v0 offset:132 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:133 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:130 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:134 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:131 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:132 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:129 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:128 @@ -4771,52 +4833,52 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:122 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:119 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:123 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:120 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:121 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:118 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:127 +; ALIGNED-NEXT: 
flat_store_byte v[5:6], v0 offset:124 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:125 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:122 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:126 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:123 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:124 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:121 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:120 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:117 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:114 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:111 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:115 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:112 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:113 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:110 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:119 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:116 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload 
; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:117 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:114 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:118 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:115 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:116 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:113 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:112 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:109 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 @@ -4831,52 +4893,52 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:106 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:103 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:107 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:104 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:105 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:102 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded 
Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:111 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:108 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:109 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:106 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:110 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:107 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:108 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:105 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:104 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:101 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:98 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:95 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:99 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:96 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:97 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:94 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:103 +; ALIGNED-NEXT: flat_store_byte 
v[5:6], v0 offset:100 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:101 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:98 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:102 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:99 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:100 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:97 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:96 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:93 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 @@ -4891,52 +4953,52 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:90 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:87 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:91 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:88 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:89 +; ALIGNED-NEXT: flat_store_byte v[5:6], 
v0 offset:86 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:95 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:92 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:93 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:90 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:94 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:91 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:92 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:89 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:88 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:85 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:82 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:79 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:83 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:80 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:81 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:78 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:87 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:84 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:85 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:82 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:86 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:83 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:84 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:81 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:80 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:77 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 @@ -4951,49 +5013,49 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:74 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:71 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:75 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:72 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:73 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:70 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:79 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:76 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:77 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:74 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:78 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:75 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:76 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:73 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:72 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:69 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:66 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:63 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:67 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:64 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:65 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:62 ; ALIGNED-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:71 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:68 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:69 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:66 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:70 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:67 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:68 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:65 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:64 @@ -5011,52 +5073,52 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:61 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:58 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:58 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:55 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:59 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:56 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:640 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:57 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:54 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:63 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:60 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:59 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:62 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:57 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:60 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:56 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:53 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:53 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:50 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:50 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:47 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:51 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:48 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 
4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:49 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:46 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:55 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:52 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:54 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:51 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:52 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:49 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:48 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:45 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 @@ -5071,56 +5133,56 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:43 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:40 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:42 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 
offset:39 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:41 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:38 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:40 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:47 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:37 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:46 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:44 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:43 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:45 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:42 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:44 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:41 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:35 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:32 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:532 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:34 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:31 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:33 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:30 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:32 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:39 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:36 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:38 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:35 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:37 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:34 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:36 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:33 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload @@ -5129,46 +5191,46 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:384 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:23 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:27 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:24 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:25 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:22 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:31 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:28 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:29 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:30 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:27 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:28 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:25 ; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[3:4], v77 offset:18 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:21 +; ALIGNED-NEXT: flat_store_byte v[5:6], v90 offset:15 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:17 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:16 +; ALIGNED-NEXT: flat_store_byte v[5:6], v89 offset:14 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:23 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:20 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:21 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:18 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:22 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:19 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:16 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:17 +; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:16 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 @@ -5178,42 +5240,42 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr 
addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:11 +; ALIGNED-NEXT: flat_store_byte v[5:6], v109 offset:7 +; ALIGNED-NEXT: flat_store_byte v[5:6], v121 offset:8 +; ALIGNED-NEXT: flat_store_byte v[5:6], v127 offset:10 +; ALIGNED-NEXT: flat_store_byte v[5:6], v111 offset:6 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 -; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:9 -; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:15 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:12 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:14 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:11 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:12 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:9 ; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:8 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; 
ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:3 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:1 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:7 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:4 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:5 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:2 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:6 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:3 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:4 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload @@ -5346,62 +5408,68 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; CHECK-NEXT: s_mov_b64 
s[4:5], 0 ; CHECK-NEXT: .LBB5_2: ; %memmove_fwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v52, vcc_lo, v2, s4 +; CHECK-NEXT: v_add_co_ci_u32_e64 v53, null, s5, v3, vcc_lo +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[52:53] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[52:53] +; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v52, 48 +; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v53, vcc_lo +; CHECK-NEXT: v_add_co_u32 v80, vcc_lo, v52, 0x60 +; CHECK-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v53, vcc_lo +; CHECK-NEXT: v_add_co_u32 v84, vcc_lo, 0x50, v52 +; CHECK-NEXT: v_add_co_ci_u32_e64 v85, null, 0, v53, vcc_lo +; CHECK-NEXT: s_clause 0xd +; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[52:53] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[52:53] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[80:81] offset:128 +; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[80:81] offset:144 +; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[80:81] offset:96 +; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[80:81] offset:112 +; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[80:81] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[80:81] offset:80 +; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[52:53] offset:128 +; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[80:81] offset:48 +; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[80:81] +; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[80:81] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[84:85] +; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[96:97] offset:224 -; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[96:97] offset:240 -; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[96:97] offset:192 -; 
CHECK-NEXT: flat_load_dwordx4 v[16:19], v[96:97] offset:208 -; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[96:97] offset:160 -; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[96:97] offset:176 -; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[96:97] offset:128 -; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[96:97] offset:144 -; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[96:97] offset:96 -; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[96:97] offset:112 -; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[96:97] offset:64 -; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[96:97] offset:80 -; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[96:97] offset:32 -; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[96:97] offset:48 -; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[96:97] -; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] offset:16 ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:224 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:240 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:192 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:208 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo +; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[20:23] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; 
CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[32:35] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[84:87] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 -; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[96:99] ; CHECK-NEXT: s_cbranch_scc1 .LBB5_2 ; CHECK-NEXT: .LBB5_3: ; %Flow5 ; CHECK-NEXT: 
s_andn2_saveexec_b32 s8, s6 @@ -5412,62 +5480,71 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; CHECK-NEXT: s_mov_b32 s7, -1 ; CHECK-NEXT: .LBB5_5: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 +; CHECK-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo +; CHECK-NEXT: s_clause 0x4 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[24:25] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[24:25] +; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:128 +; CHECK-NEXT: v_add_co_u32 v80, vcc_lo, v24, 0x60 +; CHECK-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v25, vcc_lo +; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v24, 48 +; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v25, vcc_lo +; CHECK-NEXT: v_add_co_u32 v84, vcc_lo, 0x50, v24 +; CHECK-NEXT: v_add_co_ci_u32_e64 v85, null, 0, v25, vcc_lo +; CHECK-NEXT: s_clause 0xa +; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[80:81] offset:128 +; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[80:81] offset:144 +; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[80:81] offset:96 +; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[80:81] offset:112 +; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[80:81] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[80:81] offset:80 +; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[80:81] offset:48 +; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[80:81] +; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[80:81] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[84:85] +; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_dwordx4 v[4:7], 
v[96:97] offset:224 -; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[96:97] offset:240 -; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[96:97] offset:192 -; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[96:97] offset:208 -; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[96:97] offset:160 -; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[96:97] offset:176 -; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[96:97] offset:128 -; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[96:97] offset:144 -; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[96:97] offset:96 -; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[96:97] offset:112 -; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[96:97] offset:64 -; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[96:97] offset:80 -; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[96:97] offset:32 -; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[96:97] offset:48 -; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[96:97] -; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] offset:16 ; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 ; CHECK-NEXT: s_addc_u32 s5, s5, -1 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] ; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:160 +; 
CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[32:35] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[84:87] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 -; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; CHECK-NEXT: 
flat_store_dwordx4 v[102:103], v[96:99] ; CHECK-NEXT: s_cbranch_scc0 .LBB5_5 ; CHECK-NEXT: .LBB5_6: ; %Flow6 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 @@ -5477,14 +5554,23 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-LABEL: memmove_p0_p0_sz2048: ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_mov_b32 s4, exec_lo ; ALIGNED-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4 @@ -5493,609 +5579,635 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 ; ALIGNED-NEXT: .LBB5_2: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, s5, v3, vcc_lo -; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[20:21] offset:240 -; ALIGNED-NEXT: flat_load_dwordx4 v[22:25], v[20:21] offset:224 -; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[20:21] -; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[20:21] offset:16 -; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[20:21] offset:32 -; ALIGNED-NEXT: flat_load_dwordx4 v[98:101], v[20:21] offset:48 -; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[20:21] offset:64 -; ALIGNED-NEXT: flat_load_dwordx4 v[82:85], v[20:21] offset:80 -; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[20:21] offset:96 -; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[20:21] offset:112 -; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[20:21] offset:128 -; ALIGNED-NEXT: flat_load_dwordx4 v[50:53], v[20:21] offset:144 -; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[20:21] offset:160 -; ALIGNED-NEXT: flat_load_dwordx4 
v[34:37], v[20:21] offset:176 -; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[20:21] offset:192 -; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[20:21] offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v6, vcc_lo, v4, 48 +; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[4:5] offset:128 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v8, vcc_lo, 0x50, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0x60, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, 0x70, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, 0x90, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v38, vcc_lo, 0xa0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v39, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v52, vcc_lo, 0xb0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v53, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v54, vcc_lo, 0xc0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v55, null, 0, v5, vcc_lo +; ALIGNED-NEXT: 
v_add_co_u32 v64, vcc_lo, 0xd0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v65, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v70, vcc_lo, 0xe0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v71, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v80, vcc_lo, 0xf0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v5, vcc_lo +; ALIGNED-NEXT: s_clause 0xe +; ALIGNED-NEXT: flat_load_dwordx4 v[34:37], v[4:5] +; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[4:5] offset:16 +; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[4:5] offset:32 +; ALIGNED-NEXT: flat_load_dwordx4 v[48:51], v[4:5] offset:64 +; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[6:7] +; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] +; ALIGNED-NEXT: flat_load_dwordx4 v[82:85], v[38:39] +; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[52:53] +; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[54:55] +; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[64:65] +; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[70:71] +; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[80:81] +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:254 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:252 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:250 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:248 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:246 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:244 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:242 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 
offset:240 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(22) -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:172 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 +; ALIGNED-NEXT: buffer_load_dword v102, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v103, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: v_add_co_u32 v22, vcc_lo, v24, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v25, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v24, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v25, vcc_lo ; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:238 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:236 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v102 offset:136 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:137 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:234 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v103 offset:132 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:133 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:230 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v56 offset:128 +; ALIGNED-NEXT: flat_store_byte v[20:21], v56 offset:129 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:226 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:224 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v57 offset:124 +; ALIGNED-NEXT: flat_store_byte v[24:25], v57 offset:128 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(19) +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:222 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:220 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:218 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:216 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:214 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:212 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:210 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:208 -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v101 offset:72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:73 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v100 offset:68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:69 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v99 offset:64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:65 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v98 offset:60 +; ALIGNED-NEXT: flat_store_byte v[24:25], v98 offset:64 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:332 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; 
ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:206 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:202 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:198 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:194 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:192 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v29 offset:36 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:37 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v28 offset:40 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:41 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v27 offset:28 +; ALIGNED-NEXT: flat_store_byte v[24:25], v27 offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v26 offset:32 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:33 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], 
s32 offset:272 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:190 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:188 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:186 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:184 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:182 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:180 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:178 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:176 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v33 offset:24 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:25 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v32 offset:20 +; ALIGNED-NEXT: flat_store_byte 
v[20:21], v32 offset:21 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v31 offset:16 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v30 offset:12 +; ALIGNED-NEXT: flat_store_byte v[24:25], v30 offset:16 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:300 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:288 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:174 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:172 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v37 offset:8 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:9 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:170 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:168 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v36 offset:4 +; ALIGNED-NEXT: flat_store_byte v[24:25], v36 offset:8 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:166 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:164 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v35 +; ALIGNED-NEXT: flat_store_byte v[24:25], v35 
offset:4 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:162 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:160 -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[24:25], v34 offset:2 +; ALIGNED-NEXT: flat_store_byte v[24:25], v34 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(40) +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v49 offset:248 +; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:249 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v48 offset:244 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:245 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v39 offset:240 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:241 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[22:23], v38 offset:236 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:237 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v53 offset:232 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:233 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v52 offset:228 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:229 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v51 offset:224 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:225 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v50 offset:220 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:221 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v65 offset:216 +; ALIGNED-NEXT: 
flat_store_byte v[20:21], v65 offset:217 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v64 offset:212 +; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:213 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v55 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:209 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v54 offset:204 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:205 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:96 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:158 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:156 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v69 offset:200 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:201 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:154 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:152 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v68 offset:196 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:197 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:150 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v67 offset:192 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:193 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[16:17], v50 offset:146 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:144 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v66 offset:188 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:189 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:188 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:124 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v81 offset:184 +; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:185 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v80 offset:180 +; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:181 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v71 offset:176 +; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:177 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v70 offset:172 +; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:173 +; 
ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v85 offset:168 +; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:169 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v84 offset:164 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:165 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v83 offset:160 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:161 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v82 offset:156 +; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:157 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v103 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:140 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:138 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v57 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v57 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:136 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:134 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:132 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:130 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:128 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:126 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v97 offset:152 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:153 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v96 offset:148 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:149 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v87 offset:144 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:145 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v86 offset:140 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:141 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v103, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v34 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:142 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:140 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v19 offset:120 +; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:121 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:138 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:136 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v18 offset:116 +; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:117 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:134 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v17 offset:112 +; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:113 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:130 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:128 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v16 offset:108 +; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:109 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 
offset:260 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:126 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:124 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:122 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:120 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:118 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:116 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:114 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:112 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:110 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:108 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:106 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:104 -; 
ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:102 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:100 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:96 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v15 offset:104 +; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:105 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v14 offset:100 +; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:101 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v13 offset:96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:97 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v12 offset:92 +; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:93 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:164 
-; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:94 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:92 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:90 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:88 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:84 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:80 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:78 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:76 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:74 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:72 -; ALIGNED-NEXT: 
s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:68 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:64 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:62 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:60 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:58 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:56 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:52 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v13, off, 
s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:28 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:24 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:20 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 -; 
ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:76 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:74 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:70 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:66 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:62 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v11 offset:88 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:89 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v10 offset:84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:85 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v9 offset:80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:81 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v8 offset:76 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:77 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 -; 
ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v29 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v29 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v28 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v27 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v26 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:243 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v33 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v32 +; ALIGNED-NEXT: 
flat_store_byte v[20:21], v101 offset:36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v32 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v31 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v30 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:233 +; ALIGNED-NEXT: flat_store_byte v[24:25], v114 offset:1 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v35 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v34 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v49 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:40 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v39 +; ALIGNED-NEXT: 
flat_store_byte v[20:21], v29 offset:38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v39 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v38 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:44 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v38 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:42 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v53 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v52 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v65 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v64 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 +; 
ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:222 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v54 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v54 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v69 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v68 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v66 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v66 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v81 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v80 +; ALIGNED-NEXT: flat_store_byte v[20:21], 
v115 offset:12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v80 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v71 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:10 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v71 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v70 +; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v70 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v85 +; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v85 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:185 +; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v84 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:2 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 24, v83 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 24, v82 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v82 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v97 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:252 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v96 +; ALIGNED-NEXT: flat_store_byte 
v[20:21], v49 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v87 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:248 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v87 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v86 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:246 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v86 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:244 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:242 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:238 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; ALIGNED-NEXT: 
flat_store_byte v[20:21], v28 offset:236 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v15 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v14 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:159 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v13 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v13 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v12 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v12 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:228 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v11 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:147 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:145 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:143 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:141 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:139 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v64 offset:137 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:133 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:131 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:129 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:127 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:125 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:123 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:121 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:119 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:117 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:115 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:113 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:111 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:109 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:107 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:105 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:103 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:95 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:93 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:91 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:89 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:79 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:77 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:75 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:73 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 
offset:69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:63 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:61 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:59 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:57 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:43 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:47 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:25 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:17 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:220 +; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:218 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:216 +; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:214 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:212 +; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:210 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:206 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:204 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:202 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:200 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:198 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:196 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:194 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:192 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:190 +; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:188 +; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:186 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:184 +; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:182 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:180 +; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:178 +; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:176 +; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:174 +; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:172 +; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:170 +; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:168 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:166 +; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:164 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:162 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:160 +; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:158 +; ALIGNED-NEXT: flat_store_byte 
v[20:21], v114 offset:156 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:154 +; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:152 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:150 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:148 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:146 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:144 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:142 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:124 +; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:122 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:120 +; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:118 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:116 +; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:114 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:112 +; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:110 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:108 +; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:106 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:104 +; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:102 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:100 +; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:98 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:94 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:92 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:90 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:88 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:86 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:82 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:78 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v7 offset:56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:57 +; ALIGNED-NEXT: s_waitcnt 
vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v6 offset:52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:53 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v5 offset:48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:49 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v4 offset:44 +; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:45 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 @@ -6104,14 +6216,14 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:7 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:5 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:3 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:60 +; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:58 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:54 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:50 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:46 ; ALIGNED-NEXT: s_cbranch_scc1 .LBB5_2 ; ALIGNED-NEXT: .LBB5_3: ; %Flow5 ; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 @@ -6121,610 +6233,636 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 ; ALIGNED-NEXT: s_mov_b32 s7, 
-1 ; ALIGNED-NEXT: .LBB5_5: ; %memmove_bwd_loop -; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo -; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:240 -; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:224 -; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[24:25] -; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:16 -; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[24:25] offset:32 -; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] offset:48 -; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[24:25] offset:64 -; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[24:25] offset:80 -; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[24:25] offset:96 -; ALIGNED-NEXT: flat_load_dwordx4 v[32:35], v[24:25] offset:112 -; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[24:25] offset:128 -; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[24:25] offset:144 -; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[24:25] offset:160 -; ALIGNED-NEXT: flat_load_dwordx4 v[81:84], v[24:25] offset:176 -; ALIGNED-NEXT: flat_load_dwordx4 v[96:99], v[24:25] offset:192 -; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[24:25] offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, 
v1, vcc_lo -; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 -; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:254 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:252 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:250 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:248 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:246 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:244 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:242 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:240 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(22) -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:348 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:348 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:238 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:236 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:234 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:232 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:230 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:228 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:226 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v36 offset:224 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:222 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:220 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:218 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:216 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:214 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:212 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:210 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:208 -; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:308 
-; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:206 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:202 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:198 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:194 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:192 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:396 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:396 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:190 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:188 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:186 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:184 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:182 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:180 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:178 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:176 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:400 -; 
ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:412 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:412 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:174 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:172 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:170 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:168 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:166 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:164 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:162 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:160 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:364 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:158 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:154 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v83 offset:152 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:150 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:148 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:146 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:144 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:380 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:380 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:142 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:138 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:134 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:132 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:130 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:128 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:460 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:460 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], 
s32 offset:456 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:126 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:124 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:122 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:120 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:118 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:116 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:476 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:476 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:110 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:108 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:106 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:102 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:100 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:96 
-; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:428 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:428 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:94 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:92 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:90 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:88 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:84 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:80 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 +; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v6, vcc_lo, v4, 48 +; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[4:5] offset:128 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v8, vcc_lo, 0x50, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0x60, v4 +; 
ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, 0x70, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, 0x90, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v52, vcc_lo, 0xa0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v53, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v54, vcc_lo, 0xb0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v55, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v64, vcc_lo, 0xc0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v65, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v66, vcc_lo, 0xd0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v67, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v68, vcc_lo, 0xe0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v69, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v70, vcc_lo, 0xf0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v71, null, 0, v5, vcc_lo +; ALIGNED-NEXT: s_clause 0xe +; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[4:5] +; ALIGNED-NEXT: flat_load_dwordx4 v[35:38], v[4:5] offset:16 +; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[4:5] offset:32 +; ALIGNED-NEXT: flat_load_dwordx4 v[48:51], v[4:5] offset:64 +; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[6:7] +; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] +; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[52:53] +; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[54:55] +; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[64:65] +; ALIGNED-NEXT: flat_load_dwordx4 v[55:58], v[66:67] +; ALIGNED-NEXT: flat_load_dwordx4 v[59:62], v[68:69] +; ALIGNED-NEXT: flat_load_dwordx4 v[68:71], v[70:71] +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v1, vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 +; 
ALIGNED-NEXT: s_addc_u32 s5, s5, -1 +; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:428 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: buffer_load_dword v102, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_load_dword v103, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: v_add_co_u32 v22, vcc_lo, v24, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v25, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v24, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v25, vcc_lo +; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:78 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:76 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v102 offset:136 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:137 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:74 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:72 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v103 offset:132 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:133 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v63 offset:128 +; 
ALIGNED-NEXT: flat_store_byte v[20:21], v63 offset:129 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:64 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v72 offset:124 +; ALIGNED-NEXT: flat_store_byte v[24:25], v72 offset:128 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(19) +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:492 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:524 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:62 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:60 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:58 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:56 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[16:17], v20 offset:54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:52 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:532 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v101 offset:72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:73 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v100 offset:68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:69 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v99 offset:64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:65 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v98 offset:60 +; ALIGNED-NEXT: flat_store_byte v[24:25], v98 offset:64 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:576 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:584 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:588 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_load_dword v12, off, 
s[0:3], s32 offset:532 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:588 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:580 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v39 offset:36 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:37 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v34 offset:40 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:41 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v33 offset:28 +; ALIGNED-NEXT: flat_store_byte v[24:25], v33 offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v32 offset:32 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:33 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_store_dword 
v37, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:540 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v67 offset:24 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:25 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v66 offset:20 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:21 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v53 offset:16 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v52 offset:12 +; ALIGNED-NEXT: flat_store_byte v[24:25], v52 offset:16 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:556 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:556 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v86 offset:8 +; ALIGNED-NEXT: 
flat_store_byte v[20:21], v86 offset:9 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v82 offset:4 +; ALIGNED-NEXT: flat_store_byte v[24:25], v82 offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v81 +; ALIGNED-NEXT: flat_store_byte v[24:25], v81 offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[24:25], v80 offset:2 +; ALIGNED-NEXT: flat_store_byte v[24:25], v80 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(40) +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v97 offset:248 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:249 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v87 offset:244 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:245 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v83 offset:240 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:241 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v68 offset:236 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:237 +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], 
s32 offset:396 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v96 offset:232 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:233 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v84 offset:228 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:229 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v69 offset:224 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:225 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v54 offset:220 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:221 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v85 offset:216 +; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:217 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v70 offset:212 +; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:213 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v55 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v55 
offset:209 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v48 offset:204 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:205 +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v71 offset:200 +; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:201 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v64 offset:196 +; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:197 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v49 offset:192 +; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:193 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v35 offset:188 +; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:189 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) 
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v65 offset:184 +; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:185 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v50 offset:180 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:181 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v36 offset:176 +; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:177 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v29 offset:172 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:173 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v51 offset:168 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:169 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v37 offset:164 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:165 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v30 offset:160 +; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:161 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v27 offset:156 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:157 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_store_dword v114, 
off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v63 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:140 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v63 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:138 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v72 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:136 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:134 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:132 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:130 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:128 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:126 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v38 offset:152 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:153 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v31 offset:148 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:149 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v28 offset:144 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:145 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v26 offset:140 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:141 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: 
buffer_store_dword v17, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v80 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:28 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v19 offset:120 +; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:121 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:24 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v18 offset:116 +; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:117 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:20 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v17 offset:112 +; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:113 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v8 
offset:18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v16 offset:108 +; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:109 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:524 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v64 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: flat_store_byte 
v[16:17], v112 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v67 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v15 offset:104 +; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:105 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v14 offset:100 +; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:101 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v13 offset:96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:97 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v12 offset:92 +; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:93 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:76 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:74 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:70 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:66 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:64 +; ALIGNED-NEXT: flat_store_byte 
v[20:21], v98 offset:62 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v11 offset:88 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:89 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v10 offset:84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:85 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v9 offset:80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:81 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v8 offset:76 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:77 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:564 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:568 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:572 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:572 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:568 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:564 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:560 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:243 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, 
v86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v85 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[24:25], v114 offset:1 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v81 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:223 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v82 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:40 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v83 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:44 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v68 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:42 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v69 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v54 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:222 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v48 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v35 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:10 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v36 
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:2 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: flat_store_byte v[20:21], v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:252 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:248 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v28 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:246 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:244 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v19 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v29 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v97 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:242 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v18 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:238 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:236 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v15 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v14 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:159 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v13 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v12 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v12 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v11 +; 
ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:228 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v11 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:147 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:145 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:143 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:141 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:139 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:137 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:133 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:131 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:129 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:127 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:125 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:121 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:119 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:117 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:115 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:113 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:111 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:109 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:107 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:105 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 
offset:103 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:95 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:93 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:91 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:89 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:79 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:77 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:75 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:73 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:63 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:61 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:59 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:57 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:43 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:47 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:39 -; ALIGNED-NEXT: flat_store_byte v[16:17], 
v12 offset:37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:31 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:25 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:17 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:220 +; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:218 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:216 +; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:214 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:212 +; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:210 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:206 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:204 +; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:202 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:200 +; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:198 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:196 +; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:194 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:192 +; ALIGNED-NEXT: flat_store_byte v[20:21], v35 
offset:190 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:188 +; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:186 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:184 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:182 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:180 +; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:178 +; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:176 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:174 +; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:172 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:170 +; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:168 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:166 +; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:164 +; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:162 +; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:160 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:158 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:156 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:154 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:152 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:150 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:148 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:146 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:144 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:142 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:124 +; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:122 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:120 +; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:118 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:116 +; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:114 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:112 +; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:110 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:108 +; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:106 +; 
ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:104 +; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:102 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:100 +; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:98 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:94 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:92 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:90 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:88 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:86 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:82 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:78 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v7 offset:56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:57 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v6 offset:52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:53 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v5 offset:48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:49 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v4 offset:44 +; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:45 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 @@ -6733,26 +6871,35 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:11 -; ALIGNED-NEXT: flat_store_byte 
v[16:17], v6 offset:9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:7 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:5 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:3 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:60 +; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:58 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:54 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:50 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:46 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB5_5 ; ALIGNED-NEXT: .LBB5_6: ; %Flow6 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload -; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: s_clause 0x10 ; 68-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: 
buffer_load_dword v56, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -8369,34 +8516,36 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:32 ; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off offset:16 ; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[4:7] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[8:11] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[12:15] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[16:19] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[20:23] offset:128 ; CHECK-NEXT: 
s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] ; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) @@ -8435,34 +8584,36 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:32 ; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off offset:16 ; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 ; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, -1 ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[4:7] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: 
flat_store_dwordx4 v[100:101], v[8:11] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[8:11] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[12:15] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[16:19] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[20:23] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] ; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) @@ -8489,11 +8640,11 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, 
ptr addrspace(4 ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v84, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v85, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:224 ; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 ; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 ; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 @@ -8508,473 +8659,477 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32 ; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 ; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; ALIGNED-NEXT: v_add_co_u32 v86, vcc_lo, v84, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v87, null, 0, v85, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v84, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v85, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: 
buffer_store_dword v99, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v100 offset:244 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v101 offset:248 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:249 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:245 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v99 offset:240 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:241 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v98 offset:236 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 8, v98 +; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v114 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v114 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v114 offset:250 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v115 offset:254 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:248 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:246 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:252 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:250 +; ALIGNED-NEXT: flat_store_byte v[96:97], v116 offset:244 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v115 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:252 +; ALIGNED-NEXT: flat_store_byte v[96:97], v117 offset:242 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v115 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:248 +; ALIGNED-NEXT: flat_store_byte v[96:97], v118 offset:240 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v113 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], 
v113 offset:246 +; ALIGNED-NEXT: flat_store_byte v[96:97], v119 offset:238 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v113 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:244 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v112 offset:242 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:240 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v114 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v115 offset:232 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:229 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v113 offset:224 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:225 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v112 offset:220 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:221 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v112 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v112 -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v84 -; ALIGNED-NEXT: flat_store_byte 
v[96:97], v113 offset:243 ; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v82 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:234 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:238 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:236 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:232 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:230 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:228 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:226 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:224 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v83 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:236 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v81 +; ALIGNED-NEXT: 
flat_store_byte v[96:97], v102 offset:228 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v80 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:224 ; ALIGNED-NEXT: s_waitcnt vmcnt(12) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:222 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70 ; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:136 ; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:140 ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:132 ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:218 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:222 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:220 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:216 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:214 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:212 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:210 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:208 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v68 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v82 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[86:87], v83 offset:216 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:217 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:213 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v81 offset:208 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:209 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v80 offset:204 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v69 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:216 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:214 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v68 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:220 ; ALIGNED-NEXT: s_waitcnt vmcnt(11) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:211 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v66 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:218 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v99, 8, v66 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:212 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:210 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v67 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:208 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:206 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:152 ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:156 ; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:148 ; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:202 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:206 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:204 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:198 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:196 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:207 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v70 offset:196 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v71 offset:200 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:201 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:197 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v69 offset:192 +; 
ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:193 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v68 offset:188 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v64 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v54 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:200 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:198 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v55 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:204 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v52 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:202 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:196 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:194 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:192 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v50 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:190 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v50 ; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:232 ; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 
offset:236 ; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:228 ; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:186 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:190 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:188 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:184 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:182 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:180 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:178 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v66 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v67 offset:184 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:185 +; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:181 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v65 offset:176 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:177 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v64 offset:172 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:184 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:182 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v48 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:188 ; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:185 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v83, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:179 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v38 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:186 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v38 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:180 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v39 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:178 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v39 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:176 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v37 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:174 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v37 ; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 ; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:248 ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:244 ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v29 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v54 offset:164 +; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:165 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v55 offset:168 +; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:169 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v52 offset:156 +; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:157 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v53 offset:160 +; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v34 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:168 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 
offset:166 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v35 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:172 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:170 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v33 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:160 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:158 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:164 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:162 ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:154 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:158 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:156 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:152 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:150 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:148 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:146 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:144 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v28 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:150 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v30 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v50 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v51 offset:152 +; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:153 +; 
ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:149 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v49 offset:144 +; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:145 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v48 offset:140 +; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:141 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v26 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:155 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:153 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:159 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:157 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:151 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:149 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:147 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:145 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:154 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:148 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v27 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:146 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:144 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v25 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:142 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v25 ; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:216 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 
offset:212 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:136 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:134 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:133 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:131 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:129 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v38 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v39 offset:136 +; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:137 +; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:133 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v37 offset:128 +; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:129 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v36 offset:124 +; ALIGNED-NEXT: flat_store_byte v[84:85], v36 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v22 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:136 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:134 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:140 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:138 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:132 +; 
ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:130 +; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v18 +; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:126 ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 ; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:122 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:126 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:124 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:120 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:118 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:116 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:121 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:127 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:125 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:119 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:117 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:115 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v34 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v35 offset:120 +; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:121 +; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:117 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v33 offset:112 +; ALIGNED-NEXT: flat_store_byte 
v[96:97], v33 offset:113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v32 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:109 +; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:120 +; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:118 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:122 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:116 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:114 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:112 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:110 ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:111 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:109 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v30 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v31 offset:104 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:105 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:101 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v29 offset:96 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:97 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v28 offset:92 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:93 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v30, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:96 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:94 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v6 ; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:103 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 
v33, 8, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v11 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:107 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:105 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v7 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:104 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:106 +; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:100 +; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:98 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:8 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:12 ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:91 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:89 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v4 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:90 -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v27 offset:94 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:95 -; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:92 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:93 -; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:88 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:83 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:80 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:88 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v7 
+; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:92 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v5 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:90 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v5 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v4 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v26 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v27 offset:88 +; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:89 +; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:85 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v25 offset:80 +; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:81 +; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:80 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v24 offset:76 +; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:78 +; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:77 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:24 ; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:28 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:20 ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:74 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:75 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:73 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:78 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:79 -; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:76 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:77 -; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:72 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:71 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:68 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:67 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v20 offset:64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v22 offset:68 +; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:72 +; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:70 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v23 offset:72 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:76 +; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:73 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:74 +; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:69 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v21 offset:64 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:66 +; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:65 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v20 offset:60 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:62 +; ALIGNED-NEXT: flat_store_byte v[84:85], v20 offset:64 ; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:58 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:59 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:57 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:62 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:63 -; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:60 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:61 -; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:56 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:55 -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v17 offset:54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:51 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v18 offset:52 +; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:56 +; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:54 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v19 offset:56 +; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:60 +; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:57 +; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:58 +; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:53 +; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v17 offset:48 +; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:50 +; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:49 +; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v16 offset:44 +; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:46 +; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:45 ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:42 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:43 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:41 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:46 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:47 -; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:44 -; ALIGNED-NEXT: flat_store_byte 
v[96:97], v87 offset:45 -; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:40 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:39 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:35 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v12 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v14 offset:36 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:40 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:38 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v15 offset:40 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:44 +; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:41 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:42 +; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:37 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:36 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v13 offset:32 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:34 +; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:33 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v12 offset:28 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:30 +; ALIGNED-NEXT: flat_store_byte v[84:85], v12 offset:32 ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76 ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:27 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:25 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 
offset:30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:28 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:24 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:23 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:21 -; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:20 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:19 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:17 -; ALIGNED-NEXT: flat_store_byte v[96:97], v8 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v10 offset:20 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:24 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:22 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v11 offset:24 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:28 +; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:25 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:26 +; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:21 +; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:20 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v9 offset:16 +; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:18 +; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:17 +; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v8 offset:12 +; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:14 +; ALIGNED-NEXT: flat_store_byte v[84:85], v8 offset:16 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte 
v[96:97], v70 offset:11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:9 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:15 -; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:12 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:13 -; ALIGNED-NEXT: flat_store_byte v[96:97], v6 offset:8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:7 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v5 offset:4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:3 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:1 -; ALIGNED-NEXT: flat_store_byte v[96:97], v4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v6 offset:4 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v7 offset:8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:12 +; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:9 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:10 +; ALIGNED-NEXT: flat_store_byte v[84:85], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v5 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v99 offset:1 +; ALIGNED-NEXT: flat_store_byte v[84:85], v4 ; ALIGNED-NEXT: s_cbranch_scc1 .LBB7_2 ; ALIGNED-NEXT: .LBB7_3: ; %Flow6 ; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 @@ -8987,11 +9142,11 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: 
v_add_co_u32 v4, vcc_lo, v2, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v84, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v85, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_clause 0xf ; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:224 ; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 ; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 ; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 @@ -9006,6 +9161,10 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32 ; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 ; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v84, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v85, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v86, vcc_lo, v84, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v87, null, 0, v85, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 ; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) @@ -9013,465 +9172,465 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:428 ; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:420 ; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:250 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:254 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:252 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:248 -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v99 offset:246 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:244 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:242 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:240 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:248 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:249 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:245 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:240 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:241 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:236 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v100 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 8, v98 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:248 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:246 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v87 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v103 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v114 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v114 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v115 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v115 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v113 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v113 +; ALIGNED-NEXT: flat_store_byte v[86:87], v116 offset:244 +; ALIGNED-NEXT: flat_store_byte v[86:87], v117 offset:242 +; ALIGNED-NEXT: flat_store_byte v[86:87], v118 offset:240 +; ALIGNED-NEXT: flat_store_byte v[86:87], v119 offset:238 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v114 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v115 offset:232 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:233 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:229 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v113 offset:224 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:225 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v112 offset:220 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v112 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v112 ; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v99, 24, v82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v82 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:234 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:238 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:236 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:232 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:230 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:228 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:226 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:224 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v83 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:236 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v81 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:228 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v80 
+; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:224 ; ALIGNED-NEXT: s_waitcnt vmcnt(12) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:237 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:222 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v69 ; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:392 ; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:396 ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:388 ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:218 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:222 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:220 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:216 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:214 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:212 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:210 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:208 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v68 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:216 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:217 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:213 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], 
v81 offset:208 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:209 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:204 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v69 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:216 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:214 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v68 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:220 ; ALIGNED-NEXT: s_waitcnt vmcnt(11) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:211 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v66 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:218 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v66 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:212 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:210 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v67 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:208 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:206 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:408 ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:412 ; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:404 ; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:202 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:206 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:204 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:198 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:196 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:207 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:196 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:200 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:201 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:197 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:192 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:193 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:188 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:189 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v68, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v64 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v54 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:200 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:198 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v55 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:204 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v52 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:202 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:196 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:194 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:192 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v50 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:190 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v50 ; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:488 ; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:492 ; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:484 ; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[96:97], v66 offset:186 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:190 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:188 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:184 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:182 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:180 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:178 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:187 -; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:179 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:184 +; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:185 +; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:181 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:176 +; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:177 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:172 +; 
ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:184 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:182 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:188 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:186 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:180 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:178 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:176 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:174 ; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:508 ; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:504 ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:500 ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v69 offset:173 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:167 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v29 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:164 +; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:165 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:168 +; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:169 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:156 +; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:157 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:160 +; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:161 +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v34 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:168 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:166 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v35 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:172 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v113, 24, v33 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:170 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v33 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:160 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:158 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:164 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:162 ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:456 ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:460 ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:452 ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:154 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:158 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:156 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:152 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:150 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:148 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:146 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:144 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v28 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:150 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v30 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:152 +; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:153 +; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:149 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:144 +; ALIGNED-NEXT: 
flat_store_byte v[86:87], v49 offset:145 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:140 +; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:141 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:155 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:153 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:159 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:157 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:151 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:149 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:147 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:145 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:154 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:148 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v27 +; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:146 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:144 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:142 ; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:472 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:468 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:136 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:134 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:135 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:133 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:131 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:129 +; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v27 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:136 +; ALIGNED-NEXT: flat_store_byte v[86:87], v39 offset:137 +; ALIGNED-NEXT: flat_store_byte v[86:87], v38 offset:133 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:128 +; ALIGNED-NEXT: flat_store_byte v[86:87], v37 offset:129 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:124 +; ALIGNED-NEXT: flat_store_byte v[84:85], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:136 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:134 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:140 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:138 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:132 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:130 +; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:126 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v18 ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; ALIGNED-NEXT: 
buffer_store_dword v35, off, s[0:3], s32 offset:300 ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:122 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:126 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:124 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:120 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:118 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:116 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:121 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:127 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:125 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:119 -; ALIGNED-NEXT: flat_store_byte v[96:97], v116 offset:117 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:115 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:120 +; ALIGNED-NEXT: flat_store_byte v[86:87], v35 offset:121 +; ALIGNED-NEXT: flat_store_byte v[86:87], v34 offset:117 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:112 +; ALIGNED-NEXT: flat_store_byte v[86:87], v33 offset:113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:108 +; ALIGNED-NEXT: flat_store_byte v[86:87], v32 offset:109 +; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:120 +; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:118 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:124 +; ALIGNED-NEXT: 
s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:122 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:116 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:114 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:112 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:110 ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:109 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:104 +; ALIGNED-NEXT: flat_store_byte v[86:87], v31 offset:105 +; ALIGNED-NEXT: flat_store_byte v[86:87], v30 offset:101 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:96 +; ALIGNED-NEXT: flat_store_byte v[86:87], v29 offset:97 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:92 +; ALIGNED-NEXT: flat_store_byte v[86:87], v28 offset:93 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:96 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v10 +; ALIGNED-NEXT: 
flat_store_byte v[86:87], v115 offset:94 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v6 ; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v12 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v29, 8, v11 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:107 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:105 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v7 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:104 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:102 +; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:108 +; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:106 +; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:100 +; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:98 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:268 ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:260 ; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:90 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:94 -; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:92 -; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:88 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:84 -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v24 offset:82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v4 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:91 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:89 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:95 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:93 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:83 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:81 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:88 +; ALIGNED-NEXT: flat_store_byte v[86:87], v27 offset:89 +; ALIGNED-NEXT: flat_store_byte v[86:87], v26 offset:85 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v25 offset:81 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:76 +; ALIGNED-NEXT: flat_store_byte v[86:87], v24 offset:77 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v11 
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v5 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:88 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:92 +; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:90 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:84 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:82 +; ALIGNED-NEXT: flat_store_byte v[86:87], v36 offset:80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v37 offset:78 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:276 ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:74 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:78 -; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:76 -; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:72 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:68 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v20 offset:64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:75 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:73 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:79 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:77 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:69 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v66 offset:67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:65 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:72 +; ALIGNED-NEXT: flat_store_byte v[86:87], v23 offset:73 +; ALIGNED-NEXT: flat_store_byte v[86:87], v22 offset:69 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:64 +; ALIGNED-NEXT: flat_store_byte v[86:87], v21 offset:65 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:60 +; ALIGNED-NEXT: flat_store_byte v[84:85], v20 offset:64 +; ALIGNED-NEXT: flat_store_byte v[86:87], v38 offset:72 +; ALIGNED-NEXT: flat_store_byte v[86:87], v39 offset:70 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:76 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:74 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:68 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:66 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:64 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:62 ; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:360 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:364 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:356 ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:59 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:58 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:57 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:62 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:63 -; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:60 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:61 -; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:56 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:55 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:53 -; ALIGNED-NEXT: flat_store_byte 
v[96:97], v17 offset:52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:51 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:56 +; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:54 +; ALIGNED-NEXT: flat_store_byte v[86:87], v32 offset:60 +; ALIGNED-NEXT: flat_store_byte v[86:87], v33 offset:58 +; ALIGNED-NEXT: flat_store_byte v[86:87], v34 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:56 +; ALIGNED-NEXT: flat_store_byte v[86:87], v19 offset:57 +; ALIGNED-NEXT: flat_store_byte v[86:87], v18 offset:53 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v35 offset:50 +; ALIGNED-NEXT: flat_store_byte v[86:87], v17 offset:49 +; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:44 +; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:46 +; ALIGNED-NEXT: flat_store_byte v[86:87], v16 offset:45 ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:42 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:43 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:41 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:46 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:47 -; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:44 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:45 -; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:40 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:39 
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:35 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v12 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:36 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:40 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:38 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:40 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:44 +; ALIGNED-NEXT: flat_store_byte v[86:87], v15 offset:41 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:42 +; ALIGNED-NEXT: flat_store_byte v[86:87], v14 offset:37 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:36 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:32 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:34 +; ALIGNED-NEXT: flat_store_byte v[86:87], v13 offset:33 +; ALIGNED-NEXT: flat_store_byte v[86:87], v28 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:28 +; ALIGNED-NEXT: flat_store_byte v[86:87], v29 offset:30 +; ALIGNED-NEXT: flat_store_byte v[84:85], v12 offset:32 ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:328 ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:332 ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:324 ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:27 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:25 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:28 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:24 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:23 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:21 -; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:20 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:19 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:17 -; ALIGNED-NEXT: flat_store_byte v[96:97], v8 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:20 +; ALIGNED-NEXT: flat_store_byte v[86:87], v30 offset:24 +; ALIGNED-NEXT: flat_store_byte v[86:87], v31 offset:22 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:24 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:28 +; ALIGNED-NEXT: flat_store_byte v[86:87], v11 offset:25 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:26 +; ALIGNED-NEXT: flat_store_byte v[86:87], v10 offset:21 +; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:20 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:16 +; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:18 +; ALIGNED-NEXT: flat_store_byte v[86:87], v9 offset:17 +; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:12 +; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:14 +; ALIGNED-NEXT: flat_store_byte v[84:85], v8 offset:16 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:348 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:9 -; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[96:97], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:15 -; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:12 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:13 -; ALIGNED-NEXT: flat_store_byte v[96:97], v6 offset:8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:7 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v5 offset:4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:3 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:1 -; ALIGNED-NEXT: flat_store_byte v[96:97], v4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:8 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:8 +; ALIGNED-NEXT: flat_store_byte v[86:87], v24 offset:12 +; ALIGNED-NEXT: flat_store_byte v[86:87], v7 offset:9 +; ALIGNED-NEXT: flat_store_byte v[86:87], v25 offset:10 +; ALIGNED-NEXT: flat_store_byte v[84:85], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[86:87], v26 offset:4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 +; ALIGNED-NEXT: flat_store_byte v[86:87], v27 offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v65 offset:1 +; ALIGNED-NEXT: flat_store_byte v[84:85], v4 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB7_5 ; ALIGNED-NEXT: .LBB7_6: ; %Flow7 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 @@ -12450,25 +12609,27 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: v_add_co_u32 v102, 
vcc_lo, v100, 48 ; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:224 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:208 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:192 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:176 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:128 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[19:22] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[15:18] offset:32 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[7:10] ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], 
v[96:99] offset:16 @@ -12554,29 +12715,31 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: v_add_nc_u32_e32 v2, 0xffffff00, v2 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 ; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, -1 ; CHECK-NEXT: s_waitcnt vmcnt(35) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(32) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:64 
+; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[19:22] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[15:18] offset:32 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[7:10] ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 @@ -12653,21 +12816,21 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: s_clause 0x3e ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20 -; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 ; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 ; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: 
buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37 @@ -12683,23 +12846,23 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:47 ; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:48 ; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:49 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:50 ; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:51 ; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:52 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:53 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:54 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:61 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: 
buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 ; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:69 @@ -12707,83 +12870,84 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: s_clause 0x30 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v106, v2, 
s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: s_clause 0x31 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:165 +; 
ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 ; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:175 ; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:177 +; 
ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:178 ; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:181 ; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:188 ; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:191 ; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:195 +; 
ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:6 ; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 @@ -12793,8 +12957,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 ; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25 ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 -; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7 @@ -12804,150 +12968,149 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v15, v30, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v1, v36, 8, v31 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v34, 8, v31 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v33 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v37 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v38 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v39, 8, v37 +; ALIGNED-NEXT: 
buffer_store_dword v5, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v7, v51, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4 -; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v54, 8, v53 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v55, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v64 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(61) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v67 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:972 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(61) +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(61) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:211 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:226 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:229 ; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 
offen offset:231 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:237 ; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238 ; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239 ; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235 ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:233 ; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 
offset:788 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0xc -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:241 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242 ; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:244 ; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245 ; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:247 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:254 ; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255 ; ALIGNED-NEXT: 
buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(44) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(43) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(42) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(41) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(40) +; ALIGNED-NEXT: buffer_store_dword 
v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill @@ -12955,25 +13118,20 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3 +; ALIGNED-NEXT: s_clause 0x8 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:217 ; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:220 ; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:223 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded 
Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: s_waitcnt vmcnt(16) -; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20 -; ALIGNED-NEXT: s_waitcnt vmcnt(12) -; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12 ; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -12997,37 +13155,37 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:93 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:91 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; 
ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:90 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:101 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:89 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 ; ALIGNED-NEXT: s_clause 0x1 @@ -13134,7 +13292,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_store_dword v8, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) @@ -13171,9 +13329,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -13186,7 +13344,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:150 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142 @@ -13195,31 +13353,31 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:141 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:140 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:149 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:137 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 offset:1352 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:136 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -13237,446 +13395,452 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v98, 8, v100 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 -; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen 
offset:249 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v13 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 -; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v15, 8, v20 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v121, v9, 8, v7 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121 -; ALIGNED-NEXT: v_lshl_or_b32 
v110, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123 -; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v101 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 8, v120 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v120, 8, v110 ; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v87, 8, v96 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v86 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v111, 8, v123 +; ALIGNED-NEXT: v_lshl_or_b32 v76, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v70, 8, v84 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v107 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v95 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v82 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v111 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v109 +; ALIGNED-NEXT: v_lshl_or_b32 v57, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v69, 8, v83 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90 -; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3 -; 
ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v92 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v104 ; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v82 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v81 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v55, 8, v66 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v79, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v118, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v53 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v71, 8, v51 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v79 -; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v90 +; ALIGNED-NEXT: v_lshl_or_b32 v102, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61 -; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v50 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v74 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 8, v77 +; ALIGNED-NEXT: v_lshl_or_b32 v85, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v37, 8, v48 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v34, 
8, v38 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v59 +; ALIGNED-NEXT: v_lshl_or_b32 v80, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v31, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73 -; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56 -; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v58, 8, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v54, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v57 -; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v61, 8, v72 +; ALIGNED-NEXT: v_lshl_or_b32 v52, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v21, 8, v23 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42 -; ALIGNED-NEXT: v_lshl_or_b32 v31, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v23 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v45, 8, v46 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v56 +; ALIGNED-NEXT: v_lshl_or_b32 v27, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 
8, v22 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; ALIGNED-NEXT: v_lshl_or_b32 v26, v95, 16, v4 -; ALIGNED-NEXT: v_lshl_or_b32 v95, v13, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v40 +; ALIGNED-NEXT: v_lshl_or_b32 v24, v5, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v12, 8, v14 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v21, v109, 16, v95 -; ALIGNED-NEXT: v_lshl_or_b32 v95, v8, 8, v10 -; ALIGNED-NEXT: v_lshl_or_b32 v109, v9, 8, v7 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v42, 8, v43 +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v16, v6, 16, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v8, 8, v10 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v112, 8, v113 +; ALIGNED-NEXT: buffer_store_dword v124, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v6, v121, 16, v6 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1496 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v116, 8, v119 +; ALIGNED-NEXT: v_lshl_or_b32 v108, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v115, 8, v117 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v103, 8, v114 +; ALIGNED-NEXT: v_lshl_or_b32 v92, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 
offset:1304 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v15, v109, 16, v95 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v121, v121, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 +; ALIGNED-NEXT: v_lshl_or_b32 v122, v0, 8, v1 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v95, v95, 8, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; ALIGNED-NEXT: v_lshl_or_b32 v95, v5, 8, v125 -; ALIGNED-NEXT: v_lshl_or_b32 v109, v4, 8, v6 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: 
buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121 +; ALIGNED-NEXT: v_lshl_or_b32 v121, v5, 8, v124 +; ALIGNED-NEXT: v_lshl_or_b32 v122, v4, 8, v125 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v121, v4, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: v_lshl_or_b32 v122, v1, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:9 
; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:10 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_mov_b32_e32 v4, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:8 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v122, v5, 8, v3 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v109, v5, 8, v1 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 +; ALIGNED-NEXT: v_lshl_or_b32 v121, v1, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:17 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 -; 
ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload -; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:704 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:708 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v109 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v127, v95, 8, v125 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_add_co_u32 v3, vcc_lo, v3, s4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v4, vcc_lo -; ALIGNED-NEXT: flat_store_byte v[3:4], v7 offset:250 -; ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:251 -; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:249 -; ALIGNED-NEXT: flat_store_byte v[3:4], v11 offset:255 -; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:253 -; ALIGNED-NEXT: flat_store_byte v[3:4], v12 offset:254 -; ALIGNED-NEXT: flat_store_byte v[3:4], v14 offset:252 -; ALIGNED-NEXT: flat_store_byte v[3:4], v10 offset:248 -; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:242 -; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:243 -; ALIGNED-NEXT: flat_store_byte v[3:4], v22 offset:241 -; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:247 -; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:245 -; ALIGNED-NEXT: flat_store_byte v[3:4], v20 
offset:246 -; ALIGNED-NEXT: flat_store_byte v[3:4], v23 offset:244 -; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:240 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:234 -; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:235 -; ALIGNED-NEXT: flat_store_byte v[3:4], v27 offset:233 -; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:239 -; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:237 -; ALIGNED-NEXT: flat_store_byte v[3:4], v32 offset:238 -; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:236 -; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:232 -; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:226 -; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:227 -; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:225 -; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:231 -; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:229 -; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:230 -; ALIGNED-NEXT: flat_store_byte v[3:4], v49 offset:228 -; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:224 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:213 -; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:215 -; ALIGNED-NEXT: flat_store_byte v[3:4], v51 offset:209 -; ALIGNED-NEXT: flat_store_byte v[3:4], v80 offset:211 -; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:210 -; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:214 -; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:212 -; ALIGNED-NEXT: flat_store_byte 
v[3:4], v65 offset:218 -; ALIGNED-NEXT: flat_store_byte v[3:4], v66 offset:219 -; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:217 -; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:223 -; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:221 -; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:222 -; ALIGNED-NEXT: flat_store_byte v[3:4], v82 offset:220 -; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:216 -; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:208 -; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v127, v1, 8, v125 ; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:202 -; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:203 -; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:201 -; ALIGNED-NEXT: flat_store_byte v[3:4], v99 offset:207 -; ALIGNED-NEXT: flat_store_byte v[3:4], v101 offset:205 -; ALIGNED-NEXT: flat_store_byte v[3:4], v100 offset:206 -; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:204 -; ALIGNED-NEXT: flat_store_byte v[3:4], v98 offset:200 -; ALIGNED-NEXT: flat_store_byte v[3:4], v113 offset:194 -; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:195 -; ALIGNED-NEXT: flat_store_byte v[3:4], v117 offset:193 -; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:199 -; ALIGNED-NEXT: flat_store_byte v[3:4], v115 offset:197 -; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:198 -; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:192 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_add_co_u32 v121, 
vcc_lo, v5, s4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v122, null, s5, v6, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: v_add_co_u32 v5, vcc_lo, v121, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v122, vcc_lo +; ALIGNED-NEXT: flat_store_byte v[5:6], v7 offset:247 +; ALIGNED-NEXT: flat_store_byte v[5:6], v9 offset:248 +; ALIGNED-NEXT: flat_store_byte v[5:6], v8 offset:246 +; ALIGNED-NEXT: flat_store_byte v[5:6], v11 offset:252 +; ALIGNED-NEXT: flat_store_byte v[5:6], v12 offset:250 +; ALIGNED-NEXT: flat_store_byte v[5:6], v13 offset:251 +; ALIGNED-NEXT: flat_store_byte v[5:6], v14 offset:249 +; ALIGNED-NEXT: flat_store_byte v[5:6], v10 offset:245 +; ALIGNED-NEXT: flat_store_byte v[5:6], v17 offset:239 +; ALIGNED-NEXT: flat_store_byte v[5:6], v18 offset:240 +; ALIGNED-NEXT: flat_store_byte v[5:6], v21 offset:238 +; ALIGNED-NEXT: flat_store_byte v[5:6], v15 offset:244 +; ALIGNED-NEXT: flat_store_byte v[5:6], v19 offset:242 +; ALIGNED-NEXT: flat_store_byte v[5:6], v20 offset:243 +; ALIGNED-NEXT: flat_store_byte v[5:6], v22 offset:241 +; ALIGNED-NEXT: flat_store_byte v[5:6], v23 offset:237 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte v[5:6], v25 offset:231 +; ALIGNED-NEXT: flat_store_byte v[5:6], v29 offset:232 +; ALIGNED-NEXT: flat_store_byte v[5:6], v26 offset:230 +; ALIGNED-NEXT: flat_store_byte v[5:6], v30 offset:236 +; ALIGNED-NEXT: flat_store_byte v[5:6], v31 offset:234 +; ALIGNED-NEXT: flat_store_byte v[5:6], v32 offset:235 +; ALIGNED-NEXT: flat_store_byte v[5:6], v33 offset:233 +; ALIGNED-NEXT: flat_store_byte v[5:6], v28 offset:229 +; ALIGNED-NEXT: flat_store_byte v[5:6], v35 offset:223 +; ALIGNED-NEXT: flat_store_byte v[5:6], v36 offset:224 +; 
ALIGNED-NEXT: flat_store_byte v[5:6], v39 offset:222 +; ALIGNED-NEXT: flat_store_byte v[5:6], v34 offset:228 +; ALIGNED-NEXT: flat_store_byte v[5:6], v37 offset:226 +; ALIGNED-NEXT: flat_store_byte v[5:6], v38 offset:227 +; ALIGNED-NEXT: flat_store_byte v[5:6], v48 offset:225 +; ALIGNED-NEXT: flat_store_byte v[5:6], v49 offset:221 +; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: flat_store_byte v[5:6], v70 offset:210 +; ALIGNED-NEXT: flat_store_byte v[5:6], v68 offset:212 +; ALIGNED-NEXT: flat_store_byte v[5:6], v50 offset:206 +; ALIGNED-NEXT: flat_store_byte v[5:6], v71 offset:208 +; ALIGNED-NEXT: flat_store_byte v[5:6], v51 offset:207 +; ALIGNED-NEXT: flat_store_byte v[5:6], v82 offset:211 +; ALIGNED-NEXT: flat_store_byte v[5:6], v84 offset:209 +; ALIGNED-NEXT: flat_store_byte v[5:6], v65 offset:215 +; ALIGNED-NEXT: flat_store_byte v[5:6], v64 offset:216 +; ALIGNED-NEXT: flat_store_byte v[5:6], v69 offset:214 +; ALIGNED-NEXT: flat_store_byte v[5:6], v55 offset:220 +; ALIGNED-NEXT: flat_store_byte v[5:6], v67 offset:218 +; ALIGNED-NEXT: flat_store_byte v[5:6], v66 offset:219 +; ALIGNED-NEXT: flat_store_byte v[5:6], v81 offset:217 +; ALIGNED-NEXT: flat_store_byte v[5:6], v83 offset:213 +; ALIGNED-NEXT: flat_store_byte v[5:6], v53 offset:205 +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v108, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1496 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v86 offset:199 +; ALIGNED-NEXT: flat_store_byte v[5:6], v97 offset:200 +; 
ALIGNED-NEXT: flat_store_byte v[5:6], v87 offset:198 +; ALIGNED-NEXT: flat_store_byte v[5:6], v98 offset:204 +; ALIGNED-NEXT: flat_store_byte v[5:6], v99 offset:202 +; ALIGNED-NEXT: flat_store_byte v[5:6], v100 offset:203 +; ALIGNED-NEXT: flat_store_byte v[5:6], v101 offset:201 +; ALIGNED-NEXT: flat_store_byte v[5:6], v96 offset:197 +; ALIGNED-NEXT: flat_store_byte v[5:6], v113 offset:191 +; ALIGNED-NEXT: flat_store_byte v[5:6], v112 offset:192 +; ALIGNED-NEXT: flat_store_byte v[5:6], v116 offset:190 +; ALIGNED-NEXT: flat_store_byte v[5:6], v103 offset:196 +; ALIGNED-NEXT: flat_store_byte v[5:6], v115 offset:194 +; ALIGNED-NEXT: flat_store_byte v[5:6], v114 offset:195 +; ALIGNED-NEXT: flat_store_byte v[5:6], v117 offset:193 +; ALIGNED-NEXT: flat_store_byte v[5:6], v119 offset:189 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:186 -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v44 offset:187 -; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:185 -; ALIGNED-NEXT: flat_store_byte v[3:4], v46 offset:191 -; ALIGNED-NEXT: flat_store_byte v[3:4], v47 offset:189 -; ALIGNED-NEXT: flat_store_byte v[3:4], v56 offset:190 -; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:188 -; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:184 -; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:178 -; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:179 -; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:177 -; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:183 -; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:181 -; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:182 -; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:180 -; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:176 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v40 offset:183 +; ALIGNED-NEXT: flat_store_byte v[5:6], v44 offset:184 +; ALIGNED-NEXT: flat_store_byte v[5:6], v42 offset:182 +; ALIGNED-NEXT: flat_store_byte v[5:6], v45 offset:188 +; ALIGNED-NEXT: flat_store_byte v[5:6], v47 offset:186 +; ALIGNED-NEXT: flat_store_byte v[5:6], v46 offset:187 +; ALIGNED-NEXT: flat_store_byte v[5:6], v56 offset:185 +; ALIGNED-NEXT: flat_store_byte v[5:6], v43 offset:181 +; ALIGNED-NEXT: flat_store_byte v[5:6], v59 offset:175 +; ALIGNED-NEXT: flat_store_byte v[5:6], v60 offset:176 +; ALIGNED-NEXT: flat_store_byte v[5:6], v63 offset:174 +; ALIGNED-NEXT: flat_store_byte v[5:6], v58 offset:180 +; ALIGNED-NEXT: flat_store_byte v[5:6], v61 offset:178 +; ALIGNED-NEXT: flat_store_byte v[5:6], v62 offset:179 +; ALIGNED-NEXT: flat_store_byte v[5:6], v72 offset:177 +; ALIGNED-NEXT: flat_store_byte v[5:6], v73 offset:173 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded 
Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:170 -; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:171 -; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:169 -; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:175 -; ALIGNED-NEXT: flat_store_byte v[3:4], v89 offset:173 -; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:174 -; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:172 -; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:168 -; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:162 -; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:163 -; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:161 -; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:167 -; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:165 -; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:166 -; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:164 -; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:160 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v74 offset:167 +; ALIGNED-NEXT: flat_store_byte v[5:6], v78 offset:168 +; ALIGNED-NEXT: flat_store_byte 
v[5:6], v75 offset:166 +; ALIGNED-NEXT: flat_store_byte v[5:6], v79 offset:172 +; ALIGNED-NEXT: flat_store_byte v[5:6], v89 offset:170 +; ALIGNED-NEXT: flat_store_byte v[5:6], v88 offset:171 +; ALIGNED-NEXT: flat_store_byte v[5:6], v90 offset:169 +; ALIGNED-NEXT: flat_store_byte v[5:6], v77 offset:165 +; ALIGNED-NEXT: flat_store_byte v[5:6], v95 offset:159 +; ALIGNED-NEXT: flat_store_byte v[5:6], v94 offset:160 +; ALIGNED-NEXT: flat_store_byte v[5:6], v106 offset:158 +; ALIGNED-NEXT: flat_store_byte v[5:6], v93 offset:164 +; ALIGNED-NEXT: flat_store_byte v[5:6], v105 offset:162 +; ALIGNED-NEXT: flat_store_byte v[5:6], v104 offset:163 +; ALIGNED-NEXT: flat_store_byte v[5:6], v107 offset:161 +; ALIGNED-NEXT: flat_store_byte v[5:6], v109 offset:157 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153 -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v126 offset:159 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v110 offset:151 +; ALIGNED-NEXT: flat_store_byte v[5:6], v120 offset:152 +; ALIGNED-NEXT: flat_store_byte v[5:6], v111 offset:150 +; ALIGNED-NEXT: flat_store_byte v[5:6], v126 offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:158 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:154 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:155 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:156 -; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:152 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:153 +; ALIGNED-NEXT: flat_store_byte v[5:6], v123 offset:149 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:146 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:143 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:147 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:144 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:145 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:142 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:151 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:148 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:149 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:146 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:150 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:147 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:148 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:145 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:144 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:141 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:1320 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:138 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:135 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:139 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:136 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:137 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:134 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:143 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:141 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:140 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:142 
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:138 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:139 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:140 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:137 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:136 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:133 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:130 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:127 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:131 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:128 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:129 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:126 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:135 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:132 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte 
Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:133 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:130 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:134 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:131 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:132 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:129 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:128 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:128 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 @@ -13691,52 +13855,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:122 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:119 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:123 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:120 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:121 +; ALIGNED-NEXT: 
flat_store_byte v[5:6], v0 offset:118 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:127 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:124 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:125 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:122 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:126 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:123 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:124 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:121 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:120 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:117 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:114 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:111 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:115 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:112 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:113 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:110 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte 
Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:119 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:116 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:117 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:114 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:118 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:115 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:116 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:113 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:112 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:109 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 @@ -13751,112 +13915,112 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:106 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:103 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:107 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:104 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:1152 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:105 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:102 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:111 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:108 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:109 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:106 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:110 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:107 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:108 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:105 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:104 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:101 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:98 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:95 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:99 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:96 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:97 
+; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:94 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:103 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:100 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:101 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:98 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:102 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:99 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:100 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:97 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:96 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:93 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:90 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:91 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:87 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:89 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:95 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:88 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:93 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:86 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:94 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:92 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:90 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte 
v[5:6], v0 offset:91 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:92 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:89 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:88 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:85 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:82 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:79 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:83 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:80 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:81 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:78 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:87 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:84 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:85 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:82 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:86 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:83 ; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:84 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:81 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:80 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:77 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 @@ -13871,52 +14035,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:74 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:71 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:75 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:72 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:73 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:70 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:79 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:76 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:77 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:74 ; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:78 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:75 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:76 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:73 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:72 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:69 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:66 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:63 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:67 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:64 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:65 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:62 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:71 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:68 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:69 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:66 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte 
v[3:4], v0 offset:70 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:67 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:68 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:65 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:64 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:64 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 @@ -13929,61 +14093,61 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:61 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:58 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:58 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:55 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:59 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:56 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:57 +; 
ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:54 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:63 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:60 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:62 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:59 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:60 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:57 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:56 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:53 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:53 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:50 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:50 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:47 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:51 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:48 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], 
s32 offset:836 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:49 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:46 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:55 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:52 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:54 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:51 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:52 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:49 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:48 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:45 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; ALIGNED-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:852 ; 4-byte Folded Reload @@ -13991,52 +14155,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:43 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:40 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:42 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:39 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:41 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:38 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:40 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:37 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:47 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:44 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:46 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:43 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:45 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:42 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v0 offset:44 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:41 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:35 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:32 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:34 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:31 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:33 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:30 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:32 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:32 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:39 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:36 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:38 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:35 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:37 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:34 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:36 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:33 ; ALIGNED-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:840 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 @@ -14049,288 +14213,288 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:384 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:23 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:27 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:24 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:25 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:22 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:31 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:28 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:29 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:30 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:27 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:28 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:25 ; ALIGNED-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:18 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:21 +; ALIGNED-NEXT: flat_store_byte v[5:6], v124 offset:15 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:17 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:16 +; ALIGNED-NEXT: flat_store_byte v[5:6], v1 offset:14 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:23 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:20 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:21 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:18 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:22 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:19 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:16 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:17 +; ALIGNED-NEXT: flat_store_byte v[121:122], v125 offset:16 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; 
ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v3 offset:7 +; ALIGNED-NEXT: flat_store_byte v[5:6], v4 offset:8 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:10 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:9 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:6 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:15 -; ALIGNED-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:12 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:14 -; ALIGNED-NEXT: flat_store_byte v[3:4], v6 offset:12 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:11 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:9 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:8 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:8 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:2 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:3 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:1 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:1 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:7 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 
offset:4 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:5 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:2 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:6 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:3 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:4 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:4 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 ; ALIGNED-NEXT: s_cbranch_scc1 .LBB9_1 ; ALIGNED-NEXT: .LBB9_2: ; %Flow10 ; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 ; ALIGNED-NEXT: s_cbranch_execz .LBB9_5 ; ALIGNED-NEXT: ; %bb.3: ; %memmove_bwd_loop.preheader -; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0x700, v2 +; ALIGNED-NEXT: v_add_nc_u32_e32 v6, 0x700, v2 ; ALIGNED-NEXT: s_movk_i32 s6, 0xff00 ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 ; ALIGNED-NEXT: s_mov_b32 s7, -1 ; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:20 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:23 -; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen 
offset:19 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:29 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:30 -; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:31 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24 -; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27 -; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:32 -; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:33 -; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:34 -; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:35 -; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:36 -; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:37 -; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:38 -; ALIGNED-NEXT: buffer_load_ubyte v20, v4, s[0:3], 0 offen offset:39 -; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:40 -; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:41 -; ALIGNED-NEXT: buffer_load_ubyte v25, v4, s[0:3], 0 offen offset:42 -; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:43 -; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:44 -; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:45 -; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:46 -; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:47 -; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:48 -; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:49 -; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:50 -; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:51 -; ALIGNED-NEXT: buffer_load_ubyte 
v32, v4, s[0:3], 0 offen offset:52 -; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:53 -; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:54 -; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:60 -; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:61 -; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:62 -; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:63 -; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59 -; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:64 -; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:65 -; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:66 -; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:67 -; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:68 -; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:69 -; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:70 -; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:71 -; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:76 -; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78 -; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen 
offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v10, v6, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v127, v6, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_load_ubyte v9, v6, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_ubyte v12, v6, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_load_ubyte v14, v6, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_load_ubyte v15, v6, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_load_ubyte v11, v6, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_load_ubyte v19, v6, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v20, v6, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_load_ubyte v13, v6, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_load_ubyte v16, v6, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_load_ubyte v17, v6, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_load_ubyte v18, v6, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_load_ubyte v22, v6, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_load_ubyte v26, v6, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_load_ubyte v27, v6, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_load_ubyte v28, v6, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_load_ubyte v21, v6, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_load_ubyte v23, v6, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_load_ubyte 
v24, v6, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_load_ubyte v25, v6, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_load_ubyte v30, v6, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_load_ubyte v31, v6, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_load_ubyte v33, v6, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_load_ubyte v34, v6, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_load_ubyte v32, v6, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_load_ubyte v37, v6, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_load_ubyte v35, v6, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_load_ubyte v36, v6, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_ubyte v38, v6, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v51, v6, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_ubyte v39, v6, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_ubyte v50, v6, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_load_ubyte v48, v6, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_load_ubyte v49, v6, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_load_ubyte v52, v6, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_ubyte v29, v6, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_ubyte v55, v6, s[0:3], 0 offen offset:65 +; ALIGNED-NEXT: buffer_load_ubyte v53, v6, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_load_ubyte v66, v6, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_ubyte v67, v6, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_load_ubyte v54, v6, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_load_ubyte v64, v6, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_load_ubyte v65, v6, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_load_ubyte v68, v6, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_load_ubyte v69, v6, s[0:3], 0 offen offset:76 +; ALIGNED-NEXT: buffer_load_ubyte v81, v6, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_load_ubyte v70, v6, s[0:3], 0 offen offset:77 +; 
ALIGNED-NEXT: buffer_load_ubyte v71, v6, s[0:3], 0 offen offset:78 +; ALIGNED-NEXT: buffer_load_ubyte v80, v6, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: buffer_load_ubyte v123, v6, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v122, v6, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v111, v6, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v108, v6, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v105, v6, s[0:3], 0 offen offset:161 ; ALIGNED-NEXT: s_clause 0x34 -; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v60, v4, 
s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, 
s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_load_ubyte v94, v6, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v92, v6, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v106, v6, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v95, v6, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v104, v6, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v91, v6, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v78, v6, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v77, v6, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v76, v6, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v75, v6, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v74, v6, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v73, v6, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v63, v6, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v62, v6, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v61, v6, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v59, v6, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v47, v6, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v56, v6, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v60, v6, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v57, v6, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v58, v6, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v46, v6, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v44, v6, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v43, v6, s[0:3], 0 offen 
offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v42, v6, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v41, v6, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v40, v6, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v119, v6, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v118, v6, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v116, v6, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v115, v6, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v112, v6, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v6, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v100, v6, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v113, v6, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v6, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v102, v6, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v99, v6, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v97, v6, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v87, v6, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v6, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v86, v6, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v85, v6, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v84, v6, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v83, v6, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v82, v6, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v124, v6, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v120, v6, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_ubyte v93, v6, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v109, v6, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v110, v6, s[0:3], 0 offen 
offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v121, v6, s[0:3], 0 offen offset:7 ; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v5 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v4 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8 ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 -; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 +; ALIGNED-NEXT: v_lshl_or_b32 v8, v20, 8, v19 ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 -; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v10, v18, 8, v17 +; ALIGNED-NEXT: v_lshl_or_b32 v11, v26, 8, v22 +; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v27 +; ALIGNED-NEXT: v_lshl_or_b32 v13, v23, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 +; ALIGNED-NEXT: v_lshl_or_b32 v14, v25, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v4 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 16, v9 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35 ; 
ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v39 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v49, 8, v48 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v5, v51, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v3 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill @@ -14340,57 +14504,57 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:82 -; 
ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_clause 0x5 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v4, 
s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v80, v6, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v67, v6, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v70, v6, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v64, v6, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v65, v6, s[0:3], 0 offen offset:211 ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 
offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v71, v6, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v66, v6, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v53, v6, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v52, v6, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v69, v6, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v55, v6, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v54, v6, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v51, v6, s[0:3], 0 offen offset:223 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v49, v6, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v38, v6, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v39, v6, s[0:3], 0 offen offset:210 ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 
offset:844 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill @@ -14400,82 +14564,93 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v37, v6, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v35, v6, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v31, v6, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v32, v6, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v36, v6, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v33, v6, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v34, v6, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v30, v6, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 
offset:820 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v29, v6, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v27, v6, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v28, v6, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v26, v6, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v25, v6, s[0:3], 0 offen offset:235 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v24, v6, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v23, v6, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: 
buffer_load_ubyte v22, v6, s[0:3], 0 offen offset:234 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0xc +; ALIGNED-NEXT: buffer_load_ubyte v21, v6, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v19, v6, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v15, v6, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v16, v6, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v20, v6, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v17, v6, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v18, v6, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v14, v6, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v13, v6, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v11, v6, 
s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v12, v6, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v10, v6, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v9, v6, s[0:3], 0 offen offset:251 ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v89, v89, 8, v120 +; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v126, v6, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v120, v6, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(56) +; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(55) +; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(54) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1032 ; 
4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(53) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(52) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(51) -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(50) -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(49) -; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(48) -; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(47) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(46) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(44) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(43) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(35) +; ALIGNED-NEXT: s_waitcnt vmcnt(42) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: 
buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:84 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:81 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -14484,48 +14659,48 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:98 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:98 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:102 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:103 +; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:102 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:103 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte 
v2, v4, s[0:3], 0 offen offset:94 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:95 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:94 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:95 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:93 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:91 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:93 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:91 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:92 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:90 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, 
s[0:3], 0 offen offset:90 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:101 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:101 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:89 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:89 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -14533,13 +14708,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:99 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:100 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:99 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:100 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:97 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:97 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:1088 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:96 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -14550,48 +14725,48 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:114 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:114 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:118 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:119 +; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:118 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:119 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:110 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:111 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:110 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:111 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:109 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:107 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:109 +; ALIGNED-NEXT: 
buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:107 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:108 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:106 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:106 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:117 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:117 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:105 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:105 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill -; 
ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:104 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -14599,13 +14774,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:115 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:116 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:115 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:116 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:113 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:113 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:112 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -14616,48 +14791,48 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded 
Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:130 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:135 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:126 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:127 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:125 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:123 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:125 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:123 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:124 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:124 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt 
vmcnt(5) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:122 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:122 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:133 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:121 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:121 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:120 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; 
ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -14665,13 +14840,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:132 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:129 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:129 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:128 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -14682,48 +14857,48 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:146 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, 
s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:151 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:143 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:139 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:140 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1304 ; 
4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:138 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:149 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:137 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:136 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -14731,281 +14906,269 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147 -; 
ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:145 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:144 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:159 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v10, v4, 
s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v12 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v18 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:157 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v112, 8, v115 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; 
ALIGNED-NEXT: v_lshl_or_b32 v2, v100, 8, v101 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 -; ALIGNED-NEXT: v_lshl_or_b32 v106, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v103, 8, v113 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v102 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123 -; ALIGNED-NEXT: v_lshl_or_b32 v90, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v94 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 -; ALIGNED-NEXT: v_lshl_or_b32 v88, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v83, 8, v84 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v82 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 -; ALIGNED-NEXT: v_lshl_or_b32 v73, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v68, 8, v80 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v104 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v70 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v106 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78 -; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v71 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v76 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte 
Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 -; ALIGNED-NEXT: v_lshl_or_b32 v116, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v69 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 8, v78 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v62 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 -; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v50 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 -; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58 -; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 -; ALIGNED-NEXT: v_lshl_or_b32 v64, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v27, 8, v29 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 -; ALIGNED-NEXT: v_lshl_or_b32 v49, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v22, 8, v24 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 
16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117 -; ALIGNED-NEXT: v_lshl_or_b32 v48, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v17, 8, v19 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v116 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v13 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: v_lshl_or_b32 v25, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v15, 8, v18 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen -; ALIGNED-NEXT: v_lshl_or_b32 v20, v62, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v62, v9, 8, v11 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v76, 16, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v112, 8, v115 +; ALIGNED-NEXT: v_lshl_or_b32 v107, v2, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v113 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v99, 8, v102 +; ALIGNED-NEXT: v_lshl_or_b32 v90, v2, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen +; ALIGNED-NEXT: v_lshl_or_b32 v79, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v83, 8, v84 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v82 +; ALIGNED-NEXT: v_lshl_or_b32 v72, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v80 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v70 +; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v66, 8, v71 +; 
ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53 +; ALIGNED-NEXT: v_lshl_or_b32 v117, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v69 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 +; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v49 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31 +; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v27, 8, v29 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v23, 8, v24 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v25, 8, v22 +; ALIGNED-NEXT: v_lshl_or_b32 v48, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v19, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v16, 8, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v17, 8, v20 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v11, 8, v13 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v5, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v7, 8, v8 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v7, 8, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v76, 16, v62 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v88, v9, 8, v1 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v88, 16, 
v5 +; ALIGNED-NEXT: buffer_load_ubyte v88, v6, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v62, v62, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v76, v76, 8, v120 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: v_lshl_or_b32 v62, v109, 8, v104 -; ALIGNED-NEXT: v_lshl_or_b32 v76, v122, 8, v110 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v88, v88, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v88, v109, 8, v93 +; ALIGNED-NEXT: v_lshl_or_b32 v89, v121, 8, v110 ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:10 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: buffer_load_ubyte v121, v6, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v110, v6, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v109, v6, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 
0 offen offset:14 -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122 -; ALIGNED-NEXT: v_lshl_or_b32 v76, v120, 8, v109 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: buffer_load_ubyte v125, v6, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v89, v126, 8, v89 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v88, v125, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v88, v110, 8, v121 +; ALIGNED-NEXT: v_lshl_or_b32 v89, v120, 8, v109 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88 ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:18 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:17 -; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4 +; ALIGNED-NEXT: buffer_load_ubyte v88, v6, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v93, v6, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: v_add_nc_u32_e32 v6, 0xffffff00, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v125, 8, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v88 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v125, v76, 8, v104 +; ALIGNED-NEXT: v_lshl_or_b32 v127, v89, 8, v93 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, v2, s4 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v3, vcc_lo -; ALIGNED-NEXT: flat_store_byte v[2:3], v1 offset:250 -; ALIGNED-NEXT: flat_store_byte v[2:3], 
v7 offset:251 -; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:249 -; ALIGNED-NEXT: flat_store_byte v[2:3], v8 offset:255 -; ALIGNED-NEXT: flat_store_byte v[2:3], v9 offset:253 -; ALIGNED-NEXT: flat_store_byte v[2:3], v10 offset:254 -; ALIGNED-NEXT: flat_store_byte v[2:3], v11 offset:252 -; ALIGNED-NEXT: flat_store_byte v[2:3], v6 offset:248 -; ALIGNED-NEXT: flat_store_byte v[2:3], v13 offset:242 -; ALIGNED-NEXT: flat_store_byte v[2:3], v14 offset:243 -; ALIGNED-NEXT: flat_store_byte v[2:3], v17 offset:241 -; ALIGNED-NEXT: flat_store_byte v[2:3], v12 offset:247 -; ALIGNED-NEXT: flat_store_byte v[2:3], v15 offset:245 -; ALIGNED-NEXT: flat_store_byte v[2:3], v16 offset:246 -; ALIGNED-NEXT: flat_store_byte v[2:3], v18 offset:244 -; ALIGNED-NEXT: flat_store_byte v[2:3], v19 offset:240 +; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 +; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v3, vcc_lo +; ALIGNED-NEXT: flat_store_byte v[4:5], v1 offset:247 +; ALIGNED-NEXT: flat_store_byte v[4:5], v9 offset:248 +; ALIGNED-NEXT: flat_store_byte v[4:5], v7 offset:246 +; ALIGNED-NEXT: flat_store_byte v[4:5], v10 offset:252 +; ALIGNED-NEXT: flat_store_byte v[4:5], v11 offset:250 +; ALIGNED-NEXT: flat_store_byte v[4:5], v12 offset:251 +; ALIGNED-NEXT: flat_store_byte v[4:5], v13 offset:249 +; ALIGNED-NEXT: flat_store_byte v[4:5], v8 offset:245 +; ALIGNED-NEXT: flat_store_byte v[4:5], v15 offset:239 +; ALIGNED-NEXT: flat_store_byte v[4:5], v16 offset:240 +; ALIGNED-NEXT: flat_store_byte v[4:5], v19 offset:238 +; ALIGNED-NEXT: flat_store_byte v[4:5], v14 offset:244 +; ALIGNED-NEXT: flat_store_byte v[4:5], v17 offset:242 +; ALIGNED-NEXT: flat_store_byte v[4:5], v18 offset:243 +; ALIGNED-NEXT: flat_store_byte v[4:5], v20 offset:241 +; ALIGNED-NEXT: flat_store_byte v[4:5], v21 offset:237 ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:508 -; ALIGNED-NEXT: 
buffer_store_dword v64, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:500 ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: flat_store_byte v[2:3], v21 offset:234 -; ALIGNED-NEXT: flat_store_byte v[2:3], v23 offset:235 -; ALIGNED-NEXT: flat_store_byte v[2:3], v22 offset:233 -; ALIGNED-NEXT: flat_store_byte v[2:3], v26 offset:239 -; ALIGNED-NEXT: flat_store_byte v[2:3], v27 offset:237 -; ALIGNED-NEXT: flat_store_byte v[2:3], v28 offset:238 -; ALIGNED-NEXT: flat_store_byte v[2:3], v29 offset:236 -; ALIGNED-NEXT: flat_store_byte v[2:3], v24 offset:232 -; ALIGNED-NEXT: flat_store_byte v[2:3], v31 offset:226 -; ALIGNED-NEXT: flat_store_byte v[2:3], v32 offset:227 -; ALIGNED-NEXT: flat_store_byte v[2:3], v35 offset:225 -; ALIGNED-NEXT: flat_store_byte v[2:3], v30 offset:231 -; ALIGNED-NEXT: flat_store_byte v[2:3], v33 offset:229 -; ALIGNED-NEXT: flat_store_byte v[2:3], v34 offset:230 -; ALIGNED-NEXT: flat_store_byte v[2:3], v36 offset:228 -; ALIGNED-NEXT: flat_store_byte v[2:3], v37 offset:224 +; ALIGNED-NEXT: flat_store_byte v[4:5], v22 offset:231 +; ALIGNED-NEXT: flat_store_byte v[4:5], v25 offset:232 +; ALIGNED-NEXT: flat_store_byte v[4:5], v23 offset:230 +; ALIGNED-NEXT: flat_store_byte v[4:5], v26 offset:236 +; ALIGNED-NEXT: flat_store_byte v[4:5], v27 offset:234 +; ALIGNED-NEXT: flat_store_byte v[4:5], v28 offset:235 +; ALIGNED-NEXT: flat_store_byte v[4:5], v29 offset:233 +; ALIGNED-NEXT: flat_store_byte v[4:5], v24 offset:229 +; ALIGNED-NEXT: flat_store_byte v[4:5], v31 offset:223 +; ALIGNED-NEXT: flat_store_byte v[4:5], v32 offset:224 +; ALIGNED-NEXT: flat_store_byte v[4:5], v35 offset:222 +; ALIGNED-NEXT: flat_store_byte v[4:5], v30 offset:228 +; ALIGNED-NEXT: flat_store_byte v[4:5], v33 offset:226 +; ALIGNED-NEXT: flat_store_byte v[4:5], v34 offset:227 +; ALIGNED-NEXT: flat_store_byte v[4:5], v36 offset:225 +; 
ALIGNED-NEXT: flat_store_byte v[4:5], v37 offset:221 ; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:448 ; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:460 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:456 ; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: flat_store_byte v[2:3], v68 offset:213 -; ALIGNED-NEXT: flat_store_byte v[2:3], v65 offset:215 -; ALIGNED-NEXT: flat_store_byte v[2:3], v38 offset:209 -; ALIGNED-NEXT: flat_store_byte v[2:3], v66 offset:211 -; ALIGNED-NEXT: flat_store_byte v[2:3], v39 offset:210 -; ALIGNED-NEXT: flat_store_byte v[2:3], v70 offset:214 -; ALIGNED-NEXT: flat_store_byte v[2:3], v80 offset:212 -; ALIGNED-NEXT: flat_store_byte v[2:3], v53 offset:218 -; ALIGNED-NEXT: flat_store_byte v[2:3], v52 offset:219 -; ALIGNED-NEXT: flat_store_byte v[2:3], v67 offset:217 -; ALIGNED-NEXT: flat_store_byte v[2:3], v51 offset:223 -; ALIGNED-NEXT: flat_store_byte v[2:3], v55 offset:221 -; ALIGNED-NEXT: flat_store_byte v[2:3], v54 offset:222 -; ALIGNED-NEXT: flat_store_byte v[2:3], v69 offset:220 -; ALIGNED-NEXT: flat_store_byte v[2:3], v71 offset:216 -; ALIGNED-NEXT: flat_store_byte v[2:3], v50 offset:208 -; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: flat_store_byte v[4:5], v67 offset:210 +; ALIGNED-NEXT: flat_store_byte v[4:5], v64 offset:212 +; ALIGNED-NEXT: flat_store_byte v[4:5], v38 offset:206 +; ALIGNED-NEXT: flat_store_byte v[4:5], v65 offset:208 +; ALIGNED-NEXT: flat_store_byte v[4:5], v39 offset:207 +; ALIGNED-NEXT: flat_store_byte v[4:5], v70 offset:211 +; ALIGNED-NEXT: flat_store_byte v[4:5], v80 offset:209 +; ALIGNED-NEXT: flat_store_byte v[4:5], v53 offset:215 +; ALIGNED-NEXT: flat_store_byte v[4:5], v52 offset:216 +; ALIGNED-NEXT: flat_store_byte v[4:5], v66 offset:214 +; 
ALIGNED-NEXT: flat_store_byte v[4:5], v51 offset:220 +; ALIGNED-NEXT: flat_store_byte v[4:5], v55 offset:218 +; ALIGNED-NEXT: flat_store_byte v[4:5], v54 offset:219 +; ALIGNED-NEXT: flat_store_byte v[4:5], v69 offset:217 +; ALIGNED-NEXT: flat_store_byte v[4:5], v71 offset:213 +; ALIGNED-NEXT: flat_store_byte v[4:5], v49 offset:205 +; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v79, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125 +; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:464 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[2:3], v82 offset:202 -; ALIGNED-NEXT: flat_store_byte v[2:3], v85 offset:203 -; ALIGNED-NEXT: flat_store_byte v[2:3], v83 offset:201 -; ALIGNED-NEXT: flat_store_byte v[2:3], v86 offset:207 -; ALIGNED-NEXT: flat_store_byte v[2:3], v87 offset:205 -; ALIGNED-NEXT: flat_store_byte v[2:3], v96 offset:206 -; ALIGNED-NEXT: flat_store_byte v[2:3], v97 offset:204 -; ALIGNED-NEXT: flat_store_byte v[2:3], v84 offset:200 -; ALIGNED-NEXT: flat_store_byte v[2:3], v101 offset:194 -; ALIGNED-NEXT: flat_store_byte v[2:3], v100 offset:195 -; ALIGNED-NEXT: flat_store_byte v[2:3], v112 offset:193 -; ALIGNED-NEXT: flat_store_byte v[2:3], v99 offset:199 -; ALIGNED-NEXT: flat_store_byte v[2:3], v103 offset:197 -; ALIGNED-NEXT: flat_store_byte v[2:3], v102 offset:198 -; ALIGNED-NEXT: flat_store_byte v[2:3], v113 offset:196 -; ALIGNED-NEXT: flat_store_byte v[2:3], v115 offset:192 +; ALIGNED-NEXT: flat_store_byte v[4:5], v82 offset:199 +; ALIGNED-NEXT: flat_store_byte v[4:5], v85 offset:200 +; ALIGNED-NEXT: flat_store_byte v[4:5], v83 offset:198 +; ALIGNED-NEXT: flat_store_byte v[4:5], v86 offset:204 +; ALIGNED-NEXT: flat_store_byte v[4:5], v87 
offset:202 +; ALIGNED-NEXT: flat_store_byte v[4:5], v96 offset:203 +; ALIGNED-NEXT: flat_store_byte v[4:5], v97 offset:201 +; ALIGNED-NEXT: flat_store_byte v[4:5], v84 offset:197 +; ALIGNED-NEXT: flat_store_byte v[4:5], v101 offset:191 +; ALIGNED-NEXT: flat_store_byte v[4:5], v100 offset:192 +; ALIGNED-NEXT: flat_store_byte v[4:5], v112 offset:190 +; ALIGNED-NEXT: flat_store_byte v[4:5], v99 offset:196 +; ALIGNED-NEXT: flat_store_byte v[4:5], v103 offset:194 +; ALIGNED-NEXT: flat_store_byte v[4:5], v102 offset:195 +; ALIGNED-NEXT: flat_store_byte v[4:5], v113 offset:193 +; ALIGNED-NEXT: flat_store_byte v[4:5], v115 offset:189 ; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 ; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] @@ -15021,22 +15184,22 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[2:3], v117 offset:186 -; ALIGNED-NEXT: flat_store_byte v[2:3], v40 offset:187 -; ALIGNED-NEXT: flat_store_byte v[2:3], v118 offset:185 -; ALIGNED-NEXT: flat_store_byte v[2:3], v41 offset:191 -; ALIGNED-NEXT: flat_store_byte v[2:3], v43 offset:189 -; ALIGNED-NEXT: flat_store_byte v[2:3], v42 offset:190 -; ALIGNED-NEXT: flat_store_byte v[2:3], v44 offset:188 -; ALIGNED-NEXT: flat_store_byte v[2:3], v119 offset:184 -; ALIGNED-NEXT: flat_store_byte v[2:3], v47 offset:178 -; ALIGNED-NEXT: flat_store_byte v[2:3], v56 offset:179 -; ALIGNED-NEXT: flat_store_byte v[2:3], v59 offset:177 -; ALIGNED-NEXT: flat_store_byte v[2:3], v46 offset:183 -; ALIGNED-NEXT: flat_store_byte v[2:3], v57 offset:181 -; ALIGNED-NEXT: flat_store_byte v[2:3], v58 offset:182 -; ALIGNED-NEXT: flat_store_byte v[2:3], v60 offset:180 -; ALIGNED-NEXT: flat_store_byte v[2:3], v61 offset:176 +; ALIGNED-NEXT: flat_store_byte v[4:5], 
v116 offset:183 +; ALIGNED-NEXT: flat_store_byte v[4:5], v40 offset:184 +; ALIGNED-NEXT: flat_store_byte v[4:5], v118 offset:182 +; ALIGNED-NEXT: flat_store_byte v[4:5], v41 offset:188 +; ALIGNED-NEXT: flat_store_byte v[4:5], v43 offset:186 +; ALIGNED-NEXT: flat_store_byte v[4:5], v42 offset:187 +; ALIGNED-NEXT: flat_store_byte v[4:5], v44 offset:185 +; ALIGNED-NEXT: flat_store_byte v[4:5], v119 offset:181 +; ALIGNED-NEXT: flat_store_byte v[4:5], v47 offset:175 +; ALIGNED-NEXT: flat_store_byte v[4:5], v56 offset:176 +; ALIGNED-NEXT: flat_store_byte v[4:5], v59 offset:174 +; ALIGNED-NEXT: flat_store_byte v[4:5], v46 offset:180 +; ALIGNED-NEXT: flat_store_byte v[4:5], v57 offset:178 +; ALIGNED-NEXT: flat_store_byte v[4:5], v58 offset:179 +; ALIGNED-NEXT: flat_store_byte v[4:5], v60 offset:177 +; ALIGNED-NEXT: flat_store_byte v[4:5], v61 offset:173 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload @@ -15048,23 +15211,23 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:170 -; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:171 -; ALIGNED-NEXT: flat_store_byte v[2:3], v72 offset:169 -; ALIGNED-NEXT: flat_store_byte v[2:3], v77 offset:175 -; ALIGNED-NEXT: flat_store_byte v[2:3], v79 offset:173 -; ALIGNED-NEXT: flat_store_byte v[2:3], v78 offset:174 -; ALIGNED-NEXT: flat_store_byte v[2:3], v89 offset:172 -; ALIGNED-NEXT: flat_store_byte v[2:3], v74 offset:168 -; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:162 -; ALIGNED-NEXT: flat_store_byte v[2:3], v92 offset:163 
-; ALIGNED-NEXT: flat_store_byte v[2:3], v105 offset:161 -; ALIGNED-NEXT: flat_store_byte v[2:3], v91 offset:167 -; ALIGNED-NEXT: flat_store_byte v[2:3], v95 offset:165 -; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:166 -; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:164 -; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:160 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v62 offset:167 +; ALIGNED-NEXT: flat_store_byte v[4:5], v74 offset:168 +; ALIGNED-NEXT: flat_store_byte v[4:5], v63 offset:166 +; ALIGNED-NEXT: flat_store_byte v[4:5], v75 offset:172 +; ALIGNED-NEXT: flat_store_byte v[4:5], v77 offset:170 +; ALIGNED-NEXT: flat_store_byte v[4:5], v76 offset:171 +; ALIGNED-NEXT: flat_store_byte v[4:5], v78 offset:169 +; ALIGNED-NEXT: flat_store_byte v[4:5], v73 offset:165 +; ALIGNED-NEXT: flat_store_byte v[4:5], v94 offset:159 +; ALIGNED-NEXT: flat_store_byte v[4:5], v92 offset:160 +; ALIGNED-NEXT: flat_store_byte v[4:5], v105 offset:158 +; ALIGNED-NEXT: flat_store_byte v[4:5], v91 offset:164 +; ALIGNED-NEXT: flat_store_byte v[4:5], v95 offset:162 +; ALIGNED-NEXT: flat_store_byte v[4:5], v104 offset:163 +; ALIGNED-NEXT: flat_store_byte v[4:5], v106 offset:161 +; ALIGNED-NEXT: flat_store_byte v[4:5], v108 offset:157 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload @@ -15076,44 +15239,46 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:154 -; ALIGNED-NEXT: 
flat_store_byte v[2:3], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153 -; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v111 offset:151 +; ALIGNED-NEXT: flat_store_byte v[4:5], v124 offset:152 +; ALIGNED-NEXT: flat_store_byte v[4:5], v122 offset:150 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:157 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:156 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:154 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:158 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:155 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:156 -; ALIGNED-NEXT: flat_store_byte v[2:3], v123 offset:152 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:153 +; ALIGNED-NEXT: flat_store_byte v[4:5], v123 offset:149 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:146 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:143 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte 
v[2:3], v0 offset:147 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:144 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:145 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:142 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:151 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:148 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:149 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:146 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:150 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:147 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:148 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:145 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:144 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:141 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:536 @@ -15128,49 +15293,49 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:138 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:135 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:139 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:136 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:137 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:134 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:143 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:140 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:141 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:138 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:142 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:139 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:140 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:137 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:136 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:133 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:130 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:127 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:131 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:128 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:129 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:126 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:135 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:132 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:133 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:130 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:134 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:131 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:132 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:129 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:128 @@ -15188,52 +15353,52 @@ define void @memmove_p0_p5_sz2048(ptr 
addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:122 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:119 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:123 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:120 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:121 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:118 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:127 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:124 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:125 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:122 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:126 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:123 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:124 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:121 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:120 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:117 ; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:114 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:111 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:115 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:112 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:113 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:110 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:119 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:116 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:117 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:114 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:118 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:115 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:116 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:113 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:112 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:109 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt 
vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 @@ -15248,52 +15413,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:106 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:103 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:107 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:104 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:105 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:102 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:111 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:108 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:109 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:106 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:110 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:107 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:108 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:105 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:104 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:101 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:98 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:95 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:99 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:96 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:97 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:94 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:103 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:100 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:101 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:98 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:102 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:99 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:100 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:97 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:96 +; ALIGNED-NEXT: flat_store_byte v[4:5], 
v0 offset:93 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 @@ -15308,52 +15473,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:90 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:87 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:91 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:88 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:89 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:86 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:95 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:92 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:93 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:90 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:94 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:91 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:92 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 
offset:89 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:88 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:85 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:82 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:79 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:83 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:80 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:81 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:78 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:87 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:84 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:85 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:82 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:86 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:83 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:84 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:81 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) 
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:80 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:77 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 @@ -15368,49 +15533,49 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:74 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:71 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:75 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:72 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:73 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:70 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:79 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:76 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:77 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:74 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:78 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:75 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:76 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:73 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:72 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:69 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:66 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:63 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:67 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:64 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:65 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:62 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:71 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:68 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:69 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:66 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:70 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:67 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:68 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:65 ; ALIGNED-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:924 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:64 @@ -15428,52 +15593,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:61 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:58 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:58 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:55 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:59 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:56 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:57 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:54 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:63 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:60 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:59 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:62 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:57 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt 
vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:60 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:56 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:53 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:53 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:50 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:50 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:47 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:51 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:48 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:49 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:46 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:55 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:52 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:54 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:51 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:52 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:49 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte 
Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:48 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:45 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 @@ -15488,149 +15653,147 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:43 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:40 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:42 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:39 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:41 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:38 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:40 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:47 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:37 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:46 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:44 +; ALIGNED-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:816 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:43 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:45 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:42 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:44 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:41 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:35 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:32 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:34 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:31 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:33 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:30 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:32 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:39 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:36 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:38 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:35 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:37 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:34 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:36 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:33 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:640 +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:640 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:26 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:23 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:27 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:24 ; ALIGNED-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:748 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:25 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:22 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:31 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:28 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:29 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:30 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:27 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:28 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:25 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[2:3], v62 offset:18 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:21 +; ALIGNED-NEXT: flat_store_byte v[4:5], v88 offset:15 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[2:3], v76 offset:17 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:16 +; ALIGNED-NEXT: flat_store_byte v[4:5], v89 offset:14 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:23 +; ALIGNED-NEXT: 
flat_store_byte v[4:5], v0 offset:20 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:21 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:18 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:22 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:19 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[2:3], v104 offset:16 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:17 +; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:16 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:10 -; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11 
-; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13 -; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9 +; ALIGNED-NEXT: flat_store_byte v[4:5], v109 offset:7 +; ALIGNED-NEXT: flat_store_byte v[4:5], v120 offset:8 +; ALIGNED-NEXT: flat_store_byte v[4:5], v125 offset:10 +; ALIGNED-NEXT: flat_store_byte v[4:5], v110 offset:6 +; ALIGNED-NEXT: flat_store_byte v[4:5], v126 offset:12 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:15 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:11 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:14 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:12 -; ALIGNED-NEXT: flat_store_byte v[2:3], v122 offset:8 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:9 +; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:8 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:2 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:3 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:1 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:7 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:5 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:4 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:6 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:2 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:3 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:4 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll b/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll index 9713689217cf7..db82530f66aa4 100644 --- a/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll @@ -49,16 +49,18 @@ define <3 x i64> @v3_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0 +; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_load_dword v4, v[0:1] offset:20 -; CHECK-NEXT: flat_load_dword v6, v[2:3] offset:16 +; CHECK-NEXT: flat_load_dword v4, v[2:3] offset:16 ; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] -; CHECK-NEXT: v_mov_b32_e32 v1, -1 ; CHECK-NEXT: v_mov_b32_e32 v3, -1 +; CHECK-NEXT: flat_load_dword 
v1, v[0:1] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_ashrrev_i32_e32 v4, v6, v4 ; CHECK-NEXT: v_ashrrev_i32_e32 v0, v8, v5 ; CHECK-NEXT: v_ashrrev_i32_e32 v2, v10, v7 +; CHECK-NEXT: v_ashrrev_i32_e32 v4, v4, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, -1 ; CHECK-NEXT: v_mov_b32_e32 v5, -1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %val = load <3 x i64>, ptr %arg0.ptr, !range !4, !noundef !{} diff --git a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll index 7e4be65898b65..587e454da884c 100644 --- a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll +++ b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll @@ -11,63 +11,73 @@ define protected amdgpu_kernel void @no_folding_imm_to_inst_with_fi(<4 x i64> %v ; CHECK-NEXT: s_mov_b64 s[34:35], src_private_base ; CHECK-NEXT: s_movk_i32 s34, 0x80 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: v_dual_mov_b32 v20, s34 :: v_dual_mov_b32 v21, s35 +; CHECK-NEXT: s_add_nc_u64 s[44:45], s[34:35], 0x70 +; CHECK-NEXT: v_dual_mov_b32 v26, s34 :: v_dual_mov_b32 v27, s35 +; CHECK-NEXT: v_dual_mov_b32 v20, s44 :: v_dual_mov_b32 v21, s45 ; CHECK-NEXT: s_wait_kmcnt 0x0 ; CHECK-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v1, s41 ; CHECK-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 ; CHECK-NEXT: v_dual_mov_b32 v4, s36 :: v_dual_mov_b32 v5, s37 ; CHECK-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v7, s39 +; CHECK-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; CHECK-NEXT: s_add_nc_u64 s[24:25], s[34:35], 0x60 ; CHECK-NEXT: v_dual_mov_b32 v8, s28 :: v_dual_mov_b32 v9, s29 ; CHECK-NEXT: v_dual_mov_b32 v10, s30 :: v_dual_mov_b32 v11, s31 -; CHECK-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; CHECK-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; CHECK-NEXT: v_dual_mov_b32 v16, s20 :: v_dual_mov_b32 v17, s21 +; CHECK-NEXT: s_add_nc_u64 s[20:21], s[34:35], 0x50 +; CHECK-NEXT: v_dual_mov_b32 
v14, s26 :: v_dual_mov_b32 v15, s27 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: v_dual_mov_b32 v22, s24 :: v_dual_mov_b32 v23, s25 ; CHECK-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23 +; CHECK-NEXT: v_dual_mov_b32 v25, s21 :: v_dual_mov_b32 v24, s20 ; CHECK-NEXT: scratch_store_b128 off, v[0:3], off offset:16 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: scratch_store_b128 off, v[4:7], off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] offset:112 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[12:15] offset:96 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[22:23], v[12:15] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[16:19] offset:80 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[24:25], v[16:19] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 +; CHECK-NEXT: s_add_nc_u64 s[12:13], s[34:35], 48 ; CHECK-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 ; CHECK-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 -; CHECK-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 ; CHECK-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: v_dual_mov_b32 v29, s13 :: v_dual_mov_b32 v28, s12 ; CHECK-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; CHECK-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 ; CHECK-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5 ; CHECK-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7 ; CHECK-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 ; CHECK-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 -; CHECK-NEXT: flat_store_b128 v[20:21], v[0:3] offset:64 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[26:27], v[0:3] offset:64 scope:SCOPE_SYS ; 
CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[4:7] offset:48 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[28:29], v[4:7] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] offset:32 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[26:27], v[8:11] offset:32 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[12:15] offset:16 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[26:27], v[12:15] offset:16 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[16:19] scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[26:27], v[16:19] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:96 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[22:23] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:112 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:64 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] offset:64 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:80 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[24:25] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:32 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] offset:32 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:48 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[28:29] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: 
flat_load_b128 v[0:3], v[20:21] offset:16 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] offset:16 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll index 4b03896043dbb..23468a128285d 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -304,62 +304,72 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: ; use s29 ; GFX906-NEXT: ;;#ASMEND ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX906-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload 
-; GFX906-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX906-NEXT: 
buffer_load_dword v5, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v27, off, s[0:3], s33 
offset:112 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v34, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v35, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload ; GFX906-NEXT: v_readlane_b32 s30, v41, 0 ; GFX906-NEXT: v_readlane_b32 s31, v41, 1 ; GFX906-NEXT: s_mov_b32 s32, s33 ; GFX906-NEXT: v_readlane_b32 s4, v41, 4 ; GFX906-NEXT: v_readlane_b32 s34, v41, 2 ; GFX906-NEXT: v_readlane_b32 s35, v41, 3 +; GFX906-NEXT: s_waitcnt vmcnt(33) +; GFX906-NEXT: v_add_co_u32_e32 v0, vcc, 0x70, v2 +; GFX906-NEXT: s_waitcnt vmcnt(32) +; GFX906-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112 +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[32:35] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:96 +; GFX906-NEXT: v_add_co_u32_e32 v0, vcc, 0x60, v2 +; GFX906-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[28:31] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:80 +; GFX906-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v2 +; GFX906-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:64 +; GFX906-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64 ; 
GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[14:17] offset:48 +; GFX906-NEXT: v_add_co_u32_e32 v0, vcc, 48, v2 +; GFX906-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[10:13] offset:32 +; GFX906-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[6:9] offset:16 +; GFX906-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GFX906-NEXT: flat_store_dwordx4 v[2:3], v[4:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX906-NEXT: s_xor_saveexec_b64 s[6:7], -1 @@ -676,57 +686,67 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: ; use s29 ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:60 ; 
4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload -; 
GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v23, off, 
s[0:3], s33 offset:104 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v35, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload ; GFX908-NEXT: s_mov_b64 s[4:5], exec +; GFX908-NEXT: s_waitcnt vmcnt(33) +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x70, v2 +; GFX908-NEXT: s_waitcnt vmcnt(32) +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112 +; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[32:35] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:96 +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x60, v2 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[28:31] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:80 +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v2 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; 
GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:64 +; GFX908-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[14:17] offset:48 +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 48, v2 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[10:13] offset:32 +; GFX908-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[6:9] offset:16 +; GFX908-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GFX908-NEXT: flat_store_dwordx4 v[2:3], v[4:7] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, 3 ; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:168 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir index 93f7bcc478737..30cc241b55271 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir @@ -1,16 +1,32 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -passes=postmisched -o - %s | FileCheck %s +# Ensure syncscope IDs defined in expected order +# SSID 2 = workgroup +# SSID 3 = wavefront +--- | + define amdgpu_cs void @test_workgroup() { + fence syncscope("workgroup") acq_rel + fence syncscope("wavefront") acq_rel + ret void + } + define amdgpu_cs void @test_wavefront() { + fence syncscope("workgroup") acq_rel + fence syncscope("wavefront") acq_rel + ret void + } +... + # Ensure WMMA operations stay before the final atomic fence and barrier group. 
# This allows the latency of the WMMA operations to be hidden by barrier wait. --- -name: test +name: test_workgroup tracksRegLiveness: true body: | bb.0: liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32 - ; CHECK-LABEL: name: test + ; CHECK-LABEL: name: test_workgroup ; CHECK: liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: ATOMIC_FENCE 5, 2 @@ -81,3 +97,262 @@ body: | ATOMIC_FENCE 4, 2 ... + +# Ensure VALU operations are not unduely redistributed between wavefront fences +# causing a loss of latency hiding. 
+--- +name: test_wavefront +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr14 + ; CHECK-LABEL: name: test_wavefront + ; CHECK: liveins: $vgpr1, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr14 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: ATOMIC_FENCE 6, 2 + ; CHECK-NEXT: S_BARRIER + ; CHECK-NEXT: BUNDLE implicit killed $vgpr9, implicit killed $vgpr10, implicit killed $vgpr12, implicit $exec, implicit killed $vgpr8, implicit killed $vgpr11, implicit killed $vgpr14 { + ; CHECK-NEXT: DS_WRITE2_B32_gfx9 killed $vgpr9, killed $vgpr10, killed $vgpr12, 0, 16, 0, implicit $exec + ; CHECK-NEXT: DS_WRITE2ST64_B32_gfx9 killed $vgpr8, killed $vgpr11, killed $vgpr14, 0, 4, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 2 + ; CHECK-NEXT: S_BARRIER + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr34, implicit-def $vgpr35, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr34 = DS_READ_B32_gfx9 $vgpr7, 2096, 0, implicit $exec + ; CHECK-NEXT: $vgpr35 = DS_READ_B32_gfx9 $vgpr6, 768, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr36, implicit-def $vgpr37, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr36 = DS_READ_B32_gfx9 $vgpr7, 2100, 0, implicit $exec + ; CHECK-NEXT: $vgpr37 = DS_READ_B32_gfx9 $vgpr6, 832, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr38, implicit-def $vgpr39, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr38 = DS_READ_B32_gfx9 $vgpr7, 2104, 0, implicit $exec + ; CHECK-NEXT: $vgpr39 = DS_READ_B32_gfx9 $vgpr6, 896, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr40, implicit-def $vgpr41, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + 
; CHECK-NEXT: $vgpr40 = DS_READ_B32_gfx9 $vgpr7, 2108, 0, implicit $exec + ; CHECK-NEXT: $vgpr41 = DS_READ_B32_gfx9 $vgpr6, 960, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr42, implicit-def $vgpr43, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr42 = DS_READ_B32_gfx9 $vgpr7, 2112, 0, implicit $exec + ; CHECK-NEXT: $vgpr43 = DS_READ_B32_gfx9 $vgpr6, 1024, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr44, implicit-def $vgpr45, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr44 = DS_READ_B32_gfx9 $vgpr7, 2116, 0, implicit $exec + ; CHECK-NEXT: $vgpr45 = DS_READ_B32_gfx9 $vgpr6, 1088, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr46, implicit-def $vgpr47, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr46 = DS_READ_B32_gfx9 $vgpr7, 2120, 0, implicit $exec + ; CHECK-NEXT: $vgpr47 = DS_READ_B32_gfx9 $vgpr6, 1152, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr11, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr10 = DS_READ_B32_gfx9 $vgpr7, 2124, 0, implicit $exec + ; CHECK-NEXT: $vgpr11 = DS_READ_B32_gfx9 $vgpr6, 1216, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr12, implicit-def $vgpr13, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr12 = DS_READ_B32_gfx9 $vgpr7, 2128, 0, implicit $exec + ; CHECK-NEXT: $vgpr13 = DS_READ_B32_gfx9 $vgpr6, 1280, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr14, implicit-def $vgpr15, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr14 = DS_READ_B32_gfx9 $vgpr7, 2132, 0, implicit $exec + ; 
CHECK-NEXT: $vgpr15 = DS_READ_B32_gfx9 $vgpr6, 1344, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr34, killed $vgpr35, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: BUNDLE implicit-def $vgpr16, implicit-def $vgpr17, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr16 = DS_READ_B32_gfx9 $vgpr7, 2136, 0, implicit $exec + ; CHECK-NEXT: $vgpr17 = DS_READ_B32_gfx9 $vgpr6, 1408, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr18, implicit-def $vgpr19, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr18 = DS_READ_B32_gfx9 $vgpr7, 2140, 0, implicit $exec + ; CHECK-NEXT: $vgpr19 = DS_READ_B32_gfx9 $vgpr6, 1472, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr36, killed $vgpr37, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: BUNDLE implicit-def $vgpr20, implicit-def $vgpr21, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr20 = DS_READ_B32_gfx9 $vgpr7, 2144, 0, implicit $exec + ; CHECK-NEXT: $vgpr21 = DS_READ_B32_gfx9 $vgpr6, 1536, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr22, implicit-def $vgpr23, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr22 = DS_READ_B32_gfx9 $vgpr7, 2148, 0, implicit $exec + ; CHECK-NEXT: $vgpr23 = DS_READ_B32_gfx9 $vgpr6, 1600, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr38, killed $vgpr39, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: BUNDLE implicit-def $vgpr24, implicit-def $vgpr25, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr24 = DS_READ_B32_gfx9 $vgpr7, 2152, 0, implicit $exec + ; CHECK-NEXT: 
$vgpr25 = DS_READ_B32_gfx9 $vgpr6, 1664, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr26, implicit-def $vgpr27, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr26 = DS_READ_B32_gfx9 $vgpr7, 2156, 0, implicit $exec + ; CHECK-NEXT: $vgpr27 = DS_READ_B32_gfx9 $vgpr6, 1728, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr40, killed $vgpr41, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: BUNDLE implicit-def $vgpr28, implicit-def $vgpr29, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr28 = DS_READ_B32_gfx9 $vgpr7, 2160, 0, implicit $exec + ; CHECK-NEXT: $vgpr29 = DS_READ_B32_gfx9 $vgpr6, 1792, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr30, implicit-def $vgpr31, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + ; CHECK-NEXT: $vgpr30 = DS_READ_B32_gfx9 $vgpr7, 2164, 0, implicit $exec + ; CHECK-NEXT: $vgpr31 = DS_READ_B32_gfx9 $vgpr6, 1856, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: ATOMIC_FENCE 6, 3 + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr42, killed $vgpr43, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: BUNDLE implicit-def $vgpr32, implicit-def $vgpr33, implicit killed $vgpr7, implicit $exec, implicit killed $vgpr6 { + ; CHECK-NEXT: $vgpr32 = DS_READ_B32_gfx9 killed $vgpr7, 2168, 0, implicit $exec + ; CHECK-NEXT: $vgpr33 = DS_READ_B32_gfx9 killed $vgpr6, 1920, 0, implicit $exec + ; CHECK-NEXT: } + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr44, killed $vgpr45, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr46, killed $vgpr47, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr10, killed $vgpr11, killed $vgpr1, 
implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr12, killed $vgpr13, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr14, killed $vgpr15, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr16, killed $vgpr17, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr18, killed $vgpr19, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr20, killed $vgpr21, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr22, killed $vgpr23, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr24, killed $vgpr25, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr26, killed $vgpr27, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr28, killed $vgpr29, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr30, killed $vgpr31, killed $vgpr1, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr32, killed $vgpr33, killed $vgpr1, implicit $mode, implicit $exec + ATOMIC_FENCE 6, 3 + ATOMIC_FENCE 6, 2 + S_BARRIER + BUNDLE implicit $vgpr9, implicit killed $vgpr10, implicit killed $vgpr12, implicit $exec, implicit $vgpr8, implicit killed $vgpr11, implicit killed $vgpr14 { + DS_WRITE2_B32_gfx9 $vgpr9, killed $vgpr10, killed $vgpr12, 0, 16, 0, implicit $exec + DS_WRITE2ST64_B32_gfx9 $vgpr8, killed $vgpr11, killed $vgpr14, 0, 4, 0, implicit $exec + } + ATOMIC_FENCE 6, 2 + S_BARRIER + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr34, implicit-def $vgpr35, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + 
$vgpr34 = DS_READ_B32_gfx9 $vgpr7, 2096, 0, implicit $exec + $vgpr35 = DS_READ_B32_gfx9 $vgpr6, 768, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr36, implicit-def $vgpr37, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr36 = DS_READ_B32_gfx9 $vgpr7, 2100, 0, implicit $exec + $vgpr37 = DS_READ_B32_gfx9 $vgpr6, 832, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr38, implicit-def $vgpr39, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr38 = DS_READ_B32_gfx9 $vgpr7, 2104, 0, implicit $exec + $vgpr39 = DS_READ_B32_gfx9 $vgpr6, 896, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr40, implicit-def $vgpr41, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr40 = DS_READ_B32_gfx9 $vgpr7, 2108, 0, implicit $exec + $vgpr41 = DS_READ_B32_gfx9 $vgpr6, 960, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr42, implicit-def $vgpr43, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr42 = DS_READ_B32_gfx9 $vgpr7, 2112, 0, implicit $exec + $vgpr43 = DS_READ_B32_gfx9 $vgpr6, 1024, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr44, implicit-def $vgpr45, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr44 = DS_READ_B32_gfx9 $vgpr7, 2116, 0, implicit $exec + $vgpr45 = DS_READ_B32_gfx9 $vgpr6, 1088, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr46, implicit-def $vgpr47, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr46 = DS_READ_B32_gfx9 $vgpr7, 2120, 0, implicit $exec + $vgpr47 = DS_READ_B32_gfx9 $vgpr6, 1152, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr10, implicit-def $vgpr11, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr10 = DS_READ_B32_gfx9 $vgpr7, 2124, 0, implicit $exec + $vgpr11 = DS_READ_B32_gfx9 $vgpr6, 1216, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr12, implicit-def $vgpr13, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + 
$vgpr12 = DS_READ_B32_gfx9 $vgpr7, 2128, 0, implicit $exec + $vgpr13 = DS_READ_B32_gfx9 $vgpr6, 1280, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr14, implicit-def $vgpr15, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr14 = DS_READ_B32_gfx9 $vgpr7, 2132, 0, implicit $exec + $vgpr15 = DS_READ_B32_gfx9 $vgpr6, 1344, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr16, implicit-def $vgpr17, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr16 = DS_READ_B32_gfx9 $vgpr7, 2136, 0, implicit $exec + $vgpr17 = DS_READ_B32_gfx9 $vgpr6, 1408, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr18, implicit-def $vgpr19, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr18 = DS_READ_B32_gfx9 $vgpr7, 2140, 0, implicit $exec + $vgpr19 = DS_READ_B32_gfx9 $vgpr6, 1472, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr20, implicit-def $vgpr21, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr20 = DS_READ_B32_gfx9 $vgpr7, 2144, 0, implicit $exec + $vgpr21 = DS_READ_B32_gfx9 $vgpr6, 1536, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr22, implicit-def $vgpr23, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr22 = DS_READ_B32_gfx9 $vgpr7, 2148, 0, implicit $exec + $vgpr23 = DS_READ_B32_gfx9 $vgpr6, 1600, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr24, implicit-def $vgpr25, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr24 = DS_READ_B32_gfx9 $vgpr7, 2152, 0, implicit $exec + $vgpr25 = DS_READ_B32_gfx9 $vgpr6, 1664, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr26, implicit-def $vgpr27, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr26 = DS_READ_B32_gfx9 $vgpr7, 2156, 0, implicit $exec + $vgpr27 = DS_READ_B32_gfx9 $vgpr6, 1728, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr28, implicit-def $vgpr29, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + 
$vgpr28 = DS_READ_B32_gfx9 $vgpr7, 2160, 0, implicit $exec + $vgpr29 = DS_READ_B32_gfx9 $vgpr6, 1792, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr30, implicit-def $vgpr31, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr30 = DS_READ_B32_gfx9 $vgpr7, 2164, 0, implicit $exec + $vgpr31 = DS_READ_B32_gfx9 $vgpr6, 1856, 0, implicit $exec + } + ATOMIC_FENCE 6, 3 + BUNDLE implicit-def $vgpr32, implicit-def $vgpr33, implicit $vgpr7, implicit $exec, implicit $vgpr6 { + $vgpr32 = DS_READ_B32_gfx9 $vgpr7, 2168, 0, implicit $exec + $vgpr33 = DS_READ_B32_gfx9 $vgpr6, 1920, 0, implicit $exec + } + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr34, killed $vgpr35, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr36, killed $vgpr37, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr38, killed $vgpr39, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr40, killed $vgpr41, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr42, killed $vgpr43, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr44, killed $vgpr45, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr46, killed $vgpr47, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr10, killed $vgpr11, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr12, killed $vgpr13, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr14, killed $vgpr15, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr16, killed $vgpr17, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr18, killed $vgpr19, killed $vgpr1, implicit $mode, implicit 
$exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr20, killed $vgpr21, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr22, killed $vgpr23, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr24, killed $vgpr25, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr26, killed $vgpr27, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr28, killed $vgpr29, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr30, killed $vgpr31, killed $vgpr1, implicit $mode, implicit $exec + $vgpr1 = nofpexcept V_FMAC_F32_e32 killed $vgpr32, killed $vgpr33, killed $vgpr1, implicit $mode, implicit $exec +... diff --git a/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll b/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll index f47c2f2a85156..b773c8d11248a 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll @@ -16,3 +16,263 @@ define i32 @cvt_rna_satfinite_tf32_f32(float %f1) { %val = call i32 @llvm.nvvm.f2tf32.rna.satfinite(float %f1) ret i32 %val } + +define <2 x bfloat> @cvt_rn_bf16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_bf16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_bf16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_bf16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rn.satfinite.bf16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.satfinite(float %f1, float %f2) + ret <2 x bfloat> %val +} + +define <2 x bfloat> @cvt_rn_relu_bf16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_relu_bf16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, 
[cvt_rn_relu_bf16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_relu_bf16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rn.relu.satfinite.bf16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu.satfinite(float %f1, float %f2) + ret <2 x bfloat> %val +} + +define <2 x bfloat> @cvt_rz_bf16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rz_bf16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_bf16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_bf16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rz.satfinite.bf16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.satfinite(float %f1, float %f2) + ret <2 x bfloat> %val +} + +define <2 x bfloat> @cvt_rz_relu_bf16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rz_relu_bf16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_bf16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_relu_bf16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rz.relu.satfinite.bf16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu.satfinite(float %f1, float %f2) + ret <2 x bfloat> %val +} + +declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.satfinite(float, float) +declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu.satfinite(float, float) +declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.satfinite(float, float) +declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu.satfinite(float, float) + +define <2 x half> @cvt_rn_f16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_f16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: 
ld.param.b32 %r1, [cvt_rn_f16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_f16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rn.satfinite.f16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.ff2f16x2.rn.satfinite(float %f1, float %f2) + ret <2 x half> %val +} + +define <2 x half> @cvt_rn_relu_f16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_relu_f16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_f16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_relu_f16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rn.relu.satfinite.f16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.ff2f16x2.rn.relu.satfinite(float %f1, float %f2) + ret <2 x half> %val +} + +define <2 x half> @cvt_rz_f16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rz_f16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_f16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_f16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rz.satfinite.f16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.ff2f16x2.rz.satfinite(float %f1, float %f2) + ret <2 x half> %val +} + +define <2 x half> @cvt_rz_relu_f16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rz_relu_f16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_f16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_relu_f16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rz.relu.satfinite.f16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x half> 
@llvm.nvvm.ff2f16x2.rz.relu.satfinite(float %f1, float %f2) + ret <2 x half> %val +} + +declare <2 x half> @llvm.nvvm.ff2f16x2.rn.satfinite(float, float) +declare <2 x half> @llvm.nvvm.ff2f16x2.rn.relu.satfinite(float, float) +declare <2 x half> @llvm.nvvm.ff2f16x2.rz.satfinite(float, float) +declare <2 x half> @llvm.nvvm.ff2f16x2.rz.relu.satfinite(float, float) + +define bfloat @cvt_rn_bf16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rn_bf16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_bf16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rn.satfinite.bf16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call bfloat @llvm.nvvm.f2bf16.rn.satfinite(float %f1) + ret bfloat %val +} + +define bfloat @cvt_rn_relu_bf16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rn_relu_bf16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_bf16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rn.relu.satfinite.bf16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call bfloat @llvm.nvvm.f2bf16.rn.relu.satfinite(float %f1) + ret bfloat %val +} + +define bfloat @cvt_rz_bf16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rz_bf16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_bf16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rz.satfinite.bf16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call bfloat @llvm.nvvm.f2bf16.rz.satfinite(float %f1) + ret bfloat %val +} + +define bfloat @cvt_rz_relu_bf16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rz_relu_bf16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: 
+; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_bf16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rz.relu.satfinite.bf16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call bfloat @llvm.nvvm.f2bf16.rz.relu.satfinite(float %f1) + ret bfloat %val +} + +declare bfloat @llvm.nvvm.f2bf16.rn.satfinite(float) +declare bfloat @llvm.nvvm.f2bf16.rn.relu.satfinite(float) +declare bfloat @llvm.nvvm.f2bf16.rz.satfinite(float) +declare bfloat @llvm.nvvm.f2bf16.rz.relu.satfinite(float) + +define half @cvt_rn_f16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rn_f16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_f16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rn.satfinite.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rn.satfinite(float %f1) + ret half %val +} + +define half @cvt_rn_relu_f16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rn_relu_f16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_f16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rn.relu.satfinite.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rn.relu.satfinite(float %f1) + ret half %val +} + +define half @cvt_rz_f16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rz_f16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_f16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rz.satfinite.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rz.satfinite(float %f1) + ret half %val +} + +define half @cvt_rz_relu_f16_f32_sf(float %f1) { 
+; CHECK-LABEL: cvt_rz_relu_f16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_f16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rz.relu.satfinite.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rz.relu.satfinite(float %f1) + ret half %val +} + +declare half @llvm.nvvm.f2f16.rn.satfinite(float) +declare half @llvm.nvvm.f2f16.rn.relu.satfinite(float) +declare half @llvm.nvvm.f2f16.rz.satfinite(float) +declare half @llvm.nvvm.f2f16.rz.relu.satfinite(float) diff --git a/llvm/test/CodeGen/NVPTX/convert-sm80.ll b/llvm/test/CodeGen/NVPTX/convert-sm80.ll index edf1739ae9928..a47bbabdd448c 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm80.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm80.ll @@ -198,6 +198,71 @@ declare bfloat @llvm.nvvm.f2bf16.rn.relu(float) declare bfloat @llvm.nvvm.f2bf16.rz(float) declare bfloat @llvm.nvvm.f2bf16.rz.relu(float) +define half @cvt_rn_f16_f32(float %f1) { +; CHECK-LABEL: cvt_rn_f16_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_f16_f32_param_0]; +; CHECK-NEXT: cvt.rn.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rn(float %f1) + ret half %val +} + +define half @cvt_rn_relu_f16_f32(float %f1) { +; CHECK-LABEL: cvt_rn_relu_f16_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_f16_f32_param_0]; +; CHECK-NEXT: cvt.rn.relu.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rn.relu(float %f1) + ret half %val +} + +define half @cvt_rz_f16_f32(float %f1) { +; CHECK-LABEL: 
cvt_rz_f16_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_f16_f32_param_0]; +; CHECK-NEXT: cvt.rz.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rz(float %f1) + ret half %val +} + +define half @cvt_rz_relu_f16_f32(float %f1) { +; CHECK-LABEL: cvt_rz_relu_f16_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_f16_f32_param_0]; +; CHECK-NEXT: cvt.rz.relu.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rz.relu(float %f1) + ret half %val +} + +declare half @llvm.nvvm.f2f16.rn(float) +declare half @llvm.nvvm.f2f16.rn.relu(float) +declare half @llvm.nvvm.f2f16.rz(float) +declare half @llvm.nvvm.f2f16.rz.relu(float) + define i32 @cvt_rna_tf32_f32(float %f1) { ; CHECK-LABEL: cvt_rna_tf32_f32( ; CHECK: { diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll new file mode 100644 index 0000000000000..d46408e31752f --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | %ptxas-verify -arch=sm_90 %} + +define void @test_nvvm_fence_proxy_async_generic_acquire_sync_restrict() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_generic_acquire_sync_restrict( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster; +; 
CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster() + ret void +} + +define void @test_nvvm_fence_proxy_async_generic_release_sync_restrict() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_generic_release_sync_restrict( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster() + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll new file mode 100644 index 0000000000000..896c624602a60 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %} + +define void @test_nvvm_fence_proxy_async() { +; CHECK-LABEL: test_nvvm_fence_proxy_async( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async() + ret void +} + +define void @test_nvvm_fence_proxy_async_global() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_global( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async.global; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async.global() + ret void +} + +define void @test_nvvm_fence_proxy_async_shared_cluster() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_shared_cluster( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async.shared::cluster; +; CHECK-NEXT: ret; + call void 
@llvm.nvvm.fence.proxy.async.shared_cluster() + ret void +} + +define void @test_nvvm_fence_proxy_async_shared_cta() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_shared_cta( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async.shared::cta; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async.shared_cta() + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll new file mode 100644 index 0000000000000..ab35e4fb396d6 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll @@ -0,0 +1,8 @@ +; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx83 -o /dev/null 2>&1 | FileCheck %s + +define void @test_fence_proxy_tensormap_generic_acquire(ptr addrspace(0) %addr) { + ; CHECK: immarg value 130 out of range [128, 129) + call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr addrspace(0) %addr, i32 130); + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy.ll b/llvm/test/CodeGen/NVPTX/fence-proxy.ll new file mode 100644 index 0000000000000..cb5679e68944d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy.ll @@ -0,0 +1,15 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx75 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_70 && ptxas-isa-7.5 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx75 | %ptxas-verify -arch=sm_70 %} + +define void @test_nvvm_fence_proxy_alias() { +; CHECK-LABEL: test_nvvm_fence_proxy_alias( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.alias; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.alias() + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/op-fence.ll b/llvm/test/CodeGen/NVPTX/op-fence.ll new file mode 100644 index 0000000000000..629b702742afb --- /dev/null +++ 
b/llvm/test/CodeGen/NVPTX/op-fence.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %} + +; CHECK-LABEL: test_fence_mbarrier_init +define void @test_fence_mbarrier_init() { +; CHECK-LABEL: test_fence_mbarrier_init( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.mbarrier_init.release.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.mbarrier_init.release.cluster(); + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/thread-fence.ll b/llvm/test/CodeGen/NVPTX/thread-fence.ll new file mode 100644 index 0000000000000..185461bd183d0 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/thread-fence.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | %ptxas-verify -arch=sm_90 %} + +; CHECK-LABEL: test_fence_acquire +define void @test_fence_acquire() { +; CHECK-LABEL: test_fence_acquire( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.acquire.sync_restrict::shared::cluster.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster(); + + ret void +} + +; CHECK-LABEL: test_fence_release +define void @test_fence_release() { +; CHECK-LABEL: test_fence_release( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.release.sync_restrict::shared::cta.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster(); + + ret 
void +} diff --git a/llvm/test/CodeGen/PowerPC/saddo-ssubo.ll b/llvm/test/CodeGen/PowerPC/saddo-ssubo.ll index c0f3b60122521..5cd96ec219404 100644 --- a/llvm/test/CodeGen/PowerPC/saddo-ssubo.ll +++ b/llvm/test/CodeGen/PowerPC/saddo-ssubo.ll @@ -49,12 +49,11 @@ entry: define i1 @test_saddo_i32(i32 %a, i32 %b) nounwind { ; CHECK-LABEL: test_saddo_i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: add 5, 3, 4 -; CHECK-NEXT: cmpwi 1, 4, 0 -; CHECK-NEXT: cmpw 5, 3 -; CHECK-NEXT: li 3, 1 -; CHECK-NEXT: creqv 20, 4, 0 -; CHECK-NEXT: isel 3, 0, 3, 20 +; CHECK-NEXT: xor 5, 3, 4 +; CHECK-NEXT: add 4, 3, 4 +; CHECK-NEXT: xor 3, 4, 3 +; CHECK-NEXT: andc 3, 3, 5 +; CHECK-NEXT: srwi 3, 3, 31 ; CHECK-NEXT: blr entry: %res = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind @@ -65,12 +64,11 @@ entry: define i1 @test_saddo_i64(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: test_saddo_i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: add 5, 3, 4 -; CHECK-NEXT: cmpdi 1, 4, 0 -; CHECK-NEXT: cmpd 5, 3 -; CHECK-NEXT: li 3, 1 -; CHECK-NEXT: creqv 20, 4, 0 -; CHECK-NEXT: isel 3, 0, 3, 20 +; CHECK-NEXT: xor 5, 3, 4 +; CHECK-NEXT: add 4, 3, 4 +; CHECK-NEXT: xor 3, 4, 3 +; CHECK-NEXT: andc 3, 3, 5 +; CHECK-NEXT: rldicl 3, 3, 1, 63 ; CHECK-NEXT: blr entry: %res = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind diff --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll index cf5d0f107359a..c25e337777631 100644 --- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll +++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt %s -S -riscv-codegenprepare -mtriple=riscv64 | FileCheck %s +; RUN: opt %s -S -passes=riscv-codegenprepare -mtriple=riscv64 | FileCheck %s ; Make sure we convert the 4294967294 in for.body.preheader.new to -2 based on ; the upper 33 bits being zero by the dominating condition %cmp3. 
diff --git a/llvm/test/CodeGen/X86/vector-compress-freeze.ll b/llvm/test/CodeGen/X86/vector-compress-freeze.ll new file mode 100644 index 0000000000000..981557f9b56cf --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-compress-freeze.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl | FileCheck %s + +declare <16 x i32> @llvm.experimental.vector.compress.v16i32(<16 x i32>, <16 x i1>, <16 x i32>) + +define <16 x i32> @test_compress_freeze_elimination(<16 x i32> %a0, <16 x i32> %a1, <16 x i8> %a3) { +; CHECK-LABEL: test_compress_freeze_elimination: +; CHECK: # %bb.0: +; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero +; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; CHECK-NEXT: vpcompressd %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %cmp = icmp sgt <16 x i32> %a0, %a1 + %ext = zext <16 x i8> %a3 to <16 x i32> + %cpr = call <16 x i32> @llvm.experimental.vector.compress.v16i32(<16 x i32> %ext, <16 x i1> %cmp, <16 x i32> splat(i32 15)) + %fr = freeze <16 x i32> %cpr + %and = and <16 x i32> %fr, splat(i32 255) + ret <16 x i32> %and +} + +define <16 x i32> @test_compress_freeze(<16 x i32> %a0, <16 x i32> %a1, <16 x i8> %a3) { +; CHECK-LABEL: test_compress_freeze: +; CHECK: # %bb.0: +; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero +; CHECK-NEXT: vpcompressd %zmm0, %zmm0 {%k1} +; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; CHECK-NEXT: retq + %cmp = icmp sgt <16 x i32> %a0, %a1 + %ext = zext <16 x i8> %a3 to <16 x i32> + %cpr = call <16 x i32> @llvm.experimental.vector.compress.v16i32(<16 x i32> %ext, <16 x i1> %cmp, <16 x i32> poison) + %fr = freeze <16 x i32> %cpr + %and = and <16 x i32> %fr, splat(i32 255) + ret <16 x i32> %and +} diff --git a/llvm/test/TableGen/target-specialized-pseudos.td b/llvm/test/TableGen/target-specialized-pseudos.td new file mode 100644 index 0000000000000..99c63f3ec29d9 --- /dev/null +++ b/llvm/test/TableGen/target-specialized-pseudos.td @@ -0,0 +1,101 @@ +// RUN: llvm-tblgen -gen-instr-info -I %p/../../include %s -DONECASE -o - | FileCheck -check-prefixes=CHECK,ONECASE %s +// RUN: llvm-tblgen -gen-instr-info -I %p/../../include %s -DALLCASES -o - | FileCheck -check-prefixes=CHECK,ALLCASES %s +// RUN: not llvm-tblgen -gen-instr-info -I %p/../../include %s -DERROR -o /dev/null 2>&1 | FileCheck -check-prefix=ERROR %s + +// CHECK: namespace llvm::MyTarget { +// CHECK: enum { +// CHECK: LOAD_STACK_GUARD = [[LOAD_STACK_GUARD_OPCODE:[0-9]+]], +// CHECK: PREALLOCATED_ARG = [[PREALLOCATED_ARG_OPCODE:[0-9]+]], +// CHECK: PATCHABLE_EVENT_CALL = [[PATCHABLE_EVENT_CALL_OPCODE:[0-9]+]], +// CHECK: PATCHABLE_TYPED_EVENT_CALL = [[PATCHABLE_TYPED_EVENT_CALL_OPCODE:[0-9]+]], + +// Make sure no enum entry is emitted for MY_LOAD_STACK_GUARD +// CHECK: G_UBFX = [[G_UBFX_OPCODE:[0-9]+]], +// CHECK-NEXT: MY_MOV = [[MY_MOV_OPCODE:[0-9]+]], +// CHECK-NEXT: INSTRUCTION_LIST_END = 
[[INSTR_LIST_END_OPCODE:[0-9]+]] + + +// CHECK: extern const MyTargetInstrTable MyTargetDescs = { +// CHECK-NEXT: { +// CHECK-NEXT: { [[MY_MOV_OPCODE]], 2, 1, 2, 0, 0, 0, {{[0-9]+}}, MyTargetImpOpBase + 0, 0|(1ULL< + : Register { + let Namespace = "MyTarget"; +} + +class MyClass types, dag registers> + : RegisterClass<"MyTarget", types, size, registers> { + let Size = size; +} + +def X0 : MyReg<"x0">; +def X1 : MyReg<"x1">; +def XRegs : RegisterClass<"MyTarget", [i64], 64, (add X0, X1)>; + + +class TestInstruction : Instruction { + let Size = 2; + let Namespace = "MyTarget"; + let hasSideEffects = false; +} + +#ifdef ONECASE + +// Example setting the pointer register class manually +def MY_LOAD_STACK_GUARD : + TargetSpecializedStandardPseudoInstruction { + let Namespace = "MyTarget"; + let OutOperandList = (outs XRegs:$dst); +} + +#endif + +#ifdef ALLCASES + +defm my_remaps : RemapAllTargetPseudoPointerOperands; + +#endif + + +#ifdef ERROR + +def MY_LOAD_STACK_GUARD_0 : TargetSpecializedStandardPseudoInstruction; + +// ERROR: :[[@LINE+1]]:5: error: multiple overrides of 'LOAD_STACK_GUARD' defined +def MY_LOAD_STACK_GUARD_1 : TargetSpecializedStandardPseudoInstruction; + +#endif + +def MY_MOV : TestInstruction { + let OutOperandList = (outs XRegs:$dst); + let InOperandList = (ins XRegs:$src); + let AsmString = "my_mov $dst, $src"; +} + + +def MyTargetISA : InstrInfo; +def MyTarget : Target { let InstructionSet = MyTargetISA; } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll index 5d80353fcadd8..0c91661d20ae7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll @@ -9,13 +9,8 @@ target triple = "aarch64-linux-gnu" define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i32 %x) #0 { ; CHECK-LABEL: define i64 @vector_loop_with_remaining_iterations( ; 
CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ITER_CHECK:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 17, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] -; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: -; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer @@ -39,58 +34,12 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3 ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP11]]) -; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] -; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] -; CHECK: [[VEC_EPILOG_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 17, [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 
[[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 17, [[TMP19]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[X]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = insertelement zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0 -; CHECK-NEXT: [[TMP24:%.*]] = call @llvm.abs.nxv2i32( [[BROADCAST_SPLAT2]], i1 false) -; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.stepvector.nxv2i64() -; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i64 [[BC_RESUME_VAL]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = mul [[TMP25]], splat (i64 1) -; CHECK-NEXT: [[INDUCTION:%.*]] = add [[BROADCAST_SPLAT4]], [[TMP26]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP17]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer -; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] -; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi [ [[TMP22]], %[[VEC_EPILOG_PH]] ], [ [[TMP35:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], [[VEC_IND]], i32 0, i64 3 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i8.nxv2p0( align 1 [[TMP28]], splat (i1 true), poison) -; CHECK-NEXT: [[TMP29:%.*]] = zext [[WIDE_MASKED_GATHER]] to -; CHECK-NEXT: [[TMP30:%.*]] = call @llvm.umin.nxv2i32( [[TMP24]], [[TMP29]]) -; CHECK-NEXT: [[TMP31:%.*]] 
= call @llvm.umin.nxv2i32( [[TMP24]], [[TMP30]]) -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX7]] -; CHECK-NEXT: store zeroinitializer, ptr [[TMP32]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = zext [[TMP31]] to -; CHECK-NEXT: [[TMP35]] = or [[VEC_PHI8]], [[TMP34]] -; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP17]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT6]] -; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64( [[TMP35]]) -; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]] -; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i64 [ [[TMP37]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: br label %[[SCALAR_PH:.*]] +; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL10]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX11]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 16, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[TMP13]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP_SRC_I_I:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[IV]], i32 0, i64 3 ; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC_I_I]], align 1 ; CHECK-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32 @@ -104,7 +53,7 @@ define i64 
@vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3 ; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 17 -; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ] ; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]] @@ -140,13 +89,8 @@ exit: define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr noalias %dst, i32 %x) #0 { ; CHECK-LABEL: define i64 @main_vector_loop_fixed_with_no_remaining_iterations( ; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[ITER_CHECK:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 17, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] -; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: -; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer @@ -167,61 +111,15 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no ; CHECK-NEXT: [[TMP11]] = or <16 x i64> [[VEC_PHI]], [[TMP10]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP12]], label 
%[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP11]]) -; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] -; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]] -; CHECK: [[VEC_EPILOG_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 17, [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 17, [[TMP19]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[X]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = insertelement zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0 -; CHECK-NEXT: [[TMP24:%.*]] = call @llvm.abs.nxv2i32( [[BROADCAST_SPLAT2]], i1 false) -; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.stepvector.nxv2i64() -; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i64 [[BC_RESUME_VAL]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer -; CHECK-NEXT: [[TMP38:%.*]] = mul [[TMP25]], splat (i64 1) -; CHECK-NEXT: [[INDUCTION:%.*]] = add 
[[BROADCAST_SPLAT4]], [[TMP38]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP17]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer -; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] -; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi [ [[TMP22]], %[[VEC_EPILOG_PH]] ], [ [[TMP35:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], [[VEC_IND]], i32 0, i64 3 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i8.nxv2p0( align 1 [[TMP28]], splat (i1 true), poison) -; CHECK-NEXT: [[TMP29:%.*]] = zext [[WIDE_MASKED_GATHER]] to -; CHECK-NEXT: [[TMP30:%.*]] = call @llvm.umin.nxv2i32( [[TMP24]], [[TMP29]]) -; CHECK-NEXT: [[TMP31:%.*]] = call @llvm.umin.nxv2i32( [[TMP24]], [[TMP30]]) -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX7]] -; CHECK-NEXT: store zeroinitializer, ptr [[TMP32]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = zext [[TMP31]] to -; CHECK-NEXT: [[TMP35]] = or [[VEC_PHI8]], [[TMP34]] -; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP17]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT6]] -; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64( [[TMP35]]) -; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]] -; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = 
phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i64 [ [[TMP37]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: br label %[[SCALAR_PH:.*]] +; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL10]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX11]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 16, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[TMP13]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP_SRC_I_I:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[IV]], i32 0, i64 3 ; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC_I_I]], align 1 ; CHECK-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32 @@ -235,7 +133,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no ; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 17 -; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ] ; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]] @@ -308,7 +206,7 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks( ; CHECK-NEXT: store i64 0, ptr [[L]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4 -; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH:.*]] ; CHECK: [[SCALAR_PH]]: @@ -332,7 +230,7 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks( ; CHECK-NEXT: store i64 0, ptr [[L]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 2 ; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV1]], 14 -; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll index 50807df51c99e..1596b60f48567 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll @@ -9,15 +9,15 @@ define void @cost_store_i8(ptr %dst) #0 { ; DEFAULT-LABEL: define void @cost_store_i8( ; DEFAULT-SAME: ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] { ; DEFAULT-NEXT: iter.check: -; DEFAULT-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP10]], 3 -; DEFAULT-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 101, [[TMP13]] -; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] -; DEFAULT: vector.main.loop.iter.check: ; DEFAULT-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 5 +; DEFAULT-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 ; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 101, [[TMP1]] -; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label 
[[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; DEFAULT: vector.main.loop.iter.check: +; DEFAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP9]], 5 +; DEFAULT-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 101, [[TMP10]] +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; DEFAULT: vector.ph: ; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; DEFAULT-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32 @@ -39,22 +39,22 @@ define void @cost_store_i8(ptr %dst) #0 { ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 101, [[N_VEC]] ; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; DEFAULT: vec.epilog.iter.check: -; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP13]] +; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP1]] ; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; DEFAULT: vec.epilog.ph: ; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; DEFAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 8 -; DEFAULT-NEXT: [[N_MOD_VF2:%.*]] = urem i64 101, [[TMP12]] +; DEFAULT-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4 +; DEFAULT-NEXT: [[N_MOD_VF2:%.*]] = urem i64 101, [[TMP15]] ; DEFAULT-NEXT: [[N_VEC3:%.*]] = sub i64 101, [[N_MOD_VF2]] ; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; DEFAULT: vec.epilog.vector.body: -; DEFAULT-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX1]] -; DEFAULT-NEXT: 
store zeroinitializer, ptr [[TMP9]], align 1 -; DEFAULT-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP12]] -; DEFAULT-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC3]] -; DEFAULT-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; DEFAULT-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX5]] +; DEFAULT-NEXT: store zeroinitializer, ptr [[TMP19]], align 1 +; DEFAULT-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], [[TMP15]] +; DEFAULT-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]] +; DEFAULT-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; DEFAULT: vec.epilog.middle.block: ; DEFAULT-NEXT: [[CMP_N6:%.*]] = icmp eq i64 101, [[N_VEC3]] ; DEFAULT-NEXT: br i1 [[CMP_N6]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -146,12 +146,12 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 1, !alias.scope [[META9:![0-9]+]], !noalias [[META6]] ; DEFAULT-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP5]], align 1, !alias.scope [[META9]], !noalias [[META6]] ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; DEFAULT-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992 -; DEFAULT-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992 +; DEFAULT-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; DEFAULT: middle.block: ; DEFAULT-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; DEFAULT: vec.epilog.iter.check: -; DEFAULT-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label 
[[VEC_EPILOG_PH]], !prof [[PROF3]] +; DEFAULT-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF12:![0-9]+]] ; DEFAULT: vec.epilog.ph: ; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <8 x i16> poison, i16 [[X]], i64 0 @@ -165,11 +165,11 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; DEFAULT: vec.epilog.vector.body: ; DEFAULT-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX6]] -; DEFAULT-NEXT: store <8 x i8> [[TMP10]], ptr [[TMP11]], align 1, !alias.scope [[META9]], !noalias [[META6]] +; DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX6]] +; DEFAULT-NEXT: store <8 x i8> [[TMP10]], ptr [[TMP13]], align 1, !alias.scope [[META9]], !noalias [[META6]] ; DEFAULT-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 8 ; DEFAULT-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT9]], 1000 -; DEFAULT-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; DEFAULT-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; DEFAULT: vec.epilog.middle.block: ; DEFAULT-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; DEFAULT: vec.epilog.scalar.ph: @@ -185,7 +185,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-NEXT: store i8 [[TRUNC]], ptr [[GEP]], align 1 ; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; DEFAULT-NEXT: br i1 
[[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; DEFAULT: exit: ; DEFAULT-NEXT: ret void ; @@ -268,7 +268,7 @@ attributes #1 = { vscale_range(1,16) "target-features"="+sve" } ; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; DEFAULT: [[PROF3]] = !{!"branch_weights", i32 8, i32 24} +; DEFAULT: [[PROF3]] = !{!"branch_weights", i32 4, i32 28} ; DEFAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; DEFAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} ; DEFAULT: [[META6]] = !{[[META7:![0-9]+]]} @@ -277,8 +277,9 @@ attributes #1 = { vscale_range(1,16) "target-features"="+sve" } ; DEFAULT: [[META9]] = !{[[META10:![0-9]+]]} ; DEFAULT: [[META10]] = distinct !{[[META10]], [[META8]]} ; DEFAULT: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]} -; DEFAULT: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} -; DEFAULT: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]} +; DEFAULT: [[PROF12]] = !{!"branch_weights", i32 8, i32 24} +; DEFAULT: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]]} +; DEFAULT: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]]} ;. 
; PRED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index fe9eb89049bc3..2bf285e94089f 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -347,19 +347,13 @@ define void @recipe_debug_loc_location(ptr nocapture %src) !dbg !5 { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]> -; CHECK-NEXT: CLONE ir<%isd> = getelementptr inbounds ir<%src>, vp<[[STEPS]]> -; CHECK-NOT: !dbg -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%isd> -; CHECK-NOT: !dbg -; CHECK-NEXT: WIDEN ir<%lsd> = load vp<[[VEC_PTR]]> -; CHECK-NOT: !dbg -; CHECK-NEXT: WIDEN ir<%psd> = add nuw nsw ir<%lsd>, ir<23> -; CHECK-NOT: !dbg -; CHECK-NEXT: WIDEN ir<%cmp1> = icmp slt ir<%lsd>, ir<100> -; CHECK-NOT: !dbg +; CHECK-NEXT: CLONE ir<%isd> = getelementptr inbounds ir<%src>, vp<[[STEPS]]>, !dbg /tmp/s.c:5:3 +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%isd>, !dbg /tmp/s.c:6:3 +; CHECK-NEXT: WIDEN ir<%lsd> = load vp<[[VEC_PTR]]>, !dbg /tmp/s.c:6:3 +; CHECK-NEXT: WIDEN ir<%psd> = add nuw nsw ir<%lsd>, ir<23>, !dbg /tmp/s.c:7:3 +; CHECK-NEXT: WIDEN ir<%cmp1> = icmp slt ir<%lsd>, ir<100>, !dbg /tmp/s.c:8:3 ; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%cmp1>, !dbg /tmp/s.c:9:3 -; CHECK-NEXT: WIDEN ir<%cmp2> = icmp sge ir<%lsd>, ir<200> -; CHECK-NOT: !dbg +; CHECK-NEXT: WIDEN ir<%cmp2> = icmp sge ir<%lsd>, ir<200>, !dbg /tmp/s.c:10:3 ; CHECK-NEXT: EMIT vp<[[SEL1:%.+]]> = logical-and vp<[[NOT1]]>, ir<%cmp2>, !dbg /tmp/s.c:11:3 ; CHECK-NEXT: EMIT vp<[[OR1:%.+]]> = or vp<[[SEL1]]>, ir<%cmp1> ; CHECK-NEXT: Successor(s): pred.sdiv @@ -370,24 +364,19 @@ define void 
@recipe_debug_loc_location(ptr nocapture %src) !dbg !5 { ; CHECK-NEXT: Successor(s): pred.sdiv.if, pred.sdiv.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.sdiv.if: -; CHECK-NEXT: REPLICATE ir<%sd1> = sdiv ir<%psd>, ir<%lsd> (S->V) -; CHECK-NOT: !dbg +; CHECK-NEXT: REPLICATE ir<%sd1> = sdiv ir<%psd>, ir<%lsd> (S->V), !dbg /tmp/s.c:12:3 ; CHECK-NEXT: Successor(s): pred.sdiv.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.sdiv.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[PHI:%.+]]> = ir<%sd1> -; CHECK-NOT: !dbg +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[PHI:%.+]]> = ir<%sd1>, !dbg /tmp/s.c:12:3 ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): if.then.0 ; CHECK-EMPTY: ; CHECK-NEXT: if.then.0: -; CHECK-NEXT: BLEND ir<%ysd.0> = ir<%psd> vp<[[PHI]]>/vp<[[OR1]]> -; CHECK-NOT: !dbg -; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%isd> -; CHECK-NOT: !dbg -; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%ysd.0> -; CHECK-NOT: !dbg +; CHECK-NEXT: BLEND ir<%ysd.0> = ir<%psd> vp<[[PHI]]>/vp<[[OR1]]>, !dbg /tmp/s.c:14:3 +; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%isd>, !dbg /tmp/s.c:15:3 +; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%ysd.0>, !dbg /tmp/s.c:15:3 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors diff --git a/llvm/test/Transforms/LowerTypeTests/function-weak.ll b/llvm/test/Transforms/LowerTypeTests/function-weak.ll index 4ea03b6c2c1fa..dbbe8fa4a0a9a 100644 --- a/llvm/test/Transforms/LowerTypeTests/function-weak.ll +++ b/llvm/test/Transforms/LowerTypeTests/function-weak.ll @@ -32,10 +32,10 @@ target triple = "x86_64-unknown-linux-gnu" declare !type !0 extern_weak void @f() ; CHECK: define zeroext i1 @check_f() -define zeroext i1 @check_f() { +define zeroext i1 @check_f() !prof !{!"function_entry_count", i32 10} { entry: ; CHECK: [[CMP:%.*]] = icmp ne ptr @f, null -; CHECK: [[SEL:%.*]] = select 
i1 [[CMP]], ptr @[[JT:.*]], ptr null +; CHECK: [[SEL:%.*]] = select i1 [[CMP]], ptr @[[JT:.*]], ptr null, !prof ![[SELPROF:[0-9]+]] ; CHECK: [[PTI:%.*]] = ptrtoint ptr [[SEL]] to i1 ; CHECK: ret i1 [[PTI]] ret i1 ptrtoint (ptr @f to i1) @@ -165,3 +165,4 @@ define i1 @foo(ptr %p) { ; CHECK-NEXT: } !0 = !{i32 0, !"typeid1"} +; CHECK: ![[SELPROF]] = !{!"unknown", !"lowertypetests"} \ No newline at end of file diff --git a/llvm/tools/lli/lli.cpp b/llvm/tools/lli/lli.cpp index f1cf87fc88cee..847ee3263c02f 100644 --- a/llvm/tools/lli/lli.cpp +++ b/llvm/tools/lli/lli.cpp @@ -283,7 +283,6 @@ static ExitOnError ExitOnErr; LLVM_ATTRIBUTE_USED static void linkComponents() { errs() << (void *)&llvm_orc_registerEHFrameSectionAllocAction << (void *)&llvm_orc_deregisterEHFrameSectionAllocAction - << (void *)&llvm_orc_registerJITLoaderGDBWrapper << (void *)&llvm_orc_registerJITLoaderGDBAllocAction; } diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp index 8d33ae1edcaaa..2cffca27623e9 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp @@ -44,7 +44,6 @@ ExitOnError ExitOnErr; LLVM_ATTRIBUTE_USED void linkComponents() { errs() << (void *)&llvm_orc_registerEHFrameSectionAllocAction << (void *)&llvm_orc_deregisterEHFrameSectionAllocAction - << (void *)&llvm_orc_registerJITLoaderGDBWrapper << (void *)&llvm_orc_registerJITLoaderGDBAllocAction; } diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index cf5200a73e5cc..b8de817ec16c8 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -24,7 +24,6 @@ #include "llvm/ExecutionEngine/Orc/Debugging/VTuneSupportPlugin.h" #include "llvm/ExecutionEngine/Orc/EHFrameRegistrationPlugin.h" #include 
"llvm/ExecutionEngine/Orc/ELFNixPlatform.h" -#include "llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h" #include "llvm/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/ExecutionEngine/Orc/IndirectionUtils.h" @@ -347,7 +346,6 @@ static LLVM_ATTRIBUTE_USED void linkComponents() { errs() << "Linking in runtime functions\n" << (void *)&llvm_orc_registerEHFrameSectionAllocAction << '\n' << (void *)&llvm_orc_deregisterEHFrameSectionAllocAction << '\n' - << (void *)&llvm_orc_registerJITLoaderGDBWrapper << '\n' << (void *)&llvm_orc_registerJITLoaderGDBAllocAction << '\n' << (void *)&llvm_orc_registerJITLoaderPerfStart << '\n' << (void *)&llvm_orc_registerJITLoaderPerfEnd << '\n' diff --git a/llvm/unittests/ExecutionEngine/Orc/OrcCAPITest.cpp b/llvm/unittests/ExecutionEngine/Orc/OrcCAPITest.cpp index d3fcb948fafb8..0ddbb33c4bac5 100644 --- a/llvm/unittests/ExecutionEngine/Orc/OrcCAPITest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/OrcCAPITest.cpp @@ -547,7 +547,6 @@ TEST_F(OrcCAPITestBase, DISABLED_EnableDebugSupport) { #else static LLVM_ATTRIBUTE_USED void linkComponents() { errs() << "Linking in runtime functions\n" - << (void *)&llvm_orc_registerJITLoaderGDBWrapper << '\n' << (void *)&llvm_orc_registerJITLoaderGDBAllocAction << '\n'; } TEST_F(OrcCAPITestBase, EnableDebugSupport) { diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp index e5025784d304d..35ec495b93ba2 100644 --- a/llvm/utils/TableGen/CodeGenMapTable.cpp +++ b/llvm/utils/TableGen/CodeGenMapTable.cpp @@ -80,6 +80,7 @@ #include "TableGenBackends.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/TableGen/CodeGenHelpers.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" @@ -549,9 +550,8 @@ void llvm::EmitMapTable(const RecordKeeper &Records, raw_ostream &OS) { if (InstrMapVec.empty()) return; - OS << "#ifdef GET_INSTRMAP_INFO\n"; - OS << 
"#undef GET_INSTRMAP_INFO\n"; - OS << "namespace llvm::" << NameSpace << " {\n\n"; + IfDefEmitter IfDef(OS, "GET_INSTRMAP_INFO"); + NamespaceEmitter NS(OS, ("llvm::" + NameSpace).str()); // Emit coulumn field names and their values as enums. emitEnums(OS, Records); @@ -574,6 +574,4 @@ void llvm::EmitMapTable(const RecordKeeper &Records, raw_ostream &OS) { // Emit map tables and the functions to query them. IMap.emitTablesWithFunc(OS); } - OS << "} // end namespace llvm::" << NameSpace << '\n'; - OS << "#endif // GET_INSTRMAP_INFO\n\n"; } diff --git a/llvm/utils/TableGen/Common/CodeGenTarget.cpp b/llvm/utils/TableGen/Common/CodeGenTarget.cpp index c0daac127f71a..e080ca0aa0b31 100644 --- a/llvm/utils/TableGen/Common/CodeGenTarget.cpp +++ b/llvm/utils/TableGen/Common/CodeGenTarget.cpp @@ -283,15 +283,25 @@ void CodeGenTarget::ComputeInstrsByEnum() const { assert(EndOfPredefines == getNumFixedInstructions() && "Missing generic opcode"); + [[maybe_unused]] unsigned SkippedInsts = 0; + for (const auto &[_, CGIUp] : InstMap) { const CodeGenInstruction *CGI = CGIUp.get(); if (CGI->Namespace != "TargetOpcode") { + + if (CGI->TheDef->isSubClassOf( + "TargetSpecializedStandardPseudoInstruction")) { + ++SkippedInsts; + continue; + } + InstrsByEnum.push_back(CGI); NumPseudoInstructions += CGI->TheDef->getValueAsBit("isPseudo"); } } - assert(InstrsByEnum.size() == InstMap.size() && "Missing predefined instr"); + assert(InstrsByEnum.size() + SkippedInsts == InstMap.size() && + "Missing predefined instr"); // All of the instructions are now in random order based on the map iteration. 
llvm::sort( diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index 32994c12aa98b..d46c9d811753a 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -72,6 +72,13 @@ class InstrInfoEmitter { using OperandInfoListTy = std::vector; using OperandInfoMapTy = std::map; + DenseMap + TargetSpecializedPseudoInsts; + + /// Compute mapping of opcodes which should have their definitions overridden + /// by a target version. + void buildTargetSpecializedPseudoInstsMap(); + /// Generate member functions in the target-specific GenInstrInfo class. /// /// This method is used to custom expand TIIPredicate definitions. @@ -216,6 +223,10 @@ InstrInfoEmitter::CollectOperandInfo(OperandInfoListTy &OperandInfoList, const CodeGenTarget &Target = CDP.getTargetInfo(); unsigned Offset = 0; for (const CodeGenInstruction *Inst : Target.getInstructions()) { + auto OverrideEntry = TargetSpecializedPseudoInsts.find(Inst); + if (OverrideEntry != TargetSpecializedPseudoInsts.end()) + Inst = OverrideEntry->second; + OperandInfoTy OperandInfo = GetOperandInfo(*Inst); if (OperandInfoMap.try_emplace(OperandInfo, Offset).second) { OperandInfoList.push_back(OperandInfo); @@ -859,6 +870,25 @@ void InstrInfoEmitter::emitTIIHelperMethods(raw_ostream &OS, } } +void InstrInfoEmitter::buildTargetSpecializedPseudoInstsMap() { + ArrayRef SpecializedInsts = Records.getAllDerivedDefinitions( + "TargetSpecializedStandardPseudoInstruction"); + const CodeGenTarget &Target = CDP.getTargetInfo(); + + for (const Record *SpecializedRec : SpecializedInsts) { + const CodeGenInstruction &SpecializedInst = + Target.getInstruction(SpecializedRec); + const Record *BaseInstRec = SpecializedRec->getValueAsDef("Instruction"); + + const CodeGenInstruction &BaseInst = Target.getInstruction(BaseInstRec); + + if (!TargetSpecializedPseudoInsts.insert({&BaseInst, &SpecializedInst}) + .second) + PrintFatalError(SpecializedRec, "multiple 
overrides of '" + + BaseInst.getName() + "' defined"); + } +} + //===----------------------------------------------------------------------===// // Main Output. //===----------------------------------------------------------------------===// @@ -881,6 +911,8 @@ void InstrInfoEmitter::run(raw_ostream &OS) { // Collect all of the operand info records. Timer.startTimer("Collect operand info"); + buildTargetSpecializedPseudoInstsMap(); + OperandInfoListTy OperandInfoList; OperandInfoMapTy OperandInfoMap; unsigned OperandInfoSize = @@ -963,6 +995,11 @@ void InstrInfoEmitter::run(raw_ostream &OS) { for (const CodeGenInstruction *Inst : reverse(NumberedInstructions)) { // Keep a list of the instruction names. InstrNames.add(Inst->getName()); + + auto OverrideEntry = TargetSpecializedPseudoInsts.find(Inst); + if (OverrideEntry != TargetSpecializedPseudoInsts.end()) + Inst = OverrideEntry->second; + // Emit the record into the table. emitRecord(*Inst, --Num, InstrInfo, EmittedLists, OperandInfoMap, OS); } diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn index ab3b717eed69d..106db83491449 100644 --- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn @@ -25,7 +25,6 @@ static_library("Orc") { "DebugUtils.cpp", "EHFrameRegistrationPlugin.cpp", "ELFNixPlatform.cpp", - "EPCDebugObjectRegistrar.cpp", "EPCDynamicLibrarySearchGenerator.cpp", "EPCGenericDylibManager.cpp", "EPCGenericJITLinkMemoryManager.cpp", diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index a8cf2b1625f8f..835025d1e319e 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -595,6 +595,7 @@ Transforms/PGOProfile/memop_profile_funclet_wasm.ll Transforms/PGOProfile/X86/macho.ll Transforms/PhaseOrdering/AArch64/constraint-elimination-placement.ll 
Transforms/PhaseOrdering/AArch64/globals-aa-required-for-vectorization.ll +Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll Transforms/PhaseOrdering/X86/merge-functions2.ll Transforms/PhaseOrdering/X86/merge-functions3.ll Transforms/PhaseOrdering/X86/merge-functions.ll diff --git a/mlir/docs/Dialects/NVVM/_index.md b/mlir/docs/Dialects/NVVMDialect.md similarity index 93% rename from mlir/docs/Dialects/NVVM/_index.md rename to mlir/docs/Dialects/NVVMDialect.md index f4832f76f86ad..b50980258593d 100644 --- a/mlir/docs/Dialects/NVVM/_index.md +++ b/mlir/docs/Dialects/NVVMDialect.md @@ -8,6 +8,8 @@ NVPTX toolchain. While a NVVM op usually maps to a single LLVM IR intrinsic, the NVVM dialect uses type polymorphism and other attributes so that a single NVVM op can map to different LLVM intrinsics. +[TOC] + ## Scope and Capabilities The dialect covers core GPU features such as thread/block builtins, barriers @@ -81,4 +83,18 @@ introduce high-level ops that expand into multiple unrelated NVVM intrinsics or that lower to no intrinsic at all. Such abstractions belong in higher-level dialects (e.g., `nvgpu`, `gpu`, or project-specific dialects). The design intent is a thin, predictable, low-level surface with near-mechanical lowering -to NVVM/LLVM IR. \ No newline at end of file +to NVVM/LLVM IR. + + +## Operations + +All operations in the NVIDIA's instruction set have a custom form in MLIR. The mnemonic +of an operation is that used in LLVM IR prefixed with "`nvvm.`". 
+ +[include "Dialects/NVVMOps.md"] + + +## Op Interfaces + +[include "Dialects/NVVMRequiresSMTraits.md"] + diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 8d5bc7333d47f..524b9f820f290 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -4598,6 +4598,551 @@ def NVVM_ClusterLaunchControlQueryCancelOp }]; } +//===----------------------------------------------------------------------===// +// NVVM tcgen05.mma Ops +//===----------------------------------------------------------------------===// + +def Tcgen05MMAKindF16 : I32EnumAttrCase<"F16", 0, "f16">; +def Tcgen05MMAKindTF32 : I32EnumAttrCase<"TF32", 1, "tf32">; +def Tcgen05MMAKindF8F6F4 : I32EnumAttrCase<"F8F6F4", 2, "f8f6f4">; +def Tcgen05MMAKindINT8 : I32EnumAttrCase<"I8", 3, "i8">; + +def Tcgen05MMAKind : I32EnumAttr< + "Tcgen05MMAKind", + "tcgen05 MMA Supported Types", + [Tcgen05MMAKindF8F6F4, Tcgen05MMAKindINT8, Tcgen05MMAKindF16, + Tcgen05MMAKindTF32]> { + let cppNamespace = "::mlir::NVVM"; + let genSpecializedAttr = 0; +} + +def Tcgen05MMAKindAttr : EnumAttr { + let description = [{ + The Tcgen05MMAKind attribute describes the allowed set of types for matrix A and B in the tcgen05.mma.{sp} Op. 
The following are supported types for each kind: + + ``` + +-------------+--------------------------------------------+ + | Matrix Kind | supported types for A / B | + +-------------+--------------------------------------------+ + | f16 | f16, bf16 | + | tf32 | tf32 | + | f8f6f4 | e4m3, e5m2, e2m3, e3m2, e2m1 | + | i8 | unsigned 8b, signed 8b | + +-------------+--------------------------------------------+ + ``` + }]; + let assemblyFormat = "`<` $value `>`"; +} + +def Tcgen05MMACollectorOpDiscard : I32EnumAttrCase<"DISCARD", 0, "discard">; +def Tcgen05MMACollectorOpLastUse : I32EnumAttrCase<"LASTUSE", 1, "lastuse">; +def Tcgen05MMACollectorOpFill : I32EnumAttrCase<"FILL", 2, "fill">; +def Tcgen05MMACollectorOpUse : I32EnumAttrCase<"USE", 3, "use">; + +def Tcgen05MMACollectorOp : I32EnumAttr< + "Tcgen05MMACollectorOp", + "tcgen05.mma Collector Buffer Operation", + [Tcgen05MMACollectorOpDiscard, + Tcgen05MMACollectorOpLastUse, + Tcgen05MMACollectorOpFill, + Tcgen05MMACollectorOpUse]> { + let cppNamespace = "::mlir::NVVM"; + let genSpecializedAttr = 0; +} + +def Tcgen05MMACollectorOpAttr : EnumAttr { + let description = [{ + Tcgen05MMACollectorOp attribute specifies the collector buffer operations. + The following are the supported operations: + * discard : Release buffer after use (default) + * lastuse : Mark buffer for last use + * fill : Fill buffer + * use : Use buffer without modification + }]; + let assemblyFormat = "`<` $value `>`"; +} + +def NVVM_Tcgen05MMAOp : NVVM_Op<"tcgen05.mma", + [AttrSizedOperandSegments, + NVVMRequiresSMa<[100, 110]>]> { + let summary = "Performs MMA operation on 5th-gen tensor cores"; + + let description = [{ + The `tcgen05.mma` operation is an asynchronous tensor core instruction that + performs matrix multiplication, accumulation in a single fused operation. It + targets 5th-generation tensor cores, providing developers with fine-grained + control over execution and scheduling. 
+ + ``` + D = A * B + (D * 2^ -scaleInputD) // if `scaleInputD` is provided + D = A * B // if `enableInputD` is false + D = A * B + D // otherwise + ``` + + where: + - A is an `M x K` matrix in tensor memory or described using shared memory descriptor + - B is a `K x N` matrix described using shared memory descriptor + - D is an `M x N` accumulator matrix in tensor memory + + The `shared memory descriptor` can be generated using `tcgen05.mma_smem_desc` Op + + - idesc is a 32-bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor) + + Optional Operands: + - `scaleInputD` is an Immediate value operand used for scaling D matrix by 2 ^ (-scaleInputD). The valid range is [0, 15] + + - `disableOutputLane` is a vector mask for selective output + * vector<4 x i32> when ctaGroup is CTA_1 + * vector<8 x i32> when ctaGroup is CTA_2 + + Required Attributes: + - `kind` is a Tcgen05MMAKind attribute + + - `ctaGroup` specifies CTA group configuration + * cta_1: MMA will be performed on the current thread's CTA + * cta_2: MMA will be performed on the current thread and it's peer CTA + + Default Attributes: + - collectorOp is a Tcgen05MMACollectorOp attribute with matrix A as the collector buffer + + - `aShift` shifts the rows of the A matrix down by one row and can only be + applied if A is in tensor memory + + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma) + }]; + + let arguments = (ins + Tcgen05MMAKindAttr:$kind, + CTAGroupKindAttr:$ctaGroup, + DefaultValuedAttr:$collectorOp, + UnitAttr:$aShift, + LLVM_PointerTensor:$matrixD, + AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA, + I64:$matrixB, + I32:$idesc, + I1:$enableInputD, + Optional:$scaleInputD, + Optional>:$disableOutputLane + ); + + let assemblyFormat = [{ + $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD (`scale` `=` $scaleInputD^)? 
+ (`mask` `=` $disableOutputLane^)? attr-dict `:` `(` type(operands) `)` + }]; + + let hasVerifier = true; + + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder); + }]; + + let llvmBuilder = [{ + auto [ID, args] = NVVM::Tcgen05MMAOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, ID, args); + }]; +} + +def NVVM_Tcgen05MMASparseOp : NVVM_Op<"tcgen05.mma.sp", + [AttrSizedOperandSegments, + NVVMRequiresSMa<[100, 110]>]> { + let summary = "Performs MMA operation with sparse A matrix on 5th-gen tensor cores"; + + let description = [{ + The `tcgen05.mma.sp` operation is an asynchronous tensor core instruction + that performs matrix multiplication, accumulation with sparse `A` matrix in + a single fused operation. It targets 5th-generation tensor cores, providing + developers with fine-grained control over execution and scheduling. + + ``` + D = A * B + (D * 2^ -scaleInputD) // if `scaleInputD` is provided + D = A * B // if `enableInputD` is false + D = A * B + D // otherwise + ``` + + where: + - A is an `M x (K / 2)` matrix in tensor memory or described using shared memory descriptor + - B is a `K x N` matrix described using shared memory descriptor + - D is an `M x N` accumulator matrix in tensor memory + - sparseMetadata located in tensor memory specifies the mapping of the `K / 2` + non-zero elements to the K elements before performing the MMA operation + + Other attributes and operands are similar to that of tcgen05.mma Op + + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-sp) + }]; + + let arguments = (ins + Tcgen05MMAKindAttr:$kind, + CTAGroupKindAttr:$ctaGroup, + DefaultValuedAttr:$collectorOp, + UnitAttr:$aShift, + LLVM_PointerTensor:$matrixD, + AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA, + I64:$matrixB, + I32:$idesc, + 
I1:$enableInputD, + LLVM_PointerTensor:$sparseMetadata, + Optional:$scaleInputD, + Optional>:$disableOutputLane + ); + + let assemblyFormat = [{ + $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD `,` $sparseMetadata (`scale` `=` $scaleInputD^)? (`mask` `=` $disableOutputLane^)? attr-dict `:` `(` type(operands) `)` + }]; + + let hasVerifier = true; + + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder); + }]; + + let llvmBuilder = [{ + auto [ID, args] = NVVM::Tcgen05MMASparseOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, ID, args); + }]; +} + +def Tcgen05MMAKindMXF8F6F4 : I32EnumAttrCase<"MXF8F6F4", 0, "mxf8f6f4">; +def Tcgen05MMAKindMXF4 : I32EnumAttrCase<"MXF4", 1, "mxf4">; +def Tcgen05MMAKindMXF4NVF4 : I32EnumAttrCase<"MXF4NVF4", 2, "mxf4nvf4">; + +def Tcgen05MMABlockScaleKind : I32EnumAttr< + "Tcgen05MMABlockScaleKind", + "tcgen05.mma.block_scale supported types", + [Tcgen05MMAKindMXF8F6F4, Tcgen05MMAKindMXF4, Tcgen05MMAKindMXF4NVF4]> { + let cppNamespace = "::mlir::NVVM"; + let genSpecializedAttr = 0; +} + +def Tcgen05MMABlockScaleKindAttr : EnumAttr { + let description = [{ + The Tcgen05MMABlockScaleKind attribute describes the allowed set of types for matrix A and B in the tcgen05.mma.{sp}.block_scale Op. 
The following are supported types for each kind: + + ``` + +--------------+-------------------------------------------+ + | Matrix Kind | supported types for A / B | + +--------------+-------------------------------------------+ + | mxf8f6f4 | e4m3, e5m3, e2m3, e3m2, e2m1 | + | mxf4 | e2m1 | + | mxf4nvf4 | e2m1 | + +--------------+-------------------------------------------+ + ``` + }]; + let assemblyFormat = "`<` $value `>`"; +} + +def Tcgen05MMABlockScaleDefault : I32EnumAttrCase<"DEFAULT", 0, "default">; +def Tcgen05MMABlockScaleBlock16 : I32EnumAttrCase<"BLOCK16", 1, "block16">; +def Tcgen05MMABlockScaleBlock32 : I32EnumAttrCase<"BLOCK32", 2, "block32">; + +def Tcgen05MMABlockScale + : I32EnumAttr<"Tcgen05MMABlockScale", + "tcgen05.mma block scale attribute", + [Tcgen05MMABlockScaleDefault, Tcgen05MMABlockScaleBlock16, + Tcgen05MMABlockScaleBlock32]> { + let cppNamespace = "::mlir::NVVM"; + let genSpecializedAttr = 0; +} + +def Tcgen05MMABlockScaleAttr : EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + +def NVVM_Tcgen05MMABlockScaleOp : NVVM_Op<"tcgen05.mma.block_scale", + [NVVMRequiresSMa<[100, 110]>]> { + let summary = "Performs block scaled MMA operation on 5th-gen tensor cores"; + + let description = [{ + The `tcgen05.mma.block_scale` operation is an asynchronous tensor core instruction + that performs matrix multiplication, accumulation with block scaling in a + single fused operation. It targets 5th-generation tensor cores, providing + developers with fine-grained control over execution and scheduling. 
+ + ``` + D = (A * scale_a) * (B * scale_b)` // if `enableInputD` is false + D = (A * scale_a) * (B * scale_b) + D` + ``` + + where: + - A is an M x (K / 2) matrix in tensor memory or described using shared memory descriptor + - B is a K x N matrix described using shared memory descriptor + - D is an M x N accumulator matrix in tensor memory + - `scale_a` and `scale_b` are matrices in tensor memory used to scale `A` and `B` respectively + + The `shared memory descriptor` can be generated using `tcgen05.mma_smem_desc` Op + + - `idesc` is a 32 bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor) + + Required Attributes: + - `kind` is a Tcgen05MMABlockScaleKind attribute + + - `ctaGroup` specifies CTA group configuration + * cta_1: MMA will be performed on the current thread's CTA + * cta_2: MMA will be performed on the current thread and it's peer CTA + + Default Attributes: + - collectorOp is a Tcgen05MMACollectorOp attribute with matrix A as the collector buffer + + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma) + }]; + + let arguments = (ins + Tcgen05MMABlockScaleKindAttr:$kind, + CTAGroupKindAttr:$ctaGroup, + DefaultValuedAttr:$blockScale, + DefaultValuedAttr:$collectorOp, + LLVM_PointerTensor:$matrixD, + AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA, + I64:$matrixB, + I32:$idesc, I1:$enableInputD, + LLVM_PointerTensor:$scaleA, + LLVM_PointerTensor:$scaleB + ); + + let assemblyFormat = [{ + $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD `,` $scaleA `,` $scaleB + attr-dict `:` `(` type(operands) `)` + }]; + + let hasVerifier = true; + + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder); + }]; + + let llvmBuilder = [{ + auto [ID, args] = 
NVVM::Tcgen05MMABlockScaleOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, ID, args); + }]; +} + +def NVVM_Tcgen05MMASparseBlockScaleOp : NVVM_Op<"tcgen05.mma.sp.block_scale", + [NVVMRequiresSMa<[100, 110]>]> { + let summary = "Performs block scaled MMA operation with sparse A matrix on 5th-gen tensor cores"; + + let description = [{ + The `tcgen05.mma.sp.block_scale` operation is an asynchronous tensor core + instruction that performs matrix multiplication, accumulation with block + scaling, and sparse `A` matrix in a single fused operation. It targets + 5th-generation tensor cores, providing developers with fine-grained control + over execution, and scheduling. + + ``` + D = (A * scale_a) * (B * scale_b) // if `enableInputD` is specified + D = (A * scale_a) * (B * scale_b) + D // otherwise + ``` + + where: + - A is an M x (K / 2) matrix in tensor memory or described using shared memory descriptor + - B is a K x N matrix described using shared memory descriptor + - D is an M x N accumulator matrix in tensor memory + - `scale_a` and `scale_b` are matrices in tensor memory used to scale `A` and `B` respectively + + Other attributes and operands are similar to that of tcgen05.mma.block_scale Op + + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-sp) + }]; + + let arguments = (ins + Tcgen05MMABlockScaleKindAttr:$kind, + CTAGroupKindAttr:$ctaGroup, + DefaultValuedAttr:$blockScale, + DefaultValuedAttr:$collectorOp, + LLVM_PointerTensor:$matrixD, + AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA, + I64:$matrixB, + I32:$idesc, + I1:$enableInputD, + LLVM_PointerTensor:$sparseMetadata, + LLVM_PointerTensor:$scaleA, + LLVM_PointerTensor:$scaleB + ); + + let assemblyFormat = [{ + $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD `,` $sparseMetadata `,` $scaleA `,` $scaleB + attr-dict `:` `(` type(operands) `)` + }]; + + let hasVerifier = true; + + 
let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder); + }]; + + let llvmBuilder = [{ + auto [ID, args] = NVVM::Tcgen05MMASparseBlockScaleOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, ID, args); + }]; +} + +def Tcgen05MMACollectorBBuffer0 : I32EnumAttrCase<"B0", 0, "b0">; +def Tcgen05MMACollectorBBuffer1 : I32EnumAttrCase<"B1", 1, "b1">; +def Tcgen05MMACollectorBBuffer2 : I32EnumAttrCase<"B2", 2, "b2">; +def Tcgen05MMACollectorBBuffer3 : I32EnumAttrCase<"B3", 3, "b3">; + +def Tcgen05MMACollectorBBuffer : I32EnumAttr< + "Tcgen05MMACollectorBBuffer", + "tcgen05 MMA Collector Buffer B Attribute", + [Tcgen05MMACollectorBBuffer0, Tcgen05MMACollectorBBuffer1, Tcgen05MMACollectorBBuffer2, + Tcgen05MMACollectorBBuffer3]> { + let cppNamespace = "::mlir::NVVM"; + let genSpecializedAttr = 0; +} + +def Tcgen05MMACollectorBBufferAttr : EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + +def NVVM_Tcgen05MMAWsOp : NVVM_Op<"tcgen05.mma.ws", + [NVVMRequiresSMa<[100, 110]>]> { + let summary = "Performs weight stationary convolution MMA operation on 5th-gen tensor cores"; + + let description = [{ + The `tcgen05.mma.ws` operation is an asynchronous tensor core instruction + that performs weight stationary convolution matrix multiplication, accumulation + in a single fused operation. It targets 5th-generation tensor cores, providing + developers with fine-grained control over execution, and scheduling. 
+ + ``` + D = A * B // if `enableInputD` is false + D = A * B + D // otherwise + ``` + + where: + - A is an `M x K` matrix in tensor memory or described using shared memory descriptor + - B is a `K x N` matrix described using shared memory descriptor + - D is an `M x N` accumulator matrix in tensor memory + + The `shared memory descriptor` can be generated using `tcgen05.mma_smem_desc` Op + + - idesc is a 32-bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor) + + Optional Operands: + - zeroColMask is a 64 bit value representing the [Zero-column mask descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-zero-column-mask-descriptor) + + Required Attributes: + - `kind` is a Tcgen05MMAKind attribute + + Default Valued Attributes: + - collectorBBuffer specifies collector buffer for matrix B: b0 (default), b1, b2, b3 + + - collectorOp is a Tcgen05MMACollectorOp attribute with matrix B as the collector buffer + + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-ws) + }]; + + let arguments = (ins + Tcgen05MMAKindAttr:$kind, + DefaultValuedAttr:$collectorBBuffer, + DefaultValuedAttr:$collectorOp, + LLVM_PointerTensor:$matrixD, + AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA, + I64:$matrixB, + I32:$idesc, + I1:$enableInputD, + Optional:$zeroColMask + ); + + let assemblyFormat = [{ + $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD (`,` $zeroColMask^)? 
+ attr-dict `:` `(` type(operands) `)` + }]; + + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder); + }]; + + let llvmBuilder = [{ + auto [ID, args] = + NVVM::Tcgen05MMAWsOp::getIntrinsicIDAndArgs(*op, moduleTranslation, builder); + createIntrinsicCall(builder, ID, args); + }]; +} + +def NVVM_Tcgen05MMAWsSparseOp : NVVM_Op<"tcgen05.mma.ws.sp", + [NVVMRequiresSMa<[100, 110]>]> { + let summary = "Performs weight stationary convolution MMA with sparse A matrix on 5th-gen tensor cores"; + + let description = [{ + The `tcgen05.mma.ws.sp` operation is an asynchronous tensor core instruction + that performs weight stationary convolution matrix multiplication, accumulation + with sparse `A` matrix in a single fused operation. It targets 5th-generation + tensor cores, providing developers with fine-grained control over execution, + and scheduling. + + ``` + D = A * B // if `enableInputD` is false + D = A * B + D // otherwise + ``` + + where: + - A is an M x (K / 2) matrix in memory or descriptor format + - B is a K x N matrix + - D is an M x N accumulator matrix + - sparseMetadata located in tensor memory specifies the mapping of the `K / 2` + non-zero elements to the K elements before performing the MMA operation + + Other attributes and operands are similar to that of tcgen05.mma.ws Op + + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-ws-sp) + }]; + + let arguments = (ins + Tcgen05MMAKindAttr:$kind, + DefaultValuedAttr:$collectorBBuffer, + DefaultValuedAttr:$collectorOp, + LLVM_PointerTensor:$matrixD, + AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA, + I64:$matrixB, + I32:$idesc, + I1:$enableInputD, + LLVM_PointerTensor:$sparseMetadata, + Optional:$zeroColMask + ); + + let assemblyFormat = [{ + $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD `,` $sparseMetadata (`,` 
$zeroColMask^)? attr-dict `:` `(` type(operands) `)` + }]; + + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder); + }]; + + let llvmBuilder = [{ + auto [ID, args] = NVVM::Tcgen05MMAWsSparseOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, ID, args); + }]; +} + //===----------------------------------------------------------------------===// // NVVM target attribute. //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td index 054c13a88a552..6b0c84d31d1ba 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td @@ -44,4 +44,35 @@ def PartialEntityAccessOpInterface : OpInterface<"PartialEntityAccessOpInterface ]; } +def AddressOfGlobalOpInterface : OpInterface<"AddressOfGlobalOpInterface"> { + let cppNamespace = "::mlir::acc"; + + let description = [{ + An interface for operations that compute the address of a global variable + or symbol. + }]; + + let methods = [ + InterfaceMethod<"Get the symbol reference to the global", "::mlir::SymbolRefAttr", + "getSymbol", (ins)>, + ]; +} + +def GlobalVariableOpInterface : OpInterface<"GlobalVariableOpInterface"> { + let cppNamespace = "::mlir::acc"; + + let description = [{ + An interface for operations that define global variables. This interface + provides a uniform way to query properties of global variables across + different dialects. 
+ }]; + + let methods = [ + InterfaceMethod<"Check if the global variable is constant", "bool", + "isConstant", (ins), [{ + return false; + }]>, + ]; +} + #endif // OPENACC_OPS_INTERFACES diff --git a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td index 970d9304d8289..cad78df2fbb0b 100644 --- a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td @@ -97,7 +97,14 @@ def ACCImplicitRoutine : Pass<"acc-implicit-routine", "mlir::ModuleOp"> { "mlir::acc::DeviceType::None", "Target device type for implicit routine generation. " "Ensures that `acc routine` device_type clauses are " - "properly considered not just default clauses."> + "properly considered not just default clauses.", + [{::llvm::cl::values( + clEnumValN(mlir::acc::DeviceType::None, "none", "none"), + clEnumValN(mlir::acc::DeviceType::Host, "host", "host"), + clEnumValN(mlir::acc::DeviceType::Multicore, "multicore", "multicore"), + clEnumValN(mlir::acc::DeviceType::Nvidia, "nvidia", "nvidia"), + clEnumValN(mlir::acc::DeviceType::Radeon, "radeon", "radeon")) + }]> ]; } diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 4c67856b559b1..1ae1feed177ae 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -1282,12 +1282,6 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou let hasCanonicalizer = 1; } -def isSharedPred : CPred<"isSharedMemory(llvm::cast($_self))">; -class StaticShared1DMemRefOf allowedTypes> : - ConfinedType, [HasStaticShapePred, isSharedPred], - "statically shaped " # MemRefOf.summary # " for shared memory", - "mlir::MemRefType">; - class SizeInBits : StrFunc<"llvm::cast($" # name # ".getType()).getNumElements()" "*llvm::cast($" # name # ".getType()).getElementTypeBitWidth()">; @@ -1304,11 +1298,12 @@ def 
XeGPU_CreateMemDescOp: XeGPU_Op<"create_mem_desc", [Pure, as the underlying shared local memory. Arguments: - - `source` : a 1D statically shaped memref with element type i8, representing the raw SLM buffer. + - `source` : 1D or 2D statically shaped memref, representing the raw SLM buffer. + The provided memref must be contiguous. Results: - `mem_desc` : the memory descriptor. }]; - let arguments = (ins StaticShared1DMemRefOf<[I8]>:$source); + let arguments = (ins AnyTypeOf<[StaticShared1DMemRefOf<[XeGPU_ScalarType]>, StaticShared2DMemRefOf<[XeGPU_ScalarType]>]>:$source); let results = (outs XeGPU_MemDesc:$mem_desc); let assemblyFormat = "$source prop-dict attr-dict `` `:` type($source) `->` qualified(type($mem_desc))"; } diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td index b1196fbe9c66a..7f7f7d065c50e 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td @@ -35,6 +35,17 @@ class XeGPUTypeDef traits = [], let mnemonic = typeMnemonic; } +def isSharedPred : CPred<"isSharedMemory(llvm::cast($_self))">; +class StaticShared1DMemRefOf allowedTypes> : + ConfinedType, [HasStaticShapePred, isSharedPred], + "reside in share memory and statically 1d shaped " # MemRefOf.summary # " ", + "mlir::MemRefType">; + +class StaticShared2DMemRefOf allowedTypes>: + ConfinedType, [HasStaticShapePred, isSharedPred], + "reside in share memory and statically 2d shaped " # MemRefOf.summary # " ", + "mlir::MemRefType">; + def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", [ShapedTypeInterface], "::mlir::TensorType"> { let summary = "TensorDesc describing regions of interested data."; diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h index 17c323a042ec2..724da009e70f1 100644 --- a/mlir/include/mlir/Transforms/Passes.h +++ b/mlir/include/mlir/Transforms/Passes.h @@ -31,23 +31,23 @@ class GreedyRewriteConfig; // 
Passes //===----------------------------------------------------------------------===// +#define GEN_PASS_DECL_BUBBLEDOWNMEMORYSPACECASTS +#define GEN_PASS_DECL_CSE #define GEN_PASS_DECL_CANONICALIZER +#define GEN_PASS_DECL_COMPOSITEFIXEDPOINTPASS #define GEN_PASS_DECL_CONTROLFLOWSINK -#define GEN_PASS_DECL_CSE -#define GEN_PASS_DECL_INLINER +#define GEN_PASS_DECL_GENERATERUNTIMEVERIFICATION #define GEN_PASS_DECL_LOOPINVARIANTCODEMOTION +#define GEN_PASS_DECL_INLINER #define GEN_PASS_DECL_MEM2REG #define GEN_PASS_DECL_PRINTIRPASS #define GEN_PASS_DECL_PRINTOPSTATS +#define GEN_PASS_DECL_SCCP #define GEN_PASS_DECL_SROA #define GEN_PASS_DECL_STRIPDEBUGINFO -#define GEN_PASS_DECL_SCCP #define GEN_PASS_DECL_SYMBOLDCE #define GEN_PASS_DECL_SYMBOLPRIVATIZE #define GEN_PASS_DECL_TOPOLOGICALSORT -#define GEN_PASS_DECL_COMPOSITEFIXEDPOINTPASS -#define GEN_PASS_DECL_BUBBLEDOWNMEMORYSPACECASTS -#define GEN_PASS_DECL_GENERATERUNTIMEVERIFICATION #include "mlir/Transforms/Passes.h.inc" /// Creates an instance of the Canonicalizer pass, configured with default diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp index 705298f497d20..4588cf46a9d46 100644 --- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp +++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp @@ -579,9 +579,6 @@ class LoadStoreToXeVMPattern : public OpConversionPattern { } }; -// Lower xegpu::CreateMemDescOp to memref::ViewOp. Since SLM access instructions -// on Xe2 and Xe3 operate on 32-bit or 64-bit units, all data types smaller than -// 32 bits will be converted to 32 bits. 
class CreateMemDescOpPattern final : public OpConversionPattern { public: @@ -590,16 +587,7 @@ class CreateMemDescOpPattern final matchAndRewrite(xegpu::CreateMemDescOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto resTy = op.getMemDesc(); - - // Create the result MemRefType with the same shape, element type, and - // memory space - auto newResTy = getTypeConverter()->convertType(resTy); - - Value zero = arith::ConstantIndexOp::create(rewriter, op.getLoc(), 0); - auto viewOp = memref::ViewOp::create(rewriter, op.getLoc(), newResTy, - op.getSource(), zero, ValueRange()); - rewriter.replaceOp(op, viewOp); + rewriter.replaceOp(op, adaptor.getSource()); return success(); } }; @@ -619,7 +607,7 @@ class LoadStoreMatrixToXeVMPattern : public OpConversionPattern { auto loc = op.getLoc(); auto ctxt = rewriter.getContext(); - Value basePtrStruct = adaptor.getMemDesc(); + Value baseAddr32 = adaptor.getMemDesc(); Value mdescVal = op.getMemDesc(); // Load result or Store value Type can be vector or scalar. 
Value data; @@ -647,21 +635,14 @@ class LoadStoreMatrixToXeVMPattern : public OpConversionPattern { auto mdescTy = cast(mdescVal.getType()); - Value basePtrLLVM = memref::ExtractAlignedPointerAsIndexOp::create( - rewriter, loc, basePtrStruct); - - // Convert base pointer (ptr) to i32 - Value basePtrI32 = arith::IndexCastUIOp::create( - rewriter, loc, rewriter.getI32Type(), basePtrLLVM); - Value linearOffset = mdescTy.getLinearOffsets(rewriter, loc, offsets); linearOffset = arith::IndexCastUIOp::create( rewriter, loc, rewriter.getI32Type(), linearOffset); - basePtrI32 = addOffsetToBaseAddr(rewriter, loc, basePtrI32, linearOffset, - elemByteSize); + Value basePtrI32 = addOffsetToBaseAddr(rewriter, loc, baseAddr32, + linearOffset, elemByteSize); // convert base pointer (i32) to LLVM pointer type - basePtrLLVM = + Value basePtrLLVM = LLVM::IntToPtrOp::create(rewriter, loc, ptrTypeLLVM, basePtrI32); if (op.getSubgroupBlockIoAttr()) { @@ -1005,15 +986,14 @@ struct ConvertXeGPUToXeVMPass auto i32Type = IntegerType::get(&getContext(), 32); return VectorType::get(8, i32Type); }); - // Convert MemDescType into flattened MemRefType for SLM + // Convert MemDescType into i32 for SLM typeConverter.addConversion([&](xegpu::MemDescType type) -> Type { - Type elemTy = type.getElementType(); - int numElems = type.getNumElements(); - return MemRefType::get(numElems, elemTy, AffineMap(), 3); + return IntegerType::get(&getContext(), 32); }); typeConverter.addConversion([&](MemRefType type) -> Type { - // Convert MemRefType to i64 type. 
+ if (type.getMemorySpaceAsInt() == 3) + return IntegerType::get(&getContext(), 32); return IntegerType::get(&getContext(), 64); }); diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 7ac427dbe3941..0e620737109b8 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -31,6 +31,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/NVVMIntrinsicUtils.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/NVPTXAddrSpace.h" @@ -60,6 +61,18 @@ static bool isPtrInSharedCTASpace(mlir::Value ptr) { return isPtrInAddrSpace(ptr, NVVMMemorySpace::Shared); } +// Helper method to convert CtaGroupKind in NVVM Dialect to CtaGroupKind in LLVM +static llvm::nvvm::CTAGroupKind +getNVVMCtaGroupKind(NVVM::CTAGroupKind ctaGroup) { + switch (ctaGroup) { + case NVVM::CTAGroupKind::CTA_1: + return llvm::nvvm::CTAGroupKind::CG_1; + case NVVM::CTAGroupKind::CTA_2: + return llvm::nvvm::CTAGroupKind::CG_2; + } + llvm_unreachable("unsupported cta_group value"); +} + //===----------------------------------------------------------------------===// // Verifier methods //===----------------------------------------------------------------------===// @@ -3091,6 +3104,605 @@ NVVM::IDArgPair ClusterLaunchControlQueryCancelOp::getIntrinsicIDAndArgs( return {intrinsicID, args}; } +//===----------------------------------------------------------------------===// +// NVVM tcgen05.mma functions +//===----------------------------------------------------------------------===// + +mlir::NVVM::IDArgPair +Tcgen05MMAOp::getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase &builder) { + + auto thisOp = cast(op); + llvm::SmallVector args; + + args.push_back(mt.lookupValue(thisOp.getMatrixD())); + + llvm::Value *A = mt.lookupValue(thisOp.getMatrixA()); + const bool 
isATensor = isa(A->getType()); + args.push_back(A); + + args.push_back(mt.lookupValue(thisOp.getMatrixB())); + args.push_back(mt.lookupValue(thisOp.getIdesc())); + args.push_back(mt.lookupValue(thisOp.getEnableInputD())); + + using EnableAShiftArray = std::array; + using CtaGroupArray = std::array; + using IsATensorArray = std::array; + using HasScaleInputDArray = std::array; + using HasDisableOutputLaneArray = std::array; + + // [hasDisableOutputLane][hasScaleInputD][isATensor][CtaGroup][EnableAShift] + static constexpr HasDisableOutputLaneArray tcgen05MMAIDs = { + { // without diable output lane + {{// without scale input D + {{ + // shared + {{// cg1 + {llvm::Intrinsic::nvvm_tcgen05_mma_shared, notIntrinsic}, + // cg2 + {llvm::Intrinsic::nvvm_tcgen05_mma_shared, notIntrinsic}}}, + {{// tensor + { + // cg1 + llvm::Intrinsic::nvvm_tcgen05_mma_tensor, + llvm::Intrinsic::nvvm_tcgen05_mma_tensor_ashift, + }, + { + // cg2 + llvm::Intrinsic::nvvm_tcgen05_mma_tensor, + llvm::Intrinsic::nvvm_tcgen05_mma_tensor_ashift, + }}}, + }}, + // with scale input D + {{ // shared + {{// cg1 + {llvm::Intrinsic::nvvm_tcgen05_mma_shared_scale_d, notIntrinsic}, + // cg2 + {llvm::Intrinsic::nvvm_tcgen05_mma_shared_scale_d, notIntrinsic}}}, + {{// tensor + { + // cg1 + llvm::Intrinsic::nvvm_tcgen05_mma_tensor_scale_d, + llvm::Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_ashift, + }, + { + // cg2 + llvm::Intrinsic::nvvm_tcgen05_mma_tensor_scale_d, + llvm::Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_ashift, + }}}}}}}, + // with disable output lane + {{ // without scale input D + {{ // shared + {{// cg1 + {llvm::Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1, + notIntrinsic}, + // cg2 + {llvm::Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2, + notIntrinsic}}}, + {{// cg1 + { + llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_disable_output_lane_cg1, + llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift, + }, + // cg2 + { + llvm::Intrinsic:: + 
nvvm_tcgen05_mma_tensor_disable_output_lane_cg2, + llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift, + }}}}}, + // with scale input D + {{ // shared + {{// cg1 + {llvm::Intrinsic:: + nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1, + notIntrinsic}, + // cg2 + {llvm::Intrinsic:: + nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2, + notIntrinsic}}}, + // tensor + {{// cg1 + {llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1, + llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift}, + // cg2 + { + llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2, + llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift, + }}}}}}}}}; + + llvm::Value *ScaleInputD = mt.lookupValue(thisOp.getScaleInputD()); + bool hasScaleInputD = ScaleInputD != nullptr; + + llvm::Value *DisableOutputLane = + mt.lookupValue(thisOp.getDisableOutputLane()); + bool hasDisableOutputLane = DisableOutputLane != nullptr; + + const unsigned ctaGroup = + static_cast(getNVVMCtaGroupKind(thisOp.getCtaGroup())); + + llvm::Intrinsic::ID ID = + tcgen05MMAIDs[hasDisableOutputLane][hasScaleInputD][isATensor] + [ctaGroup - 1][thisOp.getAShift()]; + + assert(ID != notIntrinsic && "Invalid intrinsic for Tcgen05MMAOp."); + + if (hasScaleInputD) + args.push_back(ScaleInputD); + + if (hasDisableOutputLane) + args.push_back(DisableOutputLane); + + args.push_back(builder.getInt32(static_cast(thisOp.getKind()))); + + if (!hasDisableOutputLane) + args.push_back(builder.getInt32(ctaGroup)); + + args.push_back( + builder.getInt32(static_cast(thisOp.getCollectorOp()))); + + return {ID, args}; +} + +static LogicalResult +verifyTcgen05MMAOp(bool isATensor, mlir::Value disableOutputLane, + NVVM::CTAGroupKind ctaGroup, bool hasAShift, + NVVM::Tcgen05MMACollectorOp collectorOp, Location loc) { + + if (disableOutputLane) { + mlir::VectorType disableOutputLaneType = + 
cast(disableOutputLane.getType()); + if ((ctaGroup == NVVM::CTAGroupKind::CTA_1 && + disableOutputLaneType.getNumElements() != 4) || + (ctaGroup == NVVM::CTAGroupKind::CTA_2 && + disableOutputLaneType.getNumElements() != 8)) + return emitError(loc) << "Disable Output Lane of length " + << disableOutputLaneType.getNumElements() + << " is incompatible with CtaGroupAttr"; + } + + if (hasAShift && !isATensor) + return emitError( + loc, "A-shift can be applied only when matrix A is in tensor memory"); + + if (hasAShift == true && (collectorOp == Tcgen05MMACollectorOp::FILL || + collectorOp == Tcgen05MMACollectorOp::USE)) + return emitError( + loc, "Cannot use collector buffer operation fill or use with ashift"); + + return success(); +} + +LogicalResult Tcgen05MMAOp::verify() { + return verifyTcgen05MMAOp(isa(getMatrixA().getType()), + getDisableOutputLane(), getCtaGroup(), getAShift(), + getCollectorOp(), getLoc()); +} + +//===----------------------------------------------------------------------===// +// NVVM tcgen05.mma.sp functions +//===----------------------------------------------------------------------===// + +mlir::NVVM::IDArgPair Tcgen05MMASparseOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + + auto thisOp = cast(op); + llvm::SmallVector args; + + args.push_back(mt.lookupValue(thisOp.getMatrixD())); + + llvm::Value *A = mt.lookupValue(thisOp.getMatrixA()); + bool isATensor = isa(A->getType()); + args.push_back(A); + + args.push_back(mt.lookupValue(thisOp.getMatrixB())); + args.push_back(mt.lookupValue(thisOp.getIdesc())); + args.push_back(mt.lookupValue(thisOp.getEnableInputD())); + args.push_back(mt.lookupValue(thisOp.getSparseMetadata())); + + using EnableAShiftArray = std::array; + using CtaGroupArray = std::array; + using IsATensorArray = std::array; + using HasScaleInputDArray = std::array; + using HasDisableOutputLaneArray = std::array; + + // 
[hasDisableOutputLane][hasScaleInputD][isATensor][CtaGroup][EnableAShift] + static constexpr HasDisableOutputLaneArray tcgen05MMASparseIDs = { + { // without diable output lane + {{// without scale input D + {{ + // shared + {{// cg1 + {llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared, notIntrinsic}, + // cg2 + {llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared, notIntrinsic}}}, + {{// tensor + { + // cg1 + llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor, + llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_ashift, + }, + { + // cg2 + llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor, + llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_ashift, + }}}, + }}, + // with scale input D + {{ // shared + {{// cg1 + {llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d, + notIntrinsic}, + // cg2 + {llvm::Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d, + notIntrinsic}}}, + {{// tensor + { + // cg1 + llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d, + llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_ashift, + }, + { + // cg2 + llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d, + llvm::Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_ashift, + }}}}}}}, + // with disable output lane + {{ // without scale input D + {{ // shared + {{// cg1 + {llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1, + notIntrinsic}, + // cg2 + {llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2, + notIntrinsic}}}, + {{// cg1 + { + llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1, + llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift, + }, + // cg2 + { + llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2, + llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift, + }}}}}, + // with scale input D + {{ // shared + {{// cg1 + {llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1, + notIntrinsic}, + // cg2 + {llvm::Intrinsic:: + 
nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2, + notIntrinsic}}}, + // tensor + {{// cg1 + {llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1, + llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift}, + // cg2 + { + llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2, + llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift, + }}}}}}}}}; + + llvm::Value *ScaleInputD = mt.lookupValue(thisOp.getScaleInputD()); + bool hasScaleInputD = ScaleInputD != nullptr; + + llvm::Value *DisableOutputLane = + mt.lookupValue(thisOp.getDisableOutputLane()); + bool hasDisableOutputLane = DisableOutputLane != nullptr; + + unsigned ctaGroup = + static_cast(getNVVMCtaGroupKind(thisOp.getCtaGroup())); + + llvm::Intrinsic::ID ID = + tcgen05MMASparseIDs[hasDisableOutputLane][hasScaleInputD][isATensor] + [ctaGroup - 1][thisOp.getAShift()]; + + assert(ID != notIntrinsic && "Invalid intrinsic for Tcgen05MMASparseOp."); + + if (hasScaleInputD) + args.push_back(ScaleInputD); + + if (hasDisableOutputLane) + args.push_back(DisableOutputLane); + + args.push_back(builder.getInt32(static_cast(thisOp.getKind()))); + + if (!hasDisableOutputLane) + args.push_back(builder.getInt32(ctaGroup)); + + args.push_back( + builder.getInt32(static_cast(thisOp.getCollectorOp()))); + + return {ID, args}; +} + +LogicalResult Tcgen05MMASparseOp::verify() { + return verifyTcgen05MMAOp(isa(getMatrixA().getType()), + getDisableOutputLane(), getCtaGroup(), getAShift(), + getCollectorOp(), getLoc()); +} + +//===----------------------------------------------------------------------===// +// NVVM tcgen05.mma.block_scale functions +//===----------------------------------------------------------------------===// + +mlir::NVVM::IDArgPair Tcgen05MMABlockScaleOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + + auto thisOp = cast(op); + 
llvm::SmallVector args; + + args.push_back(mt.lookupValue(thisOp.getMatrixD())); + + llvm::Value *A = mt.lookupValue(thisOp.getMatrixA()); + bool isATensor = isa(A->getType()); + args.push_back(A); + + args.push_back(mt.lookupValue(thisOp.getMatrixB())); + args.push_back(mt.lookupValue(thisOp.getIdesc())); + args.push_back(mt.lookupValue(thisOp.getEnableInputD())); + args.push_back(mt.lookupValue(thisOp.getScaleA())); + args.push_back(mt.lookupValue(thisOp.getScaleB())); + args.push_back(builder.getInt32( + static_cast(getNVVMCtaGroupKind(thisOp.getCtaGroup())))); + args.push_back( + builder.getInt32(static_cast(thisOp.getCollectorOp()))); + + auto kind = thisOp.getKind(); + auto blockScale = thisOp.getBlockScale(); + llvm::Intrinsic::ID ID = [&]() { + if (kind == NVVM::Tcgen05MMABlockScaleKind::MXF8F6F4) { + if (blockScale == NVVM::Tcgen05MMABlockScale::DEFAULT) { + return isATensor ? llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_mxf8f6f4_block_scale + : llvm::Intrinsic:: + nvvm_tcgen05_mma_shared_mxf8f6f4_block_scale; + } else if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK32) { + return isATensor + ? llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_mxf8f6f4_block_scale_block32 + : llvm::Intrinsic:: + nvvm_tcgen05_mma_shared_mxf8f6f4_block_scale_block32; + } + } else if (kind == NVVM::Tcgen05MMABlockScaleKind::MXF4) { + if (blockScale == NVVM::Tcgen05MMABlockScale::DEFAULT) { + return isATensor + ? llvm::Intrinsic::nvvm_tcgen05_mma_tensor_mxf4_block_scale + : llvm::Intrinsic::nvvm_tcgen05_mma_shared_mxf4_block_scale; + } else if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK32) { + return isATensor ? llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_mxf4_block_scale_block32 + : llvm::Intrinsic:: + nvvm_tcgen05_mma_shared_mxf4_block_scale_block32; + } + } else if (kind == NVVM::Tcgen05MMABlockScaleKind::MXF4NVF4) { + if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK32) { + return isATensor + ? 
llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_mxf4nvf4_block_scale_block32 + : llvm::Intrinsic:: + nvvm_tcgen05_mma_shared_mxf4nvf4_block_scale_block32; + + } else if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK16) { + return isATensor + ? llvm::Intrinsic:: + nvvm_tcgen05_mma_tensor_mxf4nvf4_block_scale_block16 + : llvm::Intrinsic:: + nvvm_tcgen05_mma_shared_mxf4nvf4_block_scale_block16; + } + } + llvm_unreachable("Invalid tcgen05.mma.block_scale attributes"); + }(); + + return {ID, args}; +} + +static LogicalResult +verifyTcgen05MMABlockScaleOp(NVVM::Tcgen05MMACollectorOp collectorOp, + NVVM::Tcgen05MMABlockScaleKind kind, + NVVM::Tcgen05MMABlockScale blockScale, + Location loc) { + + if (blockScale == NVVM::Tcgen05MMABlockScale::DEFAULT && + kind == Tcgen05MMABlockScaleKind::MXF4NVF4) + return emitError(loc, "mxf4nvf4 requires block scale attribute"); + + if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK16 && + kind != Tcgen05MMABlockScaleKind::MXF4NVF4) + return emitError(loc, + llvm::formatv("{0} kind does not support block16 attribute", + stringifyEnum(kind))); + + return success(); +} + +LogicalResult Tcgen05MMABlockScaleOp::verify() { + return verifyTcgen05MMABlockScaleOp(getCollectorOp(), getKind(), + getBlockScale(), getLoc()); +} + +//===----------------------------------------------------------------------===// +// NVVM tcgen05.mma.sp.block_scale functions +//===----------------------------------------------------------------------===// + +mlir::NVVM::IDArgPair Tcgen05MMASparseBlockScaleOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + + auto thisOp = cast(op); + llvm::SmallVector args; + + args.push_back(mt.lookupValue(thisOp.getMatrixD())); + + llvm::Value *A = mt.lookupValue(thisOp.getMatrixA()); + bool isATensor = isa(A->getType()); + args.push_back(A); + + args.push_back(mt.lookupValue(thisOp.getMatrixB())); + args.push_back(mt.lookupValue(thisOp.getIdesc())); + 
args.push_back(mt.lookupValue(thisOp.getEnableInputD())); + args.push_back(mt.lookupValue(thisOp.getSparseMetadata())); + args.push_back(mt.lookupValue(thisOp.getScaleA())); + args.push_back(mt.lookupValue(thisOp.getScaleB())); + args.push_back(builder.getInt32( + static_cast(getNVVMCtaGroupKind(thisOp.getCtaGroup())))); + args.push_back( + builder.getInt32(static_cast(thisOp.getCollectorOp()))); + + auto kind = thisOp.getKind(); + auto blockScale = thisOp.getBlockScale(); + llvm::Intrinsic::ID ID = [&]() { + if (kind == NVVM::Tcgen05MMABlockScaleKind::MXF8F6F4) { + if (blockScale == NVVM::Tcgen05MMABlockScale::DEFAULT) { + return isATensor ? llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_mxf8f6f4_block_scale + : llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_shared_mxf8f6f4_block_scale; + } else if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK32) { + return isATensor + ? llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_mxf8f6f4_block_scale_block32 + : llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_shared_mxf8f6f4_block_scale_block32; + } + } else if (kind == NVVM::Tcgen05MMABlockScaleKind::MXF4) { + if (blockScale == NVVM::Tcgen05MMABlockScale::DEFAULT) { + return isATensor ? llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_mxf4_block_scale + : llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_shared_mxf4_block_scale; + } else if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK32) { + return isATensor + ? llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_mxf4_block_scale_block32 + : llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_shared_mxf4_block_scale_block32; + } + } else if (kind == NVVM::Tcgen05MMABlockScaleKind::MXF4NVF4) { + if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK32) { + return isATensor + ? llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_mxf4nvf4_block_scale_block32 + : llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_shared_mxf4nvf4_block_scale_block32; + + } else if (blockScale == NVVM::Tcgen05MMABlockScale::BLOCK16) { + return isATensor + ? 
llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_mxf4nvf4_block_scale_block16 + : llvm::Intrinsic:: + nvvm_tcgen05_mma_sp_shared_mxf4nvf4_block_scale_block16; + } + } + llvm_unreachable("Invalid tcgen05.mma.sp.block_scale attributes"); + }(); + + return {ID, args}; +} + +LogicalResult Tcgen05MMASparseBlockScaleOp::verify() { + return verifyTcgen05MMABlockScaleOp(getCollectorOp(), getKind(), + getBlockScale(), getLoc()); +} + +//===----------------------------------------------------------------------===// +// NVVM tcgen05.mma.ws functions +//===----------------------------------------------------------------------===// + +mlir::NVVM::IDArgPair Tcgen05MMAWsOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + + auto thisOp = cast(op); + llvm::SmallVector args; + + args.push_back(mt.lookupValue(thisOp.getMatrixD())); + + llvm::Value *A = mt.lookupValue(thisOp.getMatrixA()); + bool isATensor = isa(A->getType()); + args.push_back(A); + + args.push_back(mt.lookupValue(thisOp.getMatrixB())); + args.push_back(mt.lookupValue(thisOp.getIdesc())); + args.push_back(mt.lookupValue(thisOp.getEnableInputD())); + + mlir::Value ZeroColMask = thisOp.getZeroColMask(); + llvm::Intrinsic::ID ID = notIntrinsic; + if (ZeroColMask) { + args.push_back(mt.lookupValue(ZeroColMask)); + ID = isATensor ? llvm::Intrinsic::nvvm_tcgen05_mma_ws_tensor_zero_col_mask + : llvm::Intrinsic::nvvm_tcgen05_mma_ws_shared_zero_col_mask; + } else + ID = isATensor ? 
llvm::Intrinsic::nvvm_tcgen05_mma_ws_tensor + : llvm::Intrinsic::nvvm_tcgen05_mma_ws_shared; + + args.push_back(builder.getInt32(static_cast(thisOp.getKind()))); + args.push_back( + builder.getInt32(static_cast(thisOp.getCollectorBBuffer()))); + args.push_back( + builder.getInt32(static_cast(thisOp.getCollectorOp()))); + + return {ID, args}; +} + +//===----------------------------------------------------------------------===// +// NVVM tcgen05.mma.ws.sp functions +//===----------------------------------------------------------------------===// + +mlir::NVVM::IDArgPair Tcgen05MMAWsSparseOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + + auto thisOp = cast(op); + llvm::SmallVector args; + + args.push_back(mt.lookupValue(thisOp.getMatrixD())); + + llvm::Value *A = mt.lookupValue(thisOp.getMatrixA()); + bool isATensor = isa(A->getType()); + args.push_back(A); + + args.push_back(mt.lookupValue(thisOp.getMatrixB())); + args.push_back(mt.lookupValue(thisOp.getIdesc())); + args.push_back(mt.lookupValue(thisOp.getEnableInputD())); + args.push_back(mt.lookupValue(thisOp.getSparseMetadata())); + + mlir::Value ZeroColMask = thisOp.getZeroColMask(); + llvm::Intrinsic::ID ID = notIntrinsic; + if (ZeroColMask) { + args.push_back(mt.lookupValue(ZeroColMask)); + ID = isATensor + ? llvm::Intrinsic::nvvm_tcgen05_mma_ws_sp_tensor_zero_col_mask + : llvm::Intrinsic::nvvm_tcgen05_mma_ws_sp_shared_zero_col_mask; + } else + ID = isATensor ? llvm::Intrinsic::nvvm_tcgen05_mma_ws_sp_tensor + : llvm::Intrinsic::nvvm_tcgen05_mma_ws_sp_shared; + + args.push_back(builder.getInt32(static_cast(thisOp.getKind()))); + args.push_back( + builder.getInt32(static_cast(thisOp.getCollectorBBuffer()))); + args.push_back( + builder.getInt32(static_cast(thisOp.getCollectorOp()))); + + return {ID, args}; +} + //===----------------------------------------------------------------------===// // NVVMDialect initialization, type parsing, and registration. 
//===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index 8c9c137b8aebb..5749e6ded73ba 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -211,6 +211,24 @@ struct LLVMPointerPointerLikeModel Type getElementType(Type pointer) const { return Type(); } }; +struct MemrefAddressOfGlobalModel + : public AddressOfGlobalOpInterface::ExternalModel< + MemrefAddressOfGlobalModel, memref::GetGlobalOp> { + SymbolRefAttr getSymbol(Operation *op) const { + auto getGlobalOp = cast(op); + return getGlobalOp.getNameAttr(); + } +}; + +struct MemrefGlobalVariableModel + : public GlobalVariableOpInterface::ExternalModel { + bool isConstant(Operation *op) const { + auto globalOp = cast(op); + return globalOp.getConstant(); + } +}; + /// Helper function for any of the times we need to modify an ArrayAttr based on /// a device type list. Returns a new ArrayAttr with all of the /// existingDeviceTypes, plus the effective new ones(or an added none if hte new @@ -302,6 +320,11 @@ void OpenACCDialect::initialize() { MemRefPointerLikeModel>(*getContext()); LLVM::LLVMPointerType::attachInterface( *getContext()); + + // Attach operation interfaces + memref::GetGlobalOp::attachInterface( + *getContext()); + memref::GlobalOp::attachInterface(*getContext()); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt index 76e9ddd5b2304..4aa5b4523bbe6 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt @@ -26,4 +26,5 @@ add_mlir_dialect_library(MLIRTosaTransforms MLIRPass MLIRTosaDialect MLIRTransformUtils + MLIRFuncTransforms ) diff --git a/mlir/test/Conversion/XeGPUToXeVM/loadstore_matrix.mlir 
b/mlir/test/Conversion/XeGPUToXeVM/loadstore_matrix.mlir index d4cb493271d0d..ac95a1a5707ea 100644 --- a/mlir/test/Conversion/XeGPUToXeVM/loadstore_matrix.mlir +++ b/mlir/test/Conversion/XeGPUToXeVM/loadstore_matrix.mlir @@ -4,8 +4,8 @@ gpu.module @test_kernel [#xevm.target] { // e.g. for mem_desc<32x32xf16, @strides=[1, 16]> // its memory layout tuple is (blocked shape = [1,1,32,32],strides=[1024,1024,32,1]) - //CHECK-LABEL: load_store_matrix_1 - gpu.func @load_store_matrix_1(%arg0: memref<4096xi8, 3>) -> f32 { + //CHECK-LABEL: load_store_matrix_plain + gpu.func @load_store_matrix_plain(%arg0: memref<4096xi8, 3>) -> f32 { %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x32xf32> //CHECK: %[[TID:.*]] = gpu.thread_id x @@ -26,12 +26,40 @@ gpu.module @test_kernel [#xevm.target] { gpu.return %1: f32 } + //CHECK-LABEL: load_store_matrix_plain_2d_input + gpu.func @load_store_matrix_plain_2d_input(%arg0: memref<8192xi8, 3>) -> f32 { + %c0 = arith.constant 0 : index + %view = memref.view %arg0[%c0][]: memref<8192xi8, 3> to memref<64x32xf32, 3> + + %subview = memref.subview %view[32, 0] [32, 32] [1, 1] : memref<64x32xf32, 3> to memref<32x32xf32, strided<[32, 1], offset: 1024>, 3> + + %0 = xegpu.create_mem_desc %subview : memref<32x32xf32, strided<[32, 1], offset: 1024>, 3> -> !xegpu.mem_desc<32x32xf32> + + //CHECK: %[[TID:.*]] = gpu.thread_id x + //CHECK: %[[C1:.*]] = arith.constant 1 : index + //CHECK: %[[MUL1:.*]] = arith.muli %[[TID]], %[[C1]] : index + //CHECK: %[[C4:.*]] = arith.constant 4 : i32 + //CHECK: %[[MUL2:.*]] = arith.muli {{.*}}, %[[C4]] : i32 + //CHECK: llvm.load {{.*}} : !llvm.ptr<3> -> f32 + + %tid_x = gpu.thread_id x + + %1 = xegpu.load_matrix %0[%c0, %tid_x]: !xegpu.mem_desc<32x32xf32>, index, index -> f32 + + //CHECK: llvm.store {{.*}}, {{.*}} : f32, !llvm.ptr<3> + + xegpu.store_matrix %1, %0[%c0, %tid_x]: f32, !xegpu.mem_desc<32x32xf32>, index, index + + gpu.return %1: f32 + } + + // e.g. 
for mem_desc<32x64xf16, @block=[16, 16], @strides=[1, 32]> // its memory layout tuple is ([2,4,16,16],[256,512,1,16]) - //CHECK-LABEL: load_store_matrix_2 - gpu.func @load_store_matrix_2(%arg0: memref<4096xi8, 3>) -> f16 { + //CHECK-LABEL: load_store_matrix_blocked_strided + gpu.func @load_store_matrix_blocked_strided(%arg0: memref<4096xi8, 3>) -> f16 { %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout> - //CHECK: %[[c0:.*]] = arith.constant 0 : index + //CHECK: %[[tid_x:.*]] = gpu.thread_id x //CHECK: %[[c13:.*]] = arith.constant 13 : index //CHECK: %[[c16:.*]] = arith.constant 16 : index @@ -39,7 +67,7 @@ gpu.module @test_kernel [#xevm.target] { //CHECK: %[[offsetx_1:.*]] = arith.remsi %[[c13]], %[[c16]] : index //CHECK: %[[offsety_0:.*]] = arith.divsi %[[tid_x]], %[[c16]] : index //CHECK: %[[offsety_1:.*]] = arith.remsi %[[tid_x]], %[[c16]] : index - + //CHECK: %[[c0:.*]] = arith.constant 0 : index //CHECK: %[[c256:.*]] = arith.constant 256 : index //CHECK: %[[mul0:.*]] = arith.muli %[[offsetx_0]], %[[c256]] : index //CHECK: %[[add0:.*]] = arith.addi %[[mul0]], %[[c0]] : index @@ -68,10 +96,11 @@ gpu.module @test_kernel [#xevm.target] { // e.g. 
for mem_desc<32x64xf16, @block=[16, 16]> // its memory layout tuple is ([2,4,16,16],[1024,256,16,1]) - //CHECK-LABEL: load_store_matrix_3 - gpu.func @load_store_matrix_3(%arg0: memref<4096xi8, 3>) -> f16 { - //CHECK: %[[c0:.*]] = arith.constant 0 : index - //CHECK: %[[view:.*]] = memref.view %arg0[%[[c0]]][] : memref<4096xi8, 3> to memref<2048xf16, 3> + //CHECK-LABEL: load_store_matrix_blocked_nostride + gpu.func @load_store_matrix_blocked_nostride(%arg0: memref<4096xi8, 3>) -> f16 { + + //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %arg0 : memref<4096xi8, 3> -> index + //CHECK: %[[basePtrI64:.*]] = arith.index_castui %[[intptr]] : index to i32 %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout> //CHECK: %[[tid_x:.*]] = gpu.thread_id x @@ -79,13 +108,12 @@ gpu.module @test_kernel [#xevm.target] { %tid_x = gpu.thread_id x %c19 = arith.constant 19: index - //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %[[view]] : memref<2048xf16, 3> -> index - //CHECK: %[[basePtrI64:.*]] = arith.index_castui %[[intptr]] : index to i32 //CHECK: %[[c16:.*]] = arith.constant 16 : index //CHECK: %[[offsetx_0:.*]] = arith.divsi %[[c19]], %[[c16]] : index //CHECK: %[[offsetx_1:.*]] = arith.remsi %[[c19]], %[[c16]] : index //CHECK: %[[offsety_0:.*]] = arith.divsi %[[tid_x]], %[[c16]] : index //CHECK: %[[offsety_1:.*]] = arith.remsi %[[tid_x]], %[[c16]] : index + //CHECK: %[[c0:.*]] = arith.constant 0 : index //CHECK: %[[c1024:.*]] = arith.constant 1024 : index //CHECK: %[[mul0:.*]] = arith.muli %[[offsetx_0]], %[[c1024]] : index //CHECK: %[[add0:.*]] = arith.addi %[[mul0]], %[[c0]] : index @@ -97,7 +125,6 @@ gpu.module @test_kernel [#xevm.target] { //CHECK: %[[c1:.*]] = arith.constant 1 : index //CHECK: %[[mul3:.*]] = arith.muli %[[offsety_1]], %[[c1]] : index //CHECK: %[[add3:.*]] = arith.addi %[[mul3]], %[[add2]] : index - //CHECK: %[[loaded:.*]] = llvm.load {{.*}} : !llvm.ptr<3> -> f16 %1 = 
xegpu.load_matrix %0[%c19, %tid_x]: !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout>, index, index -> f16 @@ -110,19 +137,17 @@ gpu.module @test_kernel [#xevm.target] { // e.g. for mem_desc<32x64xf16, @block=[16, 16], @strides=[1, 16]> // its memory layout tuple is ([2,4,16,16],[256,512,1,16]) - //CHECK-LABEL: load_store_matrix_4 - gpu.func @load_store_matrix_4(%arg0: memref<4096xi8, 3>) -> vector<8xf16> { + //CHECK-LABEL: load_store_matrix_blocked_strided_return_vector + gpu.func @load_store_matrix_blocked_strided_return_vector(%arg0: memref<4096xi8, 3>) -> vector<8xf16> { %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout> - //CHECK: %[[c0:.*]] = arith.constant 0 : index //CHECK: %[[tid_x:.*]] = gpu.thread_id x - //CHECK: %[[c16:.*]] = arith.constant 16 : index //CHECK: %[[offsetx_0:.*]] = arith.divsi %[[c16]], %[[c16]] : index //CHECK: %[[offsetx_1:.*]] = arith.remsi %[[c16]], %[[c16]] : index //CHECK: %[[offsety_0:.*]] = arith.divsi %[[tid_x]], %[[c16]] : index //CHECK: %[[offsety_1:.*]] = arith.remsi %[[tid_x]], %[[c16]] : index - + //CHECK: %[[c0:.*]] = arith.constant 0 : index //CHECK: %[[c256:.*]] = arith.constant 256 : index //CHECK: %[[mul0:.*]] = arith.muli %[[offsetx_0]], %[[c256]] : index //CHECK: %[[add0:.*]] = arith.addi %[[mul0]], %[[c0]] : index @@ -150,25 +175,23 @@ gpu.module @test_kernel [#xevm.target] { // e.g. 
for mem_desc<32x64xf16, @block=[16, 16]> // its memory layout tuple is ([2,4,16,16],[1024,256,16,1]) - //CHECK-LABEL: load_store_matrix_5 - gpu.func @load_store_matrix_5(%arg0: memref<4096xi8, 3>) -> vector<8xf16> { - //CHECK: %[[c0:.*]] = arith.constant 0 : index - //CHECK: %[[view:.*]] = memref.view %arg0[%[[c0]]][] : memref<4096xi8, 3> to memref<2048xf16, 3> - - %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout> - + //CHECK-LABEL: load_store_matrix_blocked_subgroupblockio + gpu.func @load_store_matrix_blocked_subgroupblockio(%arg0: memref<4096xi8, 3>) -> vector<8xf16> { + + //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %arg0 : memref<4096xi8, 3> -> index + //CHECK: %[[basePtrI32:.*]] = arith.index_castui %[[intptr]] : index to i32 + %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x64xf16, #xegpu.mem_layout> + //CHECK: %[[c16:.*]] = arith.constant 16 : index //CHECK: %[[c48:.*]] = arith.constant 48 : index - %c16 = arith.constant 16 : index %c48 = arith.constant 48 : index - //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %[[view]] : memref<2048xf16, 3> -> index - //CHECK: %[[basePtrI64:.*]] = arith.index_castui %[[intptr]] : index to i32 //CHECK: %[[offset0:.*]] = arith.divsi %[[c16]], %[[c16]] : index //CHECK: %[[offset1:.*]] = arith.remsi %[[c16]], %[[c16]] : index //CHECK: %[[offset2:.*]] = arith.divsi %[[c48]], %[[c16]] : index //CHECK: %[[offset3:.*]] = arith.remsi %[[c48]], %[[c16]] : index + //CHECK: %[[c0:.*]] = arith.constant 0 : index //CHECK: %[[c1024:.*]] = arith.constant 1024 : index //CHECK: %[[mul0:.*]] = arith.muli %[[offset0]], %[[c1024]] : index //CHECK: %[[add0:.*]] = arith.addi %[[mul0]], %[[c0]] : index @@ -183,7 +206,7 @@ gpu.module @test_kernel [#xevm.target] { //CHECK: %[[linearOffsetI64:.*]] = arith.index_castui %[[linearOffset]] : index to i32 //CHECK: %[[c2:.*]] = arith.constant 2 : i32 //CHECK: %[[byteOffset:.*]] = 
arith.muli %[[linearOffsetI64]], %[[c2]] : i32 - //CHECK: %[[finalPtr:.*]] = arith.addi %[[basePtrI64]], %[[byteOffset]] : i32 + //CHECK: %[[finalPtr:.*]] = arith.addi %[[basePtrI32]], %[[byteOffset]] : i32 //CHECK: %[[ptr:.*]] = llvm.inttoptr %[[finalPtr]] : i32 to !llvm.ptr<3> //CHECK: %[[loadedI16:.*]] = xevm.blockload %[[ptr]] : (!llvm.ptr<3>) -> vector<8xi16> //CHECK: %[[loaded:.*]] = vector.bitcast %[[loadedI16]] : vector<8xi16> to vector<8xf16> diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index 92f353717ac59..67faa60f2835e 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -836,7 +836,7 @@ func.func @slice_attr_repeat_dim() { // ----- func.func @create_mem_desc_non_slm() { %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 1> - // expected-error@+1 {{operand #0 must be statically shaped memref of 8-bit signless integer values for shared memory}} + // expected-error@+1 {{operand #0 must be reside in share memory and statically 1d shaped memref }} %mem_desc = xegpu.create_mem_desc %m : memref<2048xi8, 1> -> !xegpu.mem_desc<16x64xf16> return } diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir index 9b3829664108d..1e9738f44bb66 100644 --- a/mlir/test/Dialect/XeGPU/ops.mlir +++ b/mlir/test/Dialect/XeGPU/ops.mlir @@ -834,6 +834,27 @@ gpu.func @create_mem_desc_with_stride() { gpu.return } + +// CHECK-LABEL: gpu.func @create_mem_desc_from_2d_memref({{.*}}) { +gpu.func @create_mem_desc_from_2d_memref() { + //CHECK: [[alloc:%.+]] = memref.alloca() {alignment = 1024 : i64} : memref<16x64xf16, 3> + //CHECK: [[mdesc:%.+]] = xegpu.create_mem_desc [[alloc]] : memref<16x64xf16, 3> -> !xegpu.mem_desc<16x64xf16> + %m = memref.alloca() {alignment = 1024} : memref<16x64xf16, 3> + %mem_desc = xegpu.create_mem_desc %m : memref<16x64xf16, 3> -> !xegpu.mem_desc<16x64xf16> + gpu.return +} + +// CHECK-LABEL: gpu.func 
@create_mem_desc_with_stride_from_2d_memref({{.*}}) { +gpu.func @create_mem_desc_with_stride_from_2d_memref() { + //CHECK: %[[ALLOC:.+]] = memref.alloca() {alignment = 1024 : i64} : memref<32x64xf16, 3> + //CHECK: %[[SUBVIEW:.+]] = memref.subview %[[ALLOC]][16, 0] [16, 64] [1, 1] : memref<32x64xf16, 3> to memref<16x64xf16, strided<[64, 1], offset: 1024>, 3> + //CHECK: %{{.+}} = xegpu.create_mem_desc %[[SUBVIEW]] : memref<16x64xf16, strided<[64, 1], offset: 1024>, 3> -> !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout> + %m = memref.alloca() {alignment = 1024} : memref<32x64xf16, 3> + %m_sub = memref.subview %m[16, 0][16, 64][1,1] : memref<32x64xf16, 3> to memref<16x64xf16, strided<[64, 1], offset: 1024>, 3> + %mem_desc = xegpu.create_mem_desc %m_sub : memref<16x64xf16, strided<[64, 1], offset: 1024>, 3> -> !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout> + gpu.return +} + // CHECK: gpu.func @load_matrix([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>) gpu.func @load_matrix(%arg0: !xegpu.mem_desc<16x64xf16>) { // CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16> -> vector<8x16xf16> diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-shared.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-shared.mlir new file mode 100644 index 0000000000000..db4574bfaf78f --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-shared.mlir @@ -0,0 +1,229 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + 
nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 
{{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = 
#nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: 
call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, 
%enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_mxf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, 
blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + 
nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_mxf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, 
%a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 
{{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : 
(!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, 
!llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) 
{{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) 
{{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : 
(!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-tensor.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-tensor.mlir new file mode 100644 
index 0000000000000..a15c3fb73de9c --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-tensor.mlir @@ -0,0 +1,229 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = 
#nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) 
{{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_mxf8f6f4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 
{{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = 
#nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4_block_scale_cta_1 +llvm.func 
@nvvm_tcgen05_mma_mxf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, 
ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, 
collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_mxf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : 
(!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr 
addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 
{{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = 
#nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + 
// CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_mxf4nvf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, 
!llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 
{{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-invalid.mlir new file mode 100644 index 0000000000000..f46b35a910fd9 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-invalid.mlir @@ -0,0 +1,119 @@ +// RUN: mlir-translate --mlir-to-llvmir 
-verify-diagnostics -split-input-file %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLanev4: vector<4 x i32>, %disableOutputLanev8: vector<8 x i32>) { + // expected-error @below {{Disable Output Lane of length 8 is incompatible with CtaGroupAttr}} + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLanev8 + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLanev4: vector<4 x i32>, %disableOutputLanev8: vector<8 x i32>) { + // expected-error @below {{Disable Output Lane of length 8 is incompatible with CtaGroupAttr}} + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLanev8 + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_shared_ashift +llvm.func @nvvm_tcgen05_mma_shared_ashift(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + // expected-error @below {{A-shift can be applied only when matrix A is in tensor memory}} + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, i64, i64, i32, i1) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_ashift +llvm.func @nvvm_tcgen05_mma_ashift(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, 
%idesc: i32, %enable_input_d: i1) { + // expected-error @below {{Cannot use collector buffer operation fill or use with ashift}} + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4nvf4_block_scale_default +llvm.func @nvvm_tcgen05_mma_mxf4nvf4_block_scale_default(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scalea: !llvm.ptr<6>, %scaleb: !llvm.ptr<6>) { + // expected-error @below {{mxf4nvf4 requires block scale attribute}} + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scalea, %scaleb + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_mxf4_block_scale_default +llvm.func @nvvm_tcgen05_mma_mxf4_block_scale_default(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scalea: !llvm.ptr<6>, %scaleb: !llvm.ptr<6>) { + // expected-error @below {{mxf4 kind does not support block16 attribute}} + nvvm.tcgen05.mma.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %scalea, %scaleb + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, ashift, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, 
%enable_input_d: i1, %disableOutputLanev4: vector<4 x i32>, %disableOutputLanev8: vector<8 x i32>, %spmetadata: !llvm.ptr<6>) { + // expected-error @below {{Disable Output Lane of length 8 is incompatible with CtaGroupAttr}} + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLanev8 + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLanev4: vector<4 x i32>, %disableOutputLanev8: vector<8 x i32>, %spmetadata: !llvm.ptr<6>) { + // expected-error @below {{Disable Output Lane of length 8 is incompatible with CtaGroupAttr}} + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLanev8 + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_sp_mma_shared_ashift +llvm.func @nvvm_tcgen05_sp_mma_shared_ashift(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + // expected-error @below {{A-shift can be applied only when matrix A is in tensor memory}} + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_ashift +llvm.func @nvvm_tcgen05_mma_sp_ashift(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) 
{ + // expected-error @below {{Cannot use collector buffer operation fill or use with ashift}} + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_default +llvm.func @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_default(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scalea: !llvm.ptr<6>, %scaleb: !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + // expected-error @below {{mxf4nvf4 requires block scale attribute}} + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scalea, %scaleb + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + llvm.return +} + +// ----- + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4_block_scale_default +llvm.func @nvvm_tcgen05_mma_sp_mxf4_block_scale_default(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scalea: !llvm.ptr<6>, %scaleb: !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + // expected-error @below {{mxf4 kind does not support block16 attribute}} + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scalea, %scaleb + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, ashift, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-shared.mlir 
b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-shared.mlir new file mode 100644 index 0000000000000..286df36730e77 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-shared.mlir @@ -0,0 +1,442 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_cta_1 +llvm.func @nvvm_tcgen05_mma_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, 
%a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = 
#nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = 
#nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_cta_2 +llvm.func @nvvm_tcgen05_mma_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, 
i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr 
addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 
{{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + llvm.return +} + + +// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_cta_1 +llvm.func @nvvm_tcgen05_mma_scale_d_imm_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, 
ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, 
i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_cta_2 +llvm.func @nvvm_tcgen05_mma_scale_d_imm_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = 
#nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_disable_output_lane_cta_1(%d_tmem : 
!llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane : vector<4 x i32>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 
x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = 
%disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : 
(!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<4 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, 
%enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) 
{{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 2) + 
nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, 
ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, vector<8 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<4 x i32>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 
{{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 
{{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<4 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + 
nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, 
%enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64, vector<8 x i32>) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-shared.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-shared.mlir new file mode 100644 index 0000000000000..5c7eabee71b4e --- /dev/null +++ 
b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-shared.mlir @@ -0,0 +1,229 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + 
{kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = 
#nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, 
%a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : 
(!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr 
addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, 
!llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, 
i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, 
%spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, 
ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, 
%spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) 
{{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + 
nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, 
%idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 
{{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) 
{{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-tensor.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-tensor.mlir new file mode 100644 index 0000000000000..3200411aee213 --- /dev/null +++ 
b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-tensor.mlir @@ -0,0 +1,229 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, 
%b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = 
#nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_mxf8f6f4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 
{{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + 
nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, 
%b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 
{{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) 
{{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, 
%enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_mxf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr 
addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + 
nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) 
{{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr 
addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_mxf4nvf4_block_scale_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %scale_a: !llvm.ptr<6>, %scale_b : !llvm.ptr<6>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + 
{kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = 
#nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, 
collectorOp = #nvvm.tcgen05_mma_collectorop, blockScale = #nvvm.tcgen05_mma_block_scale} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp.block_scale %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %scale_a, %scale_b + {kind = #nvvm.tcgen05_mma_block_scale_kind, ctaGroup = #nvvm.cta_group, blockScale = #nvvm.tcgen05_mma_block_scale, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-shared.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-shared.mlir new file mode 100644 index 0000000000000..96044cf669d63 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-shared.mlir @@ -0,0 +1,442 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, 
i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) 
{{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, 
%spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 
{{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = 
#nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 
{{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + llvm.return +} + + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 
{{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, 
%idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call 
void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + llvm.return +} + +// CHECK-LABEL: 
@nvvm_tcgen05_mma_sp_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane : vector<4 x i32>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma.sp 
%d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr 
addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp 
= #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr 
addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp 
= #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, 
%enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<4 x i32>, %spmetadata: !llvm.ptr<6>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, 
%enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, 
i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = 
#nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>, %spmetadata: !llvm.ptr<6>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane 
+ {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr 
addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-tensor.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-tensor.mlir new file mode 100644 index 0000000000000..709beb0508bb8 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-tensor.mlir @@ -0,0 +1,634 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : 
(!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, 
i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, 
ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) 
{{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 3) + 
nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) 
{{%[0-9]+}}, i32 2, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = 
#nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) 
{{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 
2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = 
#nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, 
!llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + llvm.return +} + + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) 
{{%[0-9]+}}, i64 0, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr 
addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, 
i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = 
#nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = 
#nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, 
%idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane : vector<4 x i32>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : 
(!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = 
%disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 
{{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, 
%enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) 
{{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, 
collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<4 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, 
ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, 
i32 2, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, 
%spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr 
addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, vector<8 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<4 x i32>, %spmetadata: !llvm.ptr<6>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, 
%enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x 
i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 
{{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<4 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_sp_scale_d_imm_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>, %spmetadata: !llvm.ptr<6>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + 
nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, 
!llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 
{{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64, vector<8 x i32>) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-tensor.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-tensor.mlir new file mode 100644 index 0000000000000..798e311778beb --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-tensor.mlir @@ -0,0 +1,633 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_cta_1 +llvm.func @nvvm_tcgen05_mma_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=discard */ i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 1, /* collector=discard */ i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 
2, /* cta_group= */ i32 1, /* collector=discard */ i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 1, /* collector=discard */ i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=lastuse */ i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 1, /* collector=lastuse */ i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 1, /* collector=lastuse */ i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} 
: (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 1, /* collector=lastuse */ i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, 
!llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=fill */ i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, 
%idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 1, /* collector=fill */ i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 1, /* collector=fill */ i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 1, /* collector=fill */ i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=use */ i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, 
!llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 1, /* collector=use */ i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 1, /* collector=use */ i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 1, /* collector=use */ i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_cta_2 +llvm.func @nvvm_tcgen05_mma_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 2, /* collector=discard */ i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : 
(!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 2, /* collector=discard */ i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=discard */ i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 2, /* collector=discard */ i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 2, /* collector=lastuse */ i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 2, /* collector=lastuse */ 
i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=lastuse */ i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 2, /* collector=lastuse */ i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) 
{{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 2, /* collector=fill */ i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 2, /* collector=fill */ i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=fill */ i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 2, /* 
collector=fill */ i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f16 */ i32 0, /* cta_group= */ i32 2, /* collector=use */ i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=tf32 */ i32 1, /* cta_group= */ i32 2, /* collector=use */ i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=use */ i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, /* kind=i8 */ i32 3, /* cta_group= */ i32 2, /* collector=use */ i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, 
collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_cta_1 +llvm.func @nvvm_tcgen05_mma_scale_d_imm_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = 
#nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, 
i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_cta_2 +llvm.func @nvvm_tcgen05_mma_scale_d_imm_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) 
{{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : 
(!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 
{{%[0-9]+}}, i64 0, i32 1, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 0, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, i32 1, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane : vector<4 x i32>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr 
addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, 
%a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = 
%disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = 
#nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = 
#nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : 
(!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i32 3, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<4 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + 
nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = 
#nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // 
CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 2, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i32 3, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, vector<8 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_1 +llvm.func @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_1(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<4 x i32>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, 
i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) 
{{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: 
call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <4 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<4 x i32>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_2 +llvm.func @nvvm_tcgen05_mma_scale_d_imm_disable_output_lane_cta_2(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %adesc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %disableOutputLane: vector<8 x i32>) { + + %scale_d_imm = llvm.mlir.constant(0:i64) : i64 + + // CHECK: call void 
@llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, 
vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 0) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 1) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, 
collectorOp = #nvvm.tcgen05_mma_collectorop, aShift} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 2) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 0, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 0, <8 x i32> {{%[0-9]+}}, i32 1, i32 3) + nvvm.tcgen05.mma %d_tmem, %a_tmem, %b_desc, 
%idesc, %enable_input_d scale = %scale_d_imm mask = %disableOutputLane + {kind = #nvvm.tcgen05_mma_kind, ctaGroup = #nvvm.cta_group, collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64, vector<8 x i32>) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-shared.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-shared.mlir new file mode 100644 index 0000000000000..5f1aeb05888bd --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-shared.mlir @@ -0,0 +1,133 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws +llvm.func @nvvm_tcgen05_mma_ws(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : 
(!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) 
{{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws_zero_col_mask +llvm.func @nvvm_tcgen05_mma_ws_zero_col_mask(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %zero_col_mask: i64) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws 
%d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, 
%enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + // CHECK: 
call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, i64) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-shared.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-shared.mlir new file mode 100644 index 0000000000000..e390e350090ad --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-shared.mlir @@ -0,0 +1,133 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws_sp +llvm.func @nvvm_tcgen05_mma_ws_sp(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0, i32 0) + 
nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 
{{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, 
%enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws_sp_zero_col_mask +llvm.func @nvvm_tcgen05_mma_ws_sp_zero_col_mask(%d_tmem : !llvm.ptr<6>, %a_desc: i64, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>, %zero_col_mask: i64) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 
0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask 
+ {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp 
%d_tmem, %a_desc, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, i64) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-tensor.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-tensor.mlir new file mode 100644 index 0000000000000..f7ce5484803e9 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-tensor.mlir @@ -0,0 +1,133 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws_sp +llvm.func @nvvm_tcgen05_mma_ws_sp(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, 
!llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) 
{{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i32 3, i32 1, i32 1) + 
nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws_sp_zero_col_mask +llvm.func @nvvm_tcgen05_mma_ws_sp_zero_col_mask(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %b_desc: i64, %idesc: i32, %enable_input_d: i1, %spmetadata: !llvm.ptr<6>, %zero_col_mask: i64) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr 
addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, 
ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : 
(!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws.sp %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %spmetadata, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, !llvm.ptr<6>, i64) + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-tensor.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-tensor.mlir new file mode 100644 index 0000000000000..cecbb3fbd90af --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-tensor.mlir @@ -0,0 +1,133 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws +llvm.func @nvvm_tcgen05_mma_ws(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, %b_desc: i64, %idesc: i32, %enable_input_d: i1) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 0, 
i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = 
#nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1) + + llvm.return +} + +// CHECK-LABEL: @nvvm_tcgen05_mma_ws_zero_col_mask +llvm.func @nvvm_tcgen05_mma_ws_zero_col_mask(%d_tmem : !llvm.ptr<6>, %a_tmem: !llvm.ptr<6>, 
%b_desc: i64, %idesc: i32, %enable_input_d: i1, %zero_col_mask: i64) { + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 0, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer 
= #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 0) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 0, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) 
{{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 1, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 2, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + // CHECK: call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) {{%[0-9]+}}, ptr addrspace(6) {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 {{%[0-9]+}}, i64 {{%[0-9]+}}, i32 3, i32 1, i32 1) + nvvm.tcgen05.mma.ws %d_tmem, %a_tmem, %b_desc, %idesc, %enable_input_d, %zero_col_mask + {kind = #nvvm.tcgen05_mma_kind, + collectorBBuffer = #nvvm.tcgen05_mma_collectorb, + collectorOp = #nvvm.tcgen05_mma_collectorop} : (!llvm.ptr<6>, !llvm.ptr<6>, i64, i32, i1, i64) + + llvm.return +} diff --git a/mlir/unittests/Dialect/OpenACC/CMakeLists.txt b/mlir/unittests/Dialect/OpenACC/CMakeLists.txt index 177c8680b0040..c8c2bb96b0539 100644 --- a/mlir/unittests/Dialect/OpenACC/CMakeLists.txt +++ b/mlir/unittests/Dialect/OpenACC/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_unittest(MLIROpenACCTests OpenACCOpsTest.cpp + OpenACCOpsInterfacesTest.cpp OpenACCUtilsTest.cpp ) mlir_target_link_libraries(MLIROpenACCTests diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCOpsInterfacesTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCOpsInterfacesTest.cpp new file mode 100644 index 
0000000000000..261f5c513ea24 --- /dev/null +++ b/mlir/unittests/Dialect/OpenACC/OpenACCOpsInterfacesTest.cpp @@ -0,0 +1,95 @@ +//===- OpenACCOpsInterfacesTest.cpp - Unit tests for OpenACC interfaces --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/OpenACC/OpenACC.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OwningOpRef.h" +#include "gtest/gtest.h" + +using namespace mlir; +using namespace mlir::acc; + +//===----------------------------------------------------------------------===// +// Test Fixture +//===----------------------------------------------------------------------===// + +class OpenACCOpsInterfacesTest : public ::testing::Test { +protected: + OpenACCOpsInterfacesTest() + : context(), builder(&context), loc(UnknownLoc::get(&context)) { + context.loadDialect(); + } + + MLIRContext context; + OpBuilder builder; + Location loc; +}; + +//===----------------------------------------------------------------------===// +// GlobalVariableOpInterface Tests +//===----------------------------------------------------------------------===// + +TEST_F(OpenACCOpsInterfacesTest, GlobalVariableOpInterfaceNonConstant) { + // Test that a non-constant global returns false for isConstant() + + auto memrefType = MemRefType::get({10}, builder.getF32Type()); + OwningOpRef globalOp = memref::GlobalOp::create( + builder, loc, + /*sym_name=*/builder.getStringAttr("mutable_global"), + /*sym_visibility=*/builder.getStringAttr("private"), + /*type=*/TypeAttr::get(memrefType), + /*initial_value=*/Attribute(), + /*constant=*/UnitAttr(), + /*alignment=*/IntegerAttr()); + + auto globalVarIface = + 
dyn_cast(globalOp->getOperation()); + ASSERT_TRUE(globalVarIface != nullptr); + EXPECT_FALSE(globalVarIface.isConstant()); +} + +TEST_F(OpenACCOpsInterfacesTest, GlobalVariableOpInterfaceConstant) { + // Test that a constant global returns true for isConstant() + + auto memrefType = MemRefType::get({5}, builder.getI32Type()); + OwningOpRef constantGlobalOp = memref::GlobalOp::create( + builder, loc, + /*sym_name=*/builder.getStringAttr("constant_global"), + /*sym_visibility=*/builder.getStringAttr("public"), + /*type=*/TypeAttr::get(memrefType), + /*initial_value=*/Attribute(), + /*constant=*/builder.getUnitAttr(), + /*alignment=*/IntegerAttr()); + + auto globalVarIface = + dyn_cast(constantGlobalOp->getOperation()); + ASSERT_TRUE(globalVarIface != nullptr); + EXPECT_TRUE(globalVarIface.isConstant()); +} + +//===----------------------------------------------------------------------===// +// AddressOfGlobalOpInterface Tests +//===----------------------------------------------------------------------===// + +TEST_F(OpenACCOpsInterfacesTest, AddressOfGlobalOpInterfaceGetSymbol) { + // Test that getSymbol() returns the correct symbol reference + + auto memrefType = MemRefType::get({5}, builder.getI32Type()); + const auto *symbolName = "test_global_symbol"; + + OwningOpRef getGlobalOp = memref::GetGlobalOp::create( + builder, loc, memrefType, FlatSymbolRefAttr::get(&context, symbolName)); + + auto addrOfGlobalIface = + dyn_cast(getGlobalOp->getOperation()); + ASSERT_TRUE(addrOfGlobalIface != nullptr); + EXPECT_EQ(addrOfGlobalIface.getSymbol().getLeafReference(), symbolName); +} diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt index d972402d03a7d..2989b6f400bd4 100644 --- a/offload/CMakeLists.txt +++ b/offload/CMakeLists.txt @@ -106,18 +106,18 @@ else() set(OPENMP_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}") if (NOT MSVC) - set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) - set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++) + 
set(OPENMP_TEST_C_COMPILER ${LLVM_TOOLS_BINARY_DIR}/clang) + set(OPENMP_TEST_CXX_COMPILER ${LLVM_TOOLS_BINARY_DIR}/clang++) else() - set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang.exe) - set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++.exe) + set(OPENMP_TEST_C_COMPILER ${LLVM_TOOLS_BINARY_DIR}/clang.exe) + set(OPENMP_TEST_CXX_COMPILER ${LLVM_TOOLS_BINARY_DIR}/clang++.exe) endif() # Check for flang if (NOT MSVC) - set(OPENMP_TEST_Fortran_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/flang) + set(OPENMP_TEST_Fortran_COMPILER ${LLVM_TOOLS_BINARY_DIR}/flang) else() - set(OPENMP_TEST_Fortran_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/flang.exe) + set(OPENMP_TEST_Fortran_COMPILER ${LLVM_TOOLS_BINARY_DIR}/flang.exe) endif() # Set fortran test compiler if flang is found diff --git a/offload/cmake/OpenMPTesting.cmake b/offload/cmake/OpenMPTesting.cmake index f8c892870a3d5..c9b3649e9a9cf 100644 --- a/offload/cmake/OpenMPTesting.cmake +++ b/offload/cmake/OpenMPTesting.cmake @@ -79,9 +79,9 @@ else() message(WARNING "The check targets will not be available!") set(ENABLE_CHECK_TARGETS FALSE) else() - set(OPENMP_FILECHECK_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/FileCheck) + set(OPENMP_FILECHECK_EXECUTABLE ${LLVM_TOOLS_BINARY_DIR}/FileCheck) endif() - set(OPENMP_NOT_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/not) + set(OPENMP_NOT_EXECUTABLE ${LLVM_TOOLS_BINARY_DIR}/not) endif() set(OFFLOAD_DEVICE_INFO_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-offload-device-info) set(OFFLOAD_TBLGEN_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/offload-tblgen) diff --git a/offload/test/lit.site.cfg.in b/offload/test/lit.site.cfg.in index 00f4e2b74a5b0..c8ba45c9683e2 100644 --- a/offload/test/lit.site.cfg.in +++ b/offload/test/lit.site.cfg.in @@ -1,6 +1,6 @@ @AUTO_GEN_COMMENT@ -config.bin_llvm_tools_dir = "@LLVM_RUNTIME_OUTPUT_INTDIR@" +config.bin_llvm_tools_dir = "@LLVM_TOOLS_BINARY_DIR@" config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" config.test_cxx_compiler = 
"@OPENMP_TEST_CXX_COMPILER@" config.test_fortran_compiler="@OPENMP_TEST_Fortran_COMPILER@" diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt index dde29cbed47fe..32a854d68f056 100644 --- a/openmp/CMakeLists.txt +++ b/openmp/CMakeLists.txt @@ -83,11 +83,11 @@ else() "Path where OpenMP config should be installed") if (NOT MSVC) - set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) - set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++) + set(OPENMP_TEST_C_COMPILER ${LLVM_TOOLS_BINARY_DIR}/clang) + set(OPENMP_TEST_CXX_COMPILER ${LLVM_TOOLS_BINARY_DIR}/clang++) else() - set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang.exe) - set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++.exe) + set(OPENMP_TEST_C_COMPILER ${LLVM_TOOLS_BINARY_DIR}/clang.exe) + set(OPENMP_TEST_CXX_COMPILER ${LLVM_TOOLS_BINARY_DIR}/clang++.exe) endif() # Set fortran test compiler if flang is found diff --git a/openmp/cmake/OpenMPTesting.cmake b/openmp/cmake/OpenMPTesting.cmake index 60280b7ed4893..262ea968c8351 100644 --- a/openmp/cmake/OpenMPTesting.cmake +++ b/openmp/cmake/OpenMPTesting.cmake @@ -68,9 +68,9 @@ else() message(WARNING "The check targets will not be available!") set(ENABLE_CHECK_TARGETS FALSE) else() - set(OPENMP_FILECHECK_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/FileCheck) + set(OPENMP_FILECHECK_EXECUTABLE ${LLVM_TOOLS_BINARY_DIR}/FileCheck) endif() - set(OPENMP_NOT_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/not) + set(OPENMP_NOT_EXECUTABLE ${LLVM_TOOLS_BINARY_DIR}/not) endif() # Macro to extract information about compiler from file. (no own scope) diff --git a/orc-rt/docs/Design.md b/orc-rt/docs/Design.md new file mode 100644 index 0000000000000..45d42d53c809e --- /dev/null +++ b/orc-rt/docs/Design.md @@ -0,0 +1,119 @@ +# ORC Runtime Design + +The ORC runtime provides APIs for *executor* processes in an ORC JIT session +(as opposed to the LLVM ORC libraries which provide APIs for *controller* +processes). 
This includes support for both JIT'd code itself, and for users +of JIT'd code. + +## Background + +LLVM's On Request Compilation (ORC) APIs support cross-process loading of JIT'd +code. We call the process that defines and links the JIT'd code the *controller* +and the process that executes JIT'd code the *executor*. Controller processes +will link LLVM's ORC library, and construct a JIT'd program using an +llvm::orc::ExecutionSession instance (typically through a convenience wrapper +like llvm::orc::LLJIT). Executor processes construct an `orc_rt::Session` +object to manage resources for, and access to, JIT'd code within the executor +process. + +## APIs + +### Session + +The Session object is the root object for a JIT'd program. It owns the +ResourceManager instances that manage resources supporting JIT'd code (e.g. +JIT'd memory, unwind info registrations, dynamic library handles, etc.). + +The Session object must be constructed prior to adding any JIT'd code, and must +outlive execution of any JIT'd code. + +An executor may have more than one Session object, in which case each Session +object must outlive execution of any JIT'd code added to that specific session. + +### ControllerAccess + +ControllerAccess objects support bidirectional RPC between JIT'd code in the +executor and the ExecutionSession in the controller. + +Calls in both directions are to "wrapper functions" with a fixed signature (a +function that takes a blob of bytes and returns a blob of bytes as its result). +ControllerAccess objects cannot generally assume anything about the format of +the bytes being sent (their interpretation is up to the called function). The +RPC is not fully symmetric: Calls from the controller to the executor specify +wrapper function *addresses* (i.e. the controller can invoke any code in the +executor). Calls from the executor to the controller specify *tags*, which are +addresses in the executor process that are associated with handlers in the +controller. 
This ensures that the executing process can only call deliberately +exposed entry points in the controller. + +ControllerAccess objects may be detached before the session ends, at which point +JIT'd code may continue executing, but will receive no further calls from the +controller and can make no further calls to the controller. + +### ResourceManager + +`ResourceManager` is an interface for classes that manage resources that support +a JIT'd program, for example memory or loaded dylib handles. It provides two +operations: `detach` and `shutdown`. The `shutdown` operation will be called at +`Session` destruction time. The `detach` operation may be called if the +controller detaches: since this means that no further requests for resource +allocation or release will occur prior to the end of the Session, +ResourceManagers may implement this operation to abandon any fine-grained +tracking or pre-reserved resources (e.g. address space). + +### TaskDispatcher + +Runs Tasks within the ORC runtime. In particular, calls originating from the +controller (via ControllerAccess) will be dispatched as Tasks. + +TaskDispatchers are responsible for ensuring that all dispatched Tasks have +completed or been destroyed during Session shutdown. + +### WrapperFunction + +A wrapper function is any function with the following C signature: + +```c +void (orc_rt_SessionRef Session, uint64_t CallId, + orc_rt_WrapperFunctionReturn Return, + orc_rt_WrapperFunctionBuffer ArgBytes); +``` + +where `orc_rt_WrapperFunctionReturn` and `orc_rt_WrapperFunctionBuffer` are +defined as: + +```c +typedef struct { + orc_rt_WrapperFunctionBufferDataUnion Data; + size_t Size; +} orc_rt_WrapperFunctionBuffer; + +/** + * Asynchronous return function for an orc-rt wrapper function. 
+ */ +typedef void (*orc_rt_WrapperFunctionReturn)( + orc_rt_SessionRef Session, uint64_t CallId, + orc_rt_WrapperFunctionBuffer ResultBytes); +``` + +The orc_rt::WrapperFunction class provides APIs for implementing and calling +wrapper functions. + +### SPSWrapperFunction + +An SPS wrapper function is a wrapper function that uses the +SimplePackedSerialization scheme (see documentation in +orc-rt/include/orc-rt/SimplePackedSerialization.h). + +## TODO: + +Document... + +* C API +* Error handling +* RTTI +* ExecutorAddr / ExecutorAddrRange +* SimpleNativeMemoryMap +* Memory Access (unimplemented) +* Platform classes (unimplemented) +* Other utilities diff --git a/orc-rt/include/orc-rt/Session.h b/orc-rt/include/orc-rt/Session.h index 367cdb9a97b62..529aac6a2fadd 100644 --- a/orc-rt/include/orc-rt/Session.h +++ b/orc-rt/include/orc-rt/Session.h @@ -75,21 +75,22 @@ class Session { } private: - void shutdownNext(Error Err, - std::vector> RemainingRMs); - + struct ShutdownInfo { + bool Complete = false; + std::condition_variable CompleteCV; + std::vector> ResourceMgrs; + std::vector OnCompletes; + }; + + void shutdownNext(Error Err); void shutdownComplete(); std::unique_ptr Dispatcher; ErrorReporterFn ReportError; - enum class SessionState { Running, ShuttingDown, Shutdown }; - std::mutex M; - SessionState State = SessionState::Running; - std::condition_variable StateCV; std::vector> ResourceMgrs; - std::vector ShutdownCallbacks; + std::unique_ptr SI; }; inline orc_rt_SessionRef wrap(Session *S) noexcept { diff --git a/orc-rt/lib/executor/Session.cpp b/orc-rt/lib/executor/Session.cpp index fafa13b1cbb08..3ee9bad60c5b9 100644 --- a/orc-rt/lib/executor/Session.cpp +++ b/orc-rt/lib/executor/Session.cpp @@ -17,66 +17,59 @@ namespace orc_rt { Session::~Session() { waitForShutdown(); } void Session::shutdown(OnShutdownCompleteFn OnShutdownComplete) { - std::vector> ToShutdown; - { std::scoped_lock Lock(M); - ShutdownCallbacks.push_back(std::move(OnShutdownComplete)); - - // 
If somebody else has already called shutdown then there's nothing further - // for us to do here. - if (State >= SessionState::ShuttingDown) + if (SI) { + SI->OnCompletes.push_back(std::move(OnShutdownComplete)); return; + } - State = SessionState::ShuttingDown; - std::swap(ResourceMgrs, ToShutdown); + SI = std::make_unique(); + SI->OnCompletes.push_back(std::move(OnShutdownComplete)); + std::swap(SI->ResourceMgrs, ResourceMgrs); } - shutdownNext(Error::success(), std::move(ToShutdown)); + shutdownNext(Error::success()); } void Session::waitForShutdown() { shutdown([]() {}); std::unique_lock Lock(M); - StateCV.wait(Lock, [&]() { return State == SessionState::Shutdown; }); + SI->CompleteCV.wait(Lock, [&]() { return SI->Complete; }); } -void Session::shutdownNext( - Error Err, std::vector> RemainingRMs) { +void Session::shutdownNext(Error Err) { if (Err) reportError(std::move(Err)); - if (RemainingRMs.empty()) + if (SI->ResourceMgrs.empty()) return shutdownComplete(); - auto NextRM = std::move(RemainingRMs.back()); - RemainingRMs.pop_back(); - NextRM->shutdown( - [this, RemainingRMs = std::move(RemainingRMs)](Error Err) mutable { - shutdownNext(std::move(Err), std::move(RemainingRMs)); - }); + // Get the next ResourceManager to shut down. 
+ auto NextRM = std::move(SI->ResourceMgrs.back()); + SI->ResourceMgrs.pop_back(); + NextRM->shutdown([this](Error Err) { shutdownNext(std::move(Err)); }); } void Session::shutdownComplete() { std::unique_ptr TmpDispatcher; - std::vector TmpShutdownCallbacks; { std::lock_guard Lock(M); TmpDispatcher = std::move(Dispatcher); - TmpShutdownCallbacks = std::move(ShutdownCallbacks); } TmpDispatcher->shutdown(); - for (auto &OnShutdownComplete : TmpShutdownCallbacks) + for (auto &OnShutdownComplete : SI->OnCompletes) OnShutdownComplete(); { std::lock_guard Lock(M); - State = SessionState::Shutdown; + SI->Complete = true; } - StateCV.notify_all(); + + SI->CompleteCV.notify_all(); } } // namespace orc_rt diff --git a/orc-rt/lib/executor/TaskDispatcher.cpp b/orc-rt/lib/executor/TaskDispatcher.cpp index 5f34627fb5150..9e42a66c2ea94 100644 --- a/orc-rt/lib/executor/TaskDispatcher.cpp +++ b/orc-rt/lib/executor/TaskDispatcher.cpp @@ -1,4 +1,4 @@ -//===- TaskDispatch.cpp ---------------------------------------------------===// +//===- TaskDispatcher.cpp -------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// Contains the implementation of APIs in the orc-rt/TaskDispatch.h header. +// Contains the implementation of APIs in the orc-rt/TaskDispatcher.h header. 
// //===----------------------------------------------------------------------===// diff --git a/orc-rt/lib/executor/ThreadPoolTaskDispatcher.cpp b/orc-rt/lib/executor/ThreadPoolTaskDispatcher.cpp index d6d301302220d..4bf7e5df69654 100644 --- a/orc-rt/lib/executor/ThreadPoolTaskDispatcher.cpp +++ b/orc-rt/lib/executor/ThreadPoolTaskDispatcher.cpp @@ -1,4 +1,4 @@ -//===- ThreadPoolTaskDispatch.cpp -----------------------------------------===// +//===- ThreadPoolTaskDispatcher.cpp ---------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// Contains the implementation of APIs in the orc-rt/ThreadPoolTaskDispatch.h +// Contains the implementation of APIs in the orc-rt/ThreadPoolTaskDispatcher.h // header. // //===----------------------------------------------------------------------===// diff --git a/revert_patches.txt b/revert_patches.txt index 9e465ba90ae6a..a3a76b6ac1e40 100644 --- a/revert_patches.txt +++ b/revert_patches.txt @@ -5,3 +5,6 @@ d57230c7 [AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions (#100485) breaks build of ROCmValidationSuite [C2y] Support WG14 N3457, the __COUNTER__ macro (#162662) --- +Shore will help land downstream +[AMDGPU] Adding instruction specific features (#167809) +--- diff --git a/runtimes/CMakeLists.txt b/runtimes/CMakeLists.txt index 007118d9c0d18..5220b9353fed7 100644 --- a/runtimes/CMakeLists.txt +++ b/runtimes/CMakeLists.txt @@ -77,11 +77,6 @@ if (NOT LLVM_FOUND) set(LLVM_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}) endif() -# Setting these variables will allow the sub-build to put their outputs into -# the library and bin directories of the top-level build. 
-set(LLVM_LIBRARY_OUTPUT_INTDIR ${LLVM_LIBRARY_DIR}) -set(LLVM_RUNTIME_OUTPUT_INTDIR ${LLVM_TOOLS_BINARY_DIR}) - # This variable makes sure that e.g. llvm-lit is found. set(LLVM_MAIN_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../llvm) set(LLVM_CMAKE_DIR ${LLVM_MAIN_SRC_DIR}/cmake/modules) @@ -95,10 +90,21 @@ include(CheckCXXCompilerFlag) # Determine whether we are in the runtimes/runtimes-bins directory of a # bootstrap build. set(LLVM_TREE_AVAILABLE OFF) -if (LLVM_LIBRARY_OUTPUT_INTDIR AND LLVM_RUNTIME_OUTPUT_INTDIR AND PACKAGE_VERSION) +if (LLVM_LIBRARY_DIR AND LLVM_TOOLS_BINARY_DIR AND PACKAGE_VERSION) set(LLVM_TREE_AVAILABLE ON) endif() +if(LLVM_TREE_AVAILABLE) + # Setting these variables will allow the sub-build to put their outputs into + # the library and bin directories of the top-level build. + set(LLVM_LIBRARY_OUTPUT_INTDIR ${LLVM_LIBRARY_DIR}) + set(LLVM_RUNTIME_OUTPUT_INTDIR ${LLVM_TOOLS_BINARY_DIR}) +else() + # Use own build directory for artifact output. + set(LLVM_LIBRARY_OUTPUT_INTDIR "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib${LLVM_LIBDIR_SUFFIX}") + set(LLVM_RUNTIME_OUTPUT_INTDIR "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin") +endif() + # CMake omits default compiler include paths, but in runtimes build, we use # -nostdinc and -nostdinc++ and control include paths manually so this behavior # is undesirable. 
Filtering CMAKE_{LANG}_IMPLICIT_INCLUDE_DIRECTORIES to remove diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index b027d82d98177..c936670168b94 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -2874,6 +2874,10 @@ llvm_target_lib_list = [lib for lib in [ ["-gen-subtarget"], "lib/Target/NVPTX/NVPTXGenSubtargetInfo.inc", ), + ( + ["-gen-sd-node-info"], + "lib/Target/NVPTX/NVPTXGenSDNodeInfo.inc", + ), ], }, { @@ -4554,7 +4558,7 @@ cc_binary( "-Wl,--export-dynamic-symbol=__gxx_personality_v0", "-Wl,--export-dynamic-symbol=__cxa_allocate_exception", "-Wl,--export-dynamic-symbol=__cxa_throw", - "-Wl,--export-dynamic-symbol=llvm_orc_registerJITLoaderGDBWrapper", + "-Wl,--export-dynamic-symbol=llvm_orc_registerJITLoaderGDBAllocAction", "-Wl,--export-dynamic-symbol=llvm_orc_registerEHFrameSectionWrapper", "-Wl,--export-dynamic-symbol=llvm_orc_deregisterEHFrameSectionWrapper", ], @@ -5104,7 +5108,7 @@ cc_binary( "-Wl,--export-dynamic-symbol=__gxx_personality_v0", "-Wl,--export-dynamic-symbol=__cxa_allocate_exception", "-Wl,--export-dynamic-symbol=__cxa_throw", - "-Wl,--export-dynamic-symbol=llvm_orc_registerJITLoaderGDBWrapper", + "-Wl,--export-dynamic-symbol=llvm_orc_registerJITLoaderGDBAllocAction", ], }), stamp = 0, diff --git a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel index bfdf37b0c969b..0ecd704fe0a62 100644 --- a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel @@ -408,7 +408,7 @@ cc_test( "@platforms//os:macos": [], "@platforms//os:windows": [], "//conditions:default": [ - "-Wl,--export-dynamic-symbol=llvm_orc_registerJITLoaderGDBWrapper", + "-Wl,--export-dynamic-symbol=llvm_orc_registerJITLoaderGDBAllocAction", ], }), deps = [