Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
4cd8c11
[X86] Replace default _mm512_sqrt_pd/s/h implementations with generic…
RKSimon Nov 15, 2025
76c69ca
[ValueTracking] Bail out on non-immediate constant expressions (#168084)
dtcxzyw Nov 15, 2025
5613e4a
[mlir][emitc] Fix ineffective tests (#168197)
aniragil Nov 15, 2025
820daa5
[VPlan] Support VPWidenIntOrFpInduction in getSCEVExprForVPValue. (NFCI)
fhahn Nov 15, 2025
ea66d26
HIP non-RDC: enable new offload driver on Windows via linker wrapper …
yxsamliu Nov 15, 2025
20db716
[ValueTracking] Only check up to CtxIter in willNotFreeBetween.
fhahn Nov 15, 2025
8f6c7aa
[X86] Remove vector length (256 vs 512) distinction of AVX10 (#167736)
mikolaj-pirog Nov 15, 2025
9fa15ef
[AMDGPU] When shrinking and/or to bitset*, remove implicit scc def (#…
LU-JOHN Nov 15, 2025
d99c840
[llvm-pdbutil] Create DBI section headers in yaml2pdb (#166566)
Nerixyz Nov 15, 2025
59d2e93
[LV] Add test with to check different interleave counts for fmaxnum.
fhahn Nov 15, 2025
636e370
[Utils] Remove an unused local variable (NFC) (#168181)
kazutakahirata Nov 15, 2025
7a8237b
[llvm] Use llvm::copy (NFC) (#168182)
kazutakahirata Nov 15, 2025
3a7876d
[llvm] Delete pointers without null checks (NFC) (#168183)
kazutakahirata Nov 15, 2025
268ea1a
[Analysis] Remove a redundant cast (NFC) (#168184)
kazutakahirata Nov 15, 2025
63e059d
[llvm] Proofread *.rst (#168185)
kazutakahirata Nov 15, 2025
67f61df
[VPlan] Always set trip count when creating plan for unit tests (NFC).
fhahn Nov 15, 2025
33a7bb1
DAG: Use poison when legalizing scalar_to_vector results (#167751)
arsenm Nov 15, 2025
70349c1
DAG: Use poison in SplitVecRes_VP_LOAD_FF (#167753)
arsenm Nov 15, 2025
a4e7d15
[MLIR][Python] Add tests for nvvm barrier ops (#167976)
ashermancinelli Nov 15, 2025
358e9a5
[LP] Assign weights when peeling last iteration. (#166858)
mtrofin Nov 15, 2025
82214ff
[clang-doc] add throws comments to comment template (#150649)
evelez7 Nov 15, 2025
eb9d56c
[MLIR][Transform][Python] Expose applying named_sequences as a method…
rolfmorel Nov 15, 2025
b1b0be2
[ADT] Make DenseMapBase::moveFrom safer (NFC) (#168180)
kazutakahirata Nov 15, 2025
ff8ed4d
[mlir] Use llvm::copy (NFC) (#168213)
kazutakahirata Nov 15, 2025
d343913
[Analysis] Remove a redundant cast. (#168214)
kazutakahirata Nov 15, 2025
3705921
[CodeGen] add a command to force global merge
Zhenhang1213 Nov 15, 2025
700aa5e
[revert][CodeGen] add a command to force global merge (#168230)
Zhenhang1213 Nov 15, 2025
6b4fef0
[CI] Fix typo in CI Best Practices for the release branch names push …
nightlark Nov 15, 2025
1fd9c02
[mlir] Adopt cast function objects. NFC. (#168228)
kuhar Nov 15, 2025
95c93f4
Cleanups in AArch64 (#168025)
echristo Nov 15, 2025
f210fc1
[Clang] Add __builtin_bswapg (#162433)
clingfei Nov 15, 2025
f5b7376
[mlir][MemRef] Add UB as a dependent dialect and use `ub.poison` for …
fabianmcg Nov 15, 2025
d831f8d
[SelectionDAG] Fix AArch64 machine verifier bug when expanding LOOP_D…
AZero13 Nov 15, 2025
e009de2
[LV] Use VPlan pattern matching in adjustRecipesForReductions (NFC)
fhahn Nov 15, 2025
6cedafb
X86: Handle poison in buildFromShuffleMostly (#168218)
arsenm Nov 15, 2025
019e90f
[ADT] Group public functions in DenseMap.h (NFC) (#168239)
kazutakahirata Nov 15, 2025
edbf9e4
[mlir] Remove a redundant cast (NFC) (#168241)
kazutakahirata Nov 15, 2025
810ebe6
merge main into amd-staging
ronlieb Nov 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions clang-tools-extra/clang-doc/JSONGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,13 @@ static Object serializeComment(const CommentInfo &I, Object &Description) {
insertComment(Description, TextCommentsArray, "BriefComments");
else if (I.Name == "return")
insertComment(Description, TextCommentsArray, "ReturnComments");
else if (I.Name == "throws" || I.Name == "throw") {
json::Value ThrowsVal = Object();
auto &ThrowsObj = *ThrowsVal.getAsObject();
ThrowsObj["Exception"] = I.Args.front();
ThrowsObj["Children"] = TextCommentsArray;
insertComment(Description, ThrowsVal, "ThrowsComments");
}
return Obj;
}

Expand Down
8 changes: 8 additions & 0 deletions clang-tools-extra/clang-doc/assets/comment-template.mustache
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,14 @@
</div>
{{/CodeComments}}
{{/HasCodeComments}}
{{#HasThrowsComments}}
<h3>Throws</h3>
{{#ThrowsComments}}
<div>
<b>{{Exception}}</b> {{#Children}}{{TextComment}}{{/Children}}
</div>
{{/ThrowsComments}}
{{/HasThrowsComments}}
{{#BlockCommandComment}}
<div class="block-command-comment__command">
<div class="block-command-command">
Expand Down
5 changes: 4 additions & 1 deletion clang-tools-extra/test/clang-doc/basic-project.mustache.test
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,10 @@ HTML-CALC: </div>
HTML-CALC: <h3>Returns</h3>
HTML-CALC: <p> double The result of a / b.</p>
HTML-CALC: <p></p>
HTML-CALC: </div>
HTML-CALC: <h3>Throws</h3>
HTML-CALC: <div>
HTML-CALC: <b>std::invalid_argument</b> if b is zero.
HTML-CALC: </div>
HTML-CALC: </div>
HTML-CALC: </div>
HTML-CALC: <div class="delimiter-container">
Expand Down
4 changes: 4 additions & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,10 @@ Non-comprehensive list of changes in this release
allocator-level heap organization strategies. A feature to instrument all
allocation functions with a token ID can be enabled via the
``-fsanitize=alloc-token`` flag.

- A new generic byte swap builtin function ``__builtin_bswapg`` that extends the existing
__builtin_bswap{16,32,64} function family to support all standard integer types.

- A builtin ``__builtin_infer_alloc_token(<args>, ...)`` is provided to allow
compile-time querying of allocation token IDs, where the builtin arguments
mirror those normally passed to an allocation function.
Expand Down
6 changes: 6 additions & 0 deletions clang/include/clang/Basic/Builtins.td
Original file line number Diff line number Diff line change
Expand Up @@ -755,6 +755,12 @@ def BSwap : Builtin, Template<["unsigned short", "uint32_t", "uint64_t"],
let Prototype = "T(T)";
}

def BSwapg : Builtin {
let Spellings = ["__builtin_bswapg"];
let Attributes = [NoThrow, Const, Constexpr, CustomTypeChecking];
let Prototype = "int(...)";
}

def Bitreverse : BitInt8_16_32_64BuiltinsTemplate, Builtin {
let Spellings = ["__builtin_bitreverse"];
let Attributes = [NoThrow, Const, Constexpr];
Expand Down
3 changes: 3 additions & 0 deletions clang/include/clang/Basic/DiagnosticSemaKinds.td
Original file line number Diff line number Diff line change
Expand Up @@ -12967,6 +12967,9 @@ def err_builtin_invalid_arg_type: Error<
"%plural{0:|: }3"
"%plural{[0,3]:type|:types}1 (was %4)">;

def err_bswapg_invalid_bit_width : Error<
"_BitInt type %0 (%1 bits) must be a multiple of 16 bits for byte swapping">;

def err_builtin_trivially_relocate_invalid_arg_type: Error <
"first%select{||| and second}0 argument%select{|||s}0 to "
"'__builtin_trivially_relocate' must be"
Expand Down
9 changes: 5 additions & 4 deletions clang/lib/AST/ByteCode/InterpBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -972,9 +972,10 @@ static bool interp__builtin_bswap(InterpState &S, CodePtr OpPC,
const InterpFrame *Frame,
const CallExpr *Call) {
const APSInt &Val = popToAPSInt(S, Call->getArg(0));
assert(Val.getActiveBits() <= 64);

pushInteger(S, Val.byteSwap(), Call->getType());
if (Val.getBitWidth() == 8)
pushInteger(S, Val, Call->getType());
else
pushInteger(S, Val.byteSwap(), Call->getType());
return true;
}

Expand Down Expand Up @@ -3687,7 +3688,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case Builtin::BI__builtin_elementwise_ctzg:
return interp__builtin_elementwise_countzeroes(S, OpPC, Frame, Call,
BuiltinID);

case Builtin::BI__builtin_bswapg:
case Builtin::BI__builtin_bswap16:
case Builtin::BI__builtin_bswap32:
case Builtin::BI__builtin_bswap64:
Expand Down
4 changes: 3 additions & 1 deletion clang/lib/AST/ExprConstant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15306,13 +15306,15 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,

return Success(Val.reverseBits(), E);
}

case Builtin::BI__builtin_bswapg:
case Builtin::BI__builtin_bswap16:
case Builtin::BI__builtin_bswap32:
case Builtin::BI__builtin_bswap64: {
APSInt Val;
if (!EvaluateInteger(E->getArg(0), Val, Info))
return false;
if (Val.getBitWidth() == 8)
return Success(Val, E);

return Success(Val.byteSwap(), E);
}
Expand Down
17 changes: 8 additions & 9 deletions clang/lib/Analysis/BodyFarm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -293,15 +293,14 @@ static CallExpr *create_call_once_lambda_call(ASTContext &C, ASTMaker M,
FunctionDecl *callOperatorDecl = CallbackDecl->getLambdaCallOperator();
assert(callOperatorDecl != nullptr);

DeclRefExpr *callOperatorDeclRef =
DeclRefExpr::Create(/* Ctx =*/ C,
/* QualifierLoc =*/ NestedNameSpecifierLoc(),
/* TemplateKWLoc =*/ SourceLocation(),
const_cast<FunctionDecl *>(callOperatorDecl),
/* RefersToEnclosingVariableOrCapture=*/ false,
/* NameLoc =*/ SourceLocation(),
/* T =*/ callOperatorDecl->getType(),
/* VK =*/ VK_LValue);
DeclRefExpr *callOperatorDeclRef = DeclRefExpr::Create(
/* Ctx =*/C,
/* QualifierLoc =*/NestedNameSpecifierLoc(),
/* TemplateKWLoc =*/SourceLocation(), callOperatorDecl,
/* RefersToEnclosingVariableOrCapture=*/false,
/* NameLoc =*/SourceLocation(),
/* T =*/callOperatorDecl->getType(),
/* VK =*/VK_LValue);

return CXXOperatorCallExpr::Create(
/*AstContext=*/C, OO_Call, callOperatorDeclRef,
Expand Down
13 changes: 13 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3618,6 +3618,19 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
Builder.CreateArithmeticFence(ArgValue, ConvertType(ArgType)));
return RValue::get(ArgValue);
}
case Builtin::BI__builtin_bswapg: {
Value *ArgValue = EmitScalarExpr(E->getArg(0));
llvm::IntegerType *IntTy = cast<llvm::IntegerType>(ArgValue->getType());
assert(IntTy && "LLVM's __builtin_bswapg only supports integer variants");
assert(((IntTy->getBitWidth() % 16 == 0 && IntTy->getBitWidth() != 0) ||
IntTy->getBitWidth() == 8) &&
"LLVM's __builtin_bswapg only supports integer variants that has a "
"multiple of 16 bits as well as a single byte");
if (IntTy->getBitWidth() == 8)
return RValue::get(ArgValue);
return RValue::get(
emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::bswap));
}
case Builtin::BI__builtin_bswap16:
case Builtin::BI__builtin_bswap32:
case Builtin::BI__builtin_bswap64:
Expand Down
3 changes: 1 addition & 2 deletions clang/lib/CodeGen/CGCUDANV.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1280,8 +1280,7 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
return nullptr;
}
if (CGM.getLangOpts().OffloadViaLLVM ||
(CGM.getLangOpts().OffloadingNewDriver &&
(CGM.getLangOpts().HIP || RelocatableDeviceCode)))
(CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
createOffloadingEntries();
else
return makeModuleCtorFunction();
Expand Down
46 changes: 26 additions & 20 deletions clang/lib/Driver/Driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4427,10 +4427,6 @@ void Driver::BuildDefaultActions(Compilation &C, DerivedArgList &Args,
options::OPT_no_offload_new_driver,
C.isOffloadingHostKind(Action::OFK_Cuda));

bool HIPNoRDC =
C.isOffloadingHostKind(Action::OFK_HIP) &&
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);

// Builder to be used to build offloading actions.
std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
!UseNewOffloadingDriver
Expand Down Expand Up @@ -4569,7 +4565,7 @@ void Driver::BuildDefaultActions(Compilation &C, DerivedArgList &Args,
// Check if this Linker Job should emit a static library.
if (ShouldEmitStaticLibrary(Args)) {
LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
} else if ((UseNewOffloadingDriver && !HIPNoRDC) ||
} else if (UseNewOffloadingDriver ||
Args.hasArg(options::OPT_offload_link)) {
LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
Expand Down Expand Up @@ -4906,20 +4902,6 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
<< "-fhip-emit-relocatable"
<< "--offload-device-only";

// For HIP non-rdc non-device-only compilation, create a linker wrapper
// action for each host object to link, bundle and wrap device files in
// it.
if ((isa<AssembleJobAction>(HostAction) ||
(isa<BackendJobAction>(HostAction) &&
HostAction->getType() == types::TY_LTO_BC)) &&
HIPNoRDC && !offloadDeviceOnly()) {
ActionList AL{HostAction};
HostAction = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object);
HostAction->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
/*BoundArch=*/nullptr);
return HostAction;
}

// Don't build offloading actions if we do not have a compile action. If
// preprocessing only ignore embedding.
if (!(isa<CompileJobAction>(HostAction) ||
Expand Down Expand Up @@ -5084,6 +5066,21 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
DDep.add(*FatbinAction,
*C.getOffloadToolChains<Action::OFK_HIP>().first->second, nullptr,
Action::OFK_HIP);
} else if (HIPNoRDC) {
// Package all the offloading actions into a single output that can be
// embedded in the host and linked.
Action *PackagerAction =
C.MakeAction<OffloadPackagerJobAction>(OffloadActions, types::TY_Image);

// For HIP non-RDC compilation, wrap the device binary with linker wrapper
// before bundling with host code. Do not bind a specific GPU arch here,
// as the packaged image may contain entries for multiple GPUs.
ActionList AL{PackagerAction};
PackagerAction =
C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_HIP_FATBIN);
DDep.add(*PackagerAction,
*C.getOffloadToolChains<Action::OFK_HIP>().first->second,
/*BoundArch=*/nullptr, Action::OFK_HIP);
} else {
// Package all the offloading actions into a single output that can be
// embedded in the host and linked.
Expand Down Expand Up @@ -5215,6 +5212,14 @@ Action *Driver::ConstructPhaseAction(
return C.MakeAction<CompileJobAction>(Input, types::TY_LLVM_BC);
}
case phases::Backend: {
// Skip a redundant Backend phase for HIP device code when using the new
// offload driver, where mid-end is done in linker wrapper.
if (TargetDeviceOffloadKind == Action::OFK_HIP &&
Args.hasFlag(options::OPT_offload_new_driver,
options::OPT_no_offload_new_driver, false) &&
!offloadDeviceOnly())
return Input;

if (isUsingLTO() && TargetDeviceOffloadKind == Action::OFK_None) {
types::ID Output;
if (Args.hasArg(options::OPT_ffat_lto_objects) &&
Expand All @@ -5234,7 +5239,8 @@ Action *Driver::ConstructPhaseAction(
if (Args.hasArg(options::OPT_emit_llvm) ||
TargetDeviceOffloadKind == Action::OFK_SYCL ||
(((Input->getOffloadingToolChain() &&
Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
Input->getOffloadingToolChain()->getTriple().isAMDGPU() &&
TargetDeviceOffloadKind != Action::OFK_None) ||
TargetDeviceOffloadKind == Action::OFK_HIP) &&
((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
false) ||
Expand Down
16 changes: 8 additions & 8 deletions clang/lib/Driver/ToolChains/Clang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7816,7 +7816,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(CudaDeviceInput->getFilename());
} else if (!HostOffloadingInputs.empty()) {
if (IsCuda && !IsRDCMode) {
if ((IsCuda || IsHIP) && !IsRDCMode) {
assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
Expand Down Expand Up @@ -9286,7 +9286,7 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
auto ShouldForward = [&](const llvm::DenseSet<unsigned> &Set, Arg *A,
const ToolChain &TC) {
// CMake hack to avoid printing verbose informatoin for HIP non-RDC mode.
if (A->getOption().matches(OPT_v) && JA.getType() == types::TY_Object)
if (A->getOption().matches(OPT_v) && JA.getType() == types::TY_HIP_FATBIN)
return false;
return (Set.contains(A->getOption().getID()) ||
(A->getOption().getGroup().isValid() &&
Expand Down Expand Up @@ -9390,7 +9390,7 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
// non-RDC mode compilation. This confuses default CMake implicit linker
// argument parsing when the language is set to HIP and the system linker is
// also `ld.lld`.
if (Args.hasArg(options::OPT_v) && JA.getType() != types::TY_Object)
if (Args.hasArg(options::OPT_v) && JA.getType() != types::TY_HIP_FATBIN)
CmdArgs.push_back("--wrapper-verbose");
if (Arg *A = Args.getLastArg(options::OPT_cuda_path_EQ))
CmdArgs.push_back(
Expand Down Expand Up @@ -9462,14 +9462,14 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,

// We use action type to differentiate two use cases of the linker wrapper.
// TY_Image for normal linker wrapper work.
// TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable
// object.
assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image);
if (JA.getType() == types::TY_Object) {
// TY_HIP_FATBIN for HIP fno-gpu-rdc emitting a fat binary without wrapping.
assert(JA.getType() == types::TY_HIP_FATBIN ||
JA.getType() == types::TY_Image);
if (JA.getType() == types::TY_HIP_FATBIN) {
CmdArgs.push_back("--emit-fatbin-only");
CmdArgs.append({"-o", Output.getFilename()});
for (auto Input : Inputs)
CmdArgs.push_back(Input.getFilename());
CmdArgs.push_back("-r");
} else
for (const char *LinkArg : LinkCommand->getArguments())
CmdArgs.push_back(LinkArg);
Expand Down
42 changes: 14 additions & 28 deletions clang/lib/Headers/avx512fintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -1450,26 +1450,19 @@ _mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
(__v8df)_mm512_sqrt_round_pd((A), (R)), \
(__v8df)_mm512_setzero_pd()))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_sqrt_pd(__m512d __A)
{
return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A,
_MM_FROUND_CUR_DIRECTION);
static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_sqrt_pd(__m512d __A) {
return (__m512d)__builtin_elementwise_sqrt((__v8df)__A);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
return (__m512d)__builtin_ia32_selectpd_512(__U,
(__v8df)_mm512_sqrt_pd(__A),
_mm512_mask_sqrt_pd(__m512d __W, __mmask8 __U, __m512d __A) {
return (__m512d)__builtin_ia32_selectpd_512(__U, (__v8df)_mm512_sqrt_pd(__A),
(__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
{
return (__m512d)__builtin_ia32_selectpd_512(__U,
(__v8df)_mm512_sqrt_pd(__A),
_mm512_maskz_sqrt_pd(__mmask8 __U, __m512d __A) {
return (__m512d)__builtin_ia32_selectpd_512(__U, (__v8df)_mm512_sqrt_pd(__A),
(__v8df)_mm512_setzero_pd());
}

Expand All @@ -1486,26 +1479,19 @@ _mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
(__v16sf)_mm512_sqrt_round_ps((A), (R)), \
(__v16sf)_mm512_setzero_ps()))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_sqrt_ps(__m512 __A)
{
return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A,
_MM_FROUND_CUR_DIRECTION);
static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_sqrt_ps(__m512 __A) {
return (__m512)__builtin_elementwise_sqrt((__v16sf)__A);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
{
return (__m512)__builtin_ia32_selectps_512(__U,
(__v16sf)_mm512_sqrt_ps(__A),
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A) {
return (__m512)__builtin_ia32_selectps_512(__U, (__v16sf)_mm512_sqrt_ps(__A),
(__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
{
return (__m512)__builtin_ia32_selectps_512(__U,
(__v16sf)_mm512_sqrt_ps(__A),
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_ps(__mmask16 __U, __m512 __A) {
return (__m512)__builtin_ia32_selectps_512(__U, (__v16sf)_mm512_sqrt_ps(__A),
(__v16sf)_mm512_setzero_ps());
}

Expand Down
Loading