Skip to content

Commit 87d8a16

Browse files
author
pytorchbot
committed
2025-07-01 nightly release (8325430)
1 parent 39dc234 commit 87d8a16

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

42 files changed

+700
-1111
lines changed

.github/scripts/fbgemm_gpu_build.bash

Lines changed: 22 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -167,42 +167,31 @@ __configure_fbgemm_gpu_build_docs () {
167167
__configure_fbgemm_gpu_build_rocm () {
168168
local fbgemm_variant_targets="$1"
169169

170-
# Fetch available ROCm architectures on the machine
170+
# By default, we build for a limited number of target architectures to save on
171+
# build time. This list needs to be updated if the CI ROCm machines have
172+
# different hardware.
173+
#
174+
# Target architecture mapping and ROCm compatibility table can be found at:
175+
# https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html
176+
# https://rocm.docs.amd.com/en/latest/compatibility/compatibility-matrix.html
177+
171178
if [ "$fbgemm_variant_targets" != "" ]; then
172-
echo "[BUILD] ROCm targets have been manually provided: ${fbgemm_variant_targets}"
179+
# If targets are manually supplied, override
180+
echo "[BUILD] Using the user-supplied ROCm targets ..."
173181
local arch_list="${fbgemm_variant_targets}"
174182

183+
elif [ -n "${BUILD_FROM_NOVA+x}" ]; then
184+
# If BUILD_FROM_NOVA is set (regardless of 0 or 1 - some steps in Nova have
185+
# the value set to 0), we are building in Nova. Nova machines take much
186+
# longer to build FBGEMM_GPU for ROCm, so we have to limit to just the
187+
# latest model.
188+
echo "[BUILD] Building in Nova environment, ignoring the provided PYTORCH_ROCM_ARCH list and limiting ROCm targets ..."
189+
local arch_list="gfx942"
190+
175191
else
176-
if which rocminfo; then
177-
# shellcheck disable=SC2155
178-
local arch_list=$(rocminfo | grep -o -m 1 'gfx.*')
179-
echo "[BUILD] Architectures list from rocminfo: ${arch_list}"
180-
181-
if [ "$arch_list" == "" ]; then
182-
# It is possible to build FBGEMM_GPU-ROCm on a machine without AMD
183-
# cards, in which case the arch_list will be empty.
184-
echo "[BUILD] rocminfo did not return anything valid!"
185-
186-
# By default, we build for a limited number of architectures to save on
187-
# build time. This list needs to be updated if the CI ROCm machines
188-
# have different hardware.
189-
#
190-
# Architecture mapping can be found at:
191-
# https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html
192-
if [ -z "${BUILD_FROM_NOVA+x}" ]; then
193-
# If BUILD_FROM_NOVA is unset, then we are building from AMD host with
194-
# sufficient resources, so we can build for more architectures.
195-
local arch_list="gfx908,gfx90a,gfx942"
196-
else
197-
# If BUILD_FROM_NOVA is set (regardless of 0 or 1), we are building in
198-
# Nova. Nova machines take a longer time to build FBGEMM_GPU for
199-
# ROCm, so we limit to one architecture.
200-
local arch_list="gfx942"
201-
fi
202-
fi
203-
else
204-
echo "[BUILD] rocminfo not found in PATH!"
205-
fi
192+
# If BUILD_FROM_NOVA is unset, then we are building from a compute host with
193+
# sufficient resources, so we can build for more AMD Instinct architectures.
194+
local arch_list="gfx908,gfx90a,gfx942"
206195
fi
207196

208197
echo "[BUILD] Setting the following ROCm targets: ${arch_list}"
@@ -286,8 +275,8 @@ __configure_fbgemm_gpu_build_cuda () {
286275
echo "[BUILD] Unknown NVCC version $cuda_version_nvcc - setting TORCH_CUDA_ARCH_LIST to: ${arch_list}"
287276
fi
288277
fi
289-
echo "[BUILD] Setting the following CUDA targets: ${arch_list}"
290278

279+
echo "[BUILD] Setting the following CUDA targets: ${arch_list}"
291280
# Unset the environment-supplied TORCH_CUDA_ARCH_LIST because it will take
292281
# precedence over cmake -DTORCH_CUDA_ARCH_LIST
293282
unset TORCH_CUDA_ARCH_LIST

.github/scripts/utils_rocm.bash

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,11 @@ install_rocm_ubuntu () {
9999
echo "[INSTALL] Cleaning up ..."
100100
print_exec rm -f "${package_name}"
101101

102-
echo "[INFO] Check ROCM GPU info ..."
102+
echo "[INFO] Printing ROCM utilities info ..."
103103
# If rocm-smi is installed on a machine without GPUs, this will return an error
104104
(print_exec rocminfo) || true
105-
print_exec rocm-smi
105+
(print_exec rocm-smi) || true
106+
(print_exec hipcc -v) || true
106107

107108
echo "[INSTALL] Successfully installed ROCm ${rocm_version}"
108109
}

fbgemm_gpu/FbgemmGpu.cmake

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,5 @@ gpu_cpp_library(
183183
fbgemm_gpu_tbe_cache
184184
fbgemm_gpu_tbe_optimizers
185185
fbgemm_gpu_tbe_utils
186-
fbgemm_gpu_config
187186
DESTINATION
188187
fbgemm_gpu)

fbgemm_gpu/codegen/inference/embedding_forward_quantized_split_nbit_host_template.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -130,13 +130,12 @@ __global__ void {{ type_map[emb_weight_type].enum_name }}_split_embedding{{ "_no
130130

131131
// Construct output tensor
132132
Tensor output;
133-
const int kINT8QparamsBytes = 8;
134133

135134
SparseType o_dtype = static_cast<SparseType>(output_dtype);
136135
TORCH_CHECK(o_dtype == SparseType::FP32 || o_dtype == SparseType::FP16 || o_dtype == SparseType::BF16 || o_dtype == SparseType::INT8);
137136

138137
{%- if not nobag %}
139-
138+
const int kINT8QparamsBytes = 8;
140139
int64_t total_adjusted_D = total_D;
141140
if (o_dtype == SparseType::INT8) {
142141
total_adjusted_D += T * kINT8QparamsBytes;
@@ -149,10 +148,11 @@ __global__ void {{ type_map[emb_weight_type].enum_name }}_split_embedding{{ "_no
149148
}
150149

151150
{%- else %}
152-
151+
// TODO: Change to use half to match CPU/Meta implementation
152+
const int kINT8QparamsBytes = 8; // using float for scale and bias
153153
int64_t adjusted_D = D;
154154
if (o_dtype == SparseType::INT8) {
155-
adjusted_D += T * kINT8QparamsBytes;
155+
adjusted_D += kINT8QparamsBytes;
156156
}
157157

158158
if (total_L == 0) {

0 commit comments

Comments
 (0)