Commit 576f505
patch fixes for eviction (#4304)
Summary:
Pull Request resolved: #4304
X-link: facebookresearch/FBGEMM#1380
Differential Revision: D76244371
1 parent abb5272 commit 576f505

12 files changed: +980 −341 lines

fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_common.py

Lines changed: 4 additions & 4 deletions
@@ -60,16 +60,16 @@ class EvictionPolicy(NamedTuple):
         0  # trigger_step_interval if trigger mode is iteration
     )
     counter_thresholds: Optional[List[int]] = (
-        None  # count_thresholds for each feature if eviction strategy is feature score
+        None  # count_thresholds for each table if eviction strategy is feature score
     )
     ttls_in_mins: Optional[List[int]] = (
-        None  # ttls_in_mins for each feature if eviction strategy is timestamp
+        None  # ttls_in_mins for each table if eviction strategy is timestamp
     )
     counter_decay_rates: Optional[List[float]] = (
-        None  # count_decay_rates for each feature if eviction strategy is feature score
+        None  # count_decay_rates for each table if eviction strategy is feature score
     )
     l2_weight_thresholds: Optional[List[float]] = (
-        None  # l2_weight_thresholds for each feature if eviction strategy is feature l2 norm
+        None  # l2_weight_thresholds for each table if eviction strategy is feature l2 norm
     )
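
These lists are now interpreted per table rather than per feature. As a minimal sketch of how a caller might populate them, assuming two embedding tables, the field names shown above, and this module's import path (defaults and any fields not visible in this diff are assumptions, not taken from the commit):

# Hypothetical illustration: two tables -> one entry per table in each list.
from fbgemm_gpu.split_table_batched_embeddings_ops_common import EvictionPolicy

eviction_policy = EvictionPolicy(
    eviction_trigger_mode=1,           # 1: trigger eviction by iteration count
    eviction_strategy=1,               # 1: counter (feature score) based eviction
    eviction_step_intervals=1000,      # run eviction every 1000 iterations
    counter_thresholds=[3, 5],         # per-table count threshold
    counter_decay_rates=[0.95, 0.99],  # per-table counter decay rate
)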

fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py

Lines changed: 41 additions & 62 deletions
@@ -248,6 +248,12 @@ def __init__(
             self.total_hash_size_bits: int = 0
         else:
             self.total_hash_size_bits: int = int(log2(float(hash_size_cumsum[-1])) + 1)
+            self.register_buffer(
+                "table_hash_size_cumsum",
+                torch.tensor(
+                    hash_size_cumsum, device=self.current_device, dtype=torch.int64
+                ),
+            )
         # The last element is to easily access # of rows of each table by
         self.total_hash_size_bits = int(log2(float(hash_size_cumsum[-1])) + 1)
         self.total_hash_size: int = hash_size_cumsum[-1]
@@ -288,6 +294,10 @@ def __init__(
             "feature_dims",
             torch.tensor(feature_dims, device="cpu", dtype=torch.int64),
         )
+        self.register_buffer(
+            "table_dims",
+            torch.tensor(dims, device="cpu", dtype=torch.int64),
+        )

         (info_B_num_bits_, info_B_mask_) = torch.ops.fbgemm.get_infos_metadata(
             self.D_offsets,  # unused tensor
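
The new table_dims and table_hash_size_cumsum buffers are indexed per table, while the existing feature_dims buffer stays per feature (one table can back several features through the feature-to-table map). A self-contained sketch of that relationship, using made-up values rather than anything from this commit:

import torch

# Hypothetical setup: 2 tables, 3 features; features 0 and 1 share table 0.
dims = [64, 128]               # per-table embedding dims
feature_table_map = [0, 0, 1]  # feature index -> table index

# Per-table buffer, analogous to the new "table_dims" buffer.
table_dims = torch.tensor(dims, dtype=torch.int64)

# Per-feature view, analogous to "feature_dims": expand through the map.
feature_dims = torch.tensor([dims[t] for t in feature_table_map], dtype=torch.int64)

print(table_dims.tolist())    # [64, 128]
print(feature_dims.tolist())  # [64, 64, 128]
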
@@ -518,6 +528,7 @@ def __init__(
             logging.warning("dist is not initialized, treating as single gpu cases")
             tbe_unique_id = SSDTableBatchedEmbeddingBags._local_instance_index
         self.tbe_unique_id = tbe_unique_id
+        self.l2_cache_size = l2_cache_size
         logging.info(f"tbe_unique_id: {tbe_unique_id}")
         if self.backend_type == BackendType.SSD:
             logging.info(
@@ -564,12 +575,12 @@ def __init__(
                 self.res_params.table_offsets,
                 self.res_params.table_sizes,
                 (
-                    tensor_pad4(self.feature_dims.cpu())
+                    tensor_pad4(self.table_dims)
                     if self.enable_optimizer_offloading
                     else None
                 ),
                 (
-                    self.hash_size_cumsum.cpu()
+                    self.table_hash_size_cumsum.cpu()
                     if self.enable_optimizer_offloading
                     else None
                 ),
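
tensor_pad4 is applied only when optimizer offloading is enabled; judging by its name it rounds each per-table dim up to the next multiple of 4. A minimal stand-in under that assumption (an illustration, not FBGEMM's implementation):

import torch

def pad4(dims: torch.Tensor) -> torch.Tensor:
    # Round every element up to the next multiple of 4, assuming that is
    # what the real tensor_pad4 helper does for offloaded row layouts.
    return (dims + 3) // 4 * 4

print(pad4(torch.tensor([60, 128, 130], dtype=torch.int64)).tolist())  # [60, 128, 132]
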
@@ -607,74 +618,35 @@ def __init__(
                 f"self.cache_row_dim={self.cache_row_dim},"
                 f"enable_optimizer_offloading={self.enable_optimizer_offloading},"
                 f"feature_dims={self.feature_dims},"
-                f"hash_size_cumsum={self.hash_size_cumsum}, "
-                f"eviction_policy={self.kv_zch_params.eviction_policy}, "
-            )
-            # prepare eviction policy parameters
-            counter_eviction_threshold_tensor = None
-            ttls_in_mins_tensor = None
-            counter_decay_rates_tensor = None
-            l2_weight_thresholds_tensor = None
-            if self.kv_zch_params.eviction_policy.eviction_trigger_mode != 0:
-                counter_eviction_threshold = [
-                    self.kv_zch_params.eviction_policy.counter_thresholds[t]
-                    for t in self.feature_table_map
-                ]
-                counter_eviction_threshold_tensor = torch.tensor(
-                    counter_eviction_threshold,
-                    device=torch.device("cpu"),
-                    dtype=torch.uint32,
-                )
-                ttls_in_mins = [
-                    self.kv_zch_params.eviction_policy.ttls_in_mins[t]
-                    for t in self.feature_table_map
-                ]
-                ttls_in_mins_tensor = torch.tensor(
-                    ttls_in_mins,
-                    device=torch.device("cpu"),
-                    dtype=torch.uint32,
-                )
-                counter_decay_rates = [
-                    self.kv_zch_params.eviction_policy.counter_decay_rates[t]
-                    for t in self.feature_table_map
-                ]
-                counter_decay_rates_tensor = torch.tensor(
-                    counter_decay_rates,
-                    device=torch.device("cpu"),
-                    dtype=torch.float32,
-                )
-                l2_weight_thresholds = [
-                    self.kv_zch_params.eviction_policy.l2_weight_thresholds[t]
-                    for t in self.feature_table_map
-                ]
-                l2_weight_thresholds_tensor = torch.tensor(
-                    l2_weight_thresholds,
-                    device=torch.device("cpu"),
-                    dtype=torch.float32,
-                )
-
+                f"hash_size_cumsum={self.hash_size_cumsum}"
+            )
+            table_dims = (
+                tensor_pad4(self.table_dims)
+                if self.enable_optimizer_offloading
+                else None
+            )  # table_dims
+            eviction_config = torch.classes.fbgemm.FeatureEvictConfig(
+                self.kv_zch_params.eviction_policy.eviction_trigger_mode,  # eviction trigger mode, 0: disabled, 1: iteration, 2: mem_util, 3: manual
+                self.kv_zch_params.eviction_policy.eviction_strategy,  # evict_trigger_strategy: 0: timestamp, 1: counter (feature score), 2: counter (feature score) + timestamp, 3: feature l2 norm
+                self.kv_zch_params.eviction_policy.eviction_step_intervals,  # trigger_step_interval if trigger mode is iteration
+                self.l2_cache_size,  # mem_util_threshold_in_GB if trigger mode is mem_util
+                self.kv_zch_params.eviction_policy.ttls_in_mins,  # ttls_in_mins for each table if eviction strategy is timestamp
+                self.kv_zch_params.eviction_policy.counter_thresholds,  # counter_thresholds for each table if eviction strategy is feature score
+                self.kv_zch_params.eviction_policy.counter_decay_rates,  # counter_decay_rates for each table if eviction strategy is feature score
+                self.kv_zch_params.eviction_policy.l2_weight_thresholds,  # l2_weight_thresholds for each table if eviction strategy is feature l2 norm
+                table_dims.tolist() if table_dims else None,
+            )
             self._ssd_db = torch.classes.fbgemm.DramKVEmbeddingCacheWrapper(
                 self.cache_row_dim,
                 ssd_uniform_init_lower,
                 ssd_uniform_init_upper,
-                self.kv_zch_params.eviction_policy.eviction_trigger_mode,  # eviction is disabled, 0: disabled, 1: iteration, 2: mem_util, 3: manual
-                self.kv_zch_params.eviction_policy.eviction_step_intervals,  # trigger_step_interval if trigger mode is iteration
-                l2_cache_size,  # mem_util_threshold_in_GB if trigger mode is mem_util
-                self.kv_zch_params.eviction_policy.eviction_strategy,  # evict_trigger_strategy: 0: timestamp, 1: counter (feature score), 2: counter (feature score) + timestamp, 3: feature l2 norm
-                counter_eviction_threshold_tensor,  # counter_thresholds for each table if eviction strategy is feature score
-                ttls_in_mins_tensor,  # ttls_in_mins for each table if eviction strategy is timestamp
-                counter_decay_rates_tensor,  # counter_decay_rates for each table if eviction strategy is feature score
-                l2_weight_thresholds_tensor,  # l2_weight_thresholds for each table if eviction strategy is feature l2 norm
+                eviction_config,
                 ssd_rocksdb_shards,  # num_shards
                 ssd_rocksdb_shards,  # num_threads
                 weights_precision.bit_rate(),  # row_storage_bitwidth
+                table_dims,
                 (
-                    tensor_pad4(self.feature_dims.cpu())
-                    if self.enable_optimizer_offloading
-                    else None
-                ),  # table_dims
-                (
-                    self.hash_size_cumsum.cpu()
+                    self.table_hash_size_cumsum.cpu()
                     if self.enable_optimizer_offloading
                     else None
                 ),  # hash_size_cumsum
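
With the eviction parameters consolidated into one config object and read per table, the caller no longer expands them per feature through feature_table_map. A hypothetical validation sketch (the helper and its names are illustrative, not part of this commit) that checks each per-table list against the number of tables before building such a config:

from typing import Optional

def check_per_table_lengths(num_tables: int, **per_table_lists: Optional[list]) -> None:
    # Each eviction list (counter_thresholds, ttls_in_mins, ...) should now hold
    # one entry per table; reject per-feature sized lists early.
    for name, values in per_table_lists.items():
        if values is not None and len(values) != num_tables:
            raise ValueError(
                f"{name} has {len(values)} entries, expected {num_tables} (one per table)"
            )

# Example: 2 tables, 3 features.
check_per_table_lengths(2, counter_thresholds=[3, 5], counter_decay_rates=[0.95, 0.99])
try:
    check_per_table_lengths(2, ttls_in_mins=[60, 120, 240])  # per-feature sized list
except ValueError as err:
    print(err)
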
@@ -2478,6 +2450,13 @@ def _may_create_snapshot_for_state_dict(
                 f"created snapshot for weight states: {snapshot_handle}, latency: {(time.time() - start_time) * 1000} ms"
             )
         elif self.backend_type == BackendType.DRAM:
+            # If there is any ongoing eviction, wait until it finishes before state_dict
+            # so that we reach a consistent model state before/after state_dict.
+            evict_wait_start_time = time.time()
+            self.ssd_db.wait_until_eviction_done()
+            logging.info(
+                f"state_dict wait for ongoing eviction: {time.time() - evict_wait_start_time} s"
+            )
         self.flush(force=should_flush)
         return snapshot_handle, checkpoint_handle
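
The intent of this DRAM branch is an ordering guarantee: drain any in-flight eviction, then flush, then snapshot, so the checkpoint reflects a stable table state. A stripped-down sketch of that ordering (wait_until_eviction_done is the call shown above; the backend object and its other method names are stand-ins, not FBGEMM's API):

import logging
import time

def checkpoint_with_eviction_barrier(backend) -> None:
    # 1) Drain any eviction that is still running so keys and weights stop changing.
    wait_start = time.time()
    backend.wait_until_eviction_done()
    logging.info("waited %.3f s for ongoing eviction", time.time() - wait_start)

    # 2) Flush cached rows to the backing store.
    backend.flush()

    # 3) Only then take the snapshot used for state_dict.
    backend.create_snapshot()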

fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_base.h

Lines changed: 0 additions & 51 deletions
This file was deleted.
