
Commit f6be1f7

Merge commit with 2 parents: 9027270 + 6ef6bf5

File tree: 10 files changed (+85, -43 lines)


src/petals/client/inference_session.py

Lines changed: 1 addition & 1 deletion
@@ -343,7 +343,7 @@ def _update_sequence(self, server_idx: int, block_idx: int, attempt_no: int) ->
         n_prev_spans = len(self._server_sessions)
         update_end = self._server_sessions[server_idx].span.end if server_idx < n_prev_spans else self.num_blocks
         if attempt_no >= 1:
-            logger.info(
+            logger.debug(
                 f"Due to a server failure, remote attention caches "
                 f"from block {block_idx} to {update_end} will be regenerated"
             )

src/petals/client/remote_generation.py

Lines changed: 3 additions & 3 deletions
@@ -69,6 +69,8 @@ def generate(
         self, inputs: Optional[torch.Tensor] = None, *args, session: Optional[InferenceSession] = None, **kwargs
     ):
         self._fix_generate_kwargs(kwargs)
+        if inputs is None:
+            inputs = kwargs.pop("input_ids", None)
 
         if session is not None:
             # If a session specified explicitly, use it
@@ -125,7 +127,7 @@ def generate(
         return result
 
     @staticmethod
-    def _fix_generate_kwargs(kwargs: dict) -> dict:
+    def _fix_generate_kwargs(kwargs: dict):
         # Suppress inappropriate "Both max_new_tokens and max_length" HF warning
         if "max_length" in kwargs and kwargs["max_length"] is None:
             del kwargs["max_length"]
@@ -135,8 +137,6 @@ def _fix_generate_kwargs(kwargs: dict) -> dict:
         if isinstance(do_sample, int):
             kwargs["do_sample"] = bool(do_sample)
 
-        return kwargs
-
     @staticmethod
     def _reorder_cache(past_key_values: RemotePastKeyValues, beam_idx: torch.LongTensor) -> RemotePastKeyValues:
         return dataclasses.replace(past_key_values, hypo_ids=beam_idx)
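Note: with this change, `generate()` also accepts the prompt via the standard HF `input_ids` keyword instead of only the first positional argument. A minimal sketch of the new fallback in isolation (a hypothetical standalone helper, not part of the Petals API):

from typing import Optional

import torch


def resolve_inputs(inputs: Optional[torch.Tensor] = None, **kwargs) -> Optional[torch.Tensor]:
    # Mirrors the added lines: fall back to the HF-style `input_ids` kwarg
    if inputs is None:
        inputs = kwargs.pop("input_ids", None)
    return inputs


prompt = torch.tensor([[1, 2, 3]])
assert torch.equal(resolve_inputs(prompt), prompt)            # positional, as before
assert torch.equal(resolve_inputs(input_ids=prompt), prompt)  # keyword, newly supported
assert resolve_inputs() is None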

src/petals/data_structures.py

Lines changed: 13 additions & 0 deletions
@@ -20,6 +20,19 @@ class ServerState(Enum):
 RPS = pydantic.confloat(ge=0, allow_inf_nan=False, strict=True)
 
 
+@pydantic.dataclasses.dataclass
+class ModelInfo:
+    num_blocks: int
+    repository: Optional[str] = None
+
+    def to_dict(self) -> dict:
+        return dataclasses.asdict(self)
+
+    @classmethod
+    def from_dict(cls, source: dict):
+        return cls(**source)
+
+
 @pydantic.dataclasses.dataclass
 class ServerInfo:
     state: ServerState
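The new `ModelInfo` record is what servers announce to the DHT (see the `server.py` changes below). It round-trips through a plain dict so it can be stored as a DHT value. A usage sketch; the concrete values are illustrative, not taken from this commit:

from petals.data_structures import ModelInfo

info = ModelInfo(num_blocks=70, repository="https://huggingface.co/bigscience/bloom")
payload = info.to_dict()                 # {'num_blocks': 70, 'repository': 'https://huggingface.co/bigscience/bloom'}
restored = ModelInfo.from_dict(payload)  # re-validated by the pydantic dataclass
assert restored == info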

src/petals/models/bloom/config.py

Lines changed: 1 addition & 0 deletions
@@ -30,5 +30,6 @@ def from_pretrained(
         if loading_from_repo and dht_prefix is None:
             # We need "-petals" for backward compatibility with Petals < 1.2.0
             dht_prefix = str(model_name_or_path) + "-petals"
+            dht_prefix = dht_prefix.replace(".", "-")
         logger.info(f"Using DHT prefix: {dht_prefix}")
         return super().from_pretrained(model_name_or_path, *args, dht_prefix=dht_prefix, **kwargs)

src/petals/models/llama/config.py

Lines changed: 1 addition & 0 deletions
@@ -35,6 +35,7 @@ def from_pretrained(
         if loading_from_repo and dht_prefix is None:
             dht_prefix = str(model_name_or_path)
             dht_prefix = dht_prefix.split("/")[-1]  # Use only repo name to merge blocks hosted by different accounts
+            dht_prefix = dht_prefix.replace(".", "-")
             if not dht_prefix.endswith("-hf"):
                 dht_prefix += "-hf"
         logger.info(f"Using DHT prefix: {dht_prefix}")

src/petals/server/memory_cache.py

Lines changed: 7 additions & 4 deletions
@@ -31,7 +31,7 @@ def __init__(self, max_size_bytes: Optional[int], max_alloc_timeout: Optional[fl
         self.max_alloc_timeout = max_alloc_timeout
         self._lock_metadata = mp.Lock()
         self._current_size = mp.Value(ctypes.c_int64, 0, lock=False)
-        self._enqueued_size = mp.Value(ctypes.c_int64, 0, lock=False)
+        self._enqueued_size = mp.Value(ctypes.c_int64, 0, lock=True)
         self._handle_counter = mp.Value(ctypes.c_int64, 0, lock=False)
         self._allocated_tensors: Dict[Handle, torch.Tensor] = {}
         self.runtime_pid = os.getpid()
@@ -138,7 +138,8 @@ async def _wait_for_free_memory(self, alloc_size: int, timeout: Optional[float])
         start_time = time.perf_counter()
         loop = asyncio.get_event_loop()
 
-        self.enqueued_size_bytes += alloc_size
+        with self._enqueued_size.get_lock():
+            self._enqueued_size.value += alloc_size
         allocated = False
         try:
             context_manager = async_timeout.timeout(timeout) if timeout != 0 else contextlib.AsyncExitStack()
@@ -155,13 +156,15 @@ async def _wait_for_free_memory(self, alloc_size: int, timeout: Optional[float])
                 await loop.run_in_executor(None, self._wait_until_available, alloc_size, remaining_timeout)
 
             allocated = True
-            self.enqueued_size_bytes -= alloc_size
+            with self._enqueued_size.get_lock():
+                self._enqueued_size.value -= alloc_size
             yield
         except asyncio.TimeoutError:
             raise AllocationFailed(f"Could not allocate {alloc_size} within {timeout} seconds")
         finally:
             if not allocated:
-                self.enqueued_size_bytes -= alloc_size
+                with self._enqueued_size.get_lock():
+                    self._enqueued_size.value -= alloc_size
 
     def _free(self, alloc_size: int, alloc_task: asyncio.Task):
         if alloc_task.exception() is not None:
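`_enqueued_size` is now created with `lock=True`, and every read-modify-write goes through `get_lock()`, so concurrent updates from different processes (or executor threads) cannot lose increments. The same pattern in isolation, as a minimal self-contained sketch:

import ctypes
import multiprocessing as mp


def add_chunks(counter, n, chunk):
    for _ in range(n):
        with counter.get_lock():   # += on a shared Value is not atomic without this
            counter.value += chunk


if __name__ == "__main__":
    enqueued_size = mp.Value(ctypes.c_int64, 0, lock=True)
    workers = [mp.Process(target=add_chunks, args=(enqueued_size, 1000, 3)) for _ in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    assert enqueued_size.value == 4 * 1000 * 3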

src/petals/server/server.py

Lines changed: 24 additions & 5 deletions
@@ -3,6 +3,7 @@
 import gc
 import math
 import multiprocessing as mp
+import os
 import random
 import threading
 import time
@@ -21,7 +22,7 @@
 
 import petals
 from petals.constants import DTYPE_MAP, PUBLIC_INITIAL_PEERS
-from petals.data_structures import CHAIN_DELIMITER, UID_DELIMITER, ServerInfo, ServerState
+from petals.data_structures import CHAIN_DELIMITER, UID_DELIMITER, ModelInfo, ServerInfo, ServerState
 from petals.server import block_selection
 from petals.server.backend import TransformerBackend, merge_inference_pools_inplace
 from petals.server.block_utils import get_block_size, resolve_block_dtype
@@ -259,11 +260,15 @@ def __init__(
             using_relay=reachable_via_relay,
             **throughput_info,
         )
+        self.model_info = ModelInfo(num_blocks=self.block_config.num_hidden_layers)
+        if not os.path.isdir(converted_model_name_or_path):
+            self.model_info.repository = "https://huggingface.co/" + converted_model_name_or_path
 
         self.balance_quality = balance_quality
         self.mean_balance_check_period = mean_balance_check_period
         self.mean_block_selection_delay = mean_block_selection_delay
 
+        self.module_container = None
         self.stop = threading.Event()
 
     def _choose_num_blocks(self) -> int:
@@ -329,6 +334,7 @@ def run(self):
             block_config=self.block_config,
             attn_cache_bytes=self.attn_cache_bytes,
             server_info=self.server_info,
+            model_info=self.model_info,
             block_indices=block_indices,
             num_handlers=self.num_handlers,
             min_batch_size=self.min_batch_size,
@@ -377,7 +383,7 @@ def run(self):
             self._clean_memory_and_fds()
 
     def _clean_memory_and_fds(self):
-        del self.module_container
+        self.module_container = None
         gc.collect()  # In particular, this closes unused file descriptors
 
         if self.device.type == "cuda":
@@ -410,8 +416,10 @@ def _should_choose_other_blocks(self) -> bool:
         module_infos = get_remote_module_infos(self.dht, self.module_uids, latest=True)
         return block_selection.should_choose_other_blocks(self.dht.peer_id, module_infos, self.balance_quality)
 
-    def shutdown(self):
+    def shutdown(self, timeout: Optional[float] = 5):
         self.stop.set()
+        if self.module_container is not None and self.module_container.is_alive():
+            self.module_container.join(timeout)
 
         if self.reachability_protocol is not None:
             self.reachability_protocol.shutdown()
@@ -433,6 +441,7 @@ def create(
         block_config: PretrainedConfig,
         attn_cache_bytes: int,
         server_info: ServerInfo,
+        model_info: ModelInfo,
         block_indices: List[int],
         min_batch_size: int,
         max_batch_size: int,
@@ -460,6 +469,7 @@ def create(
             module_uids,
             dht,
             server_info,
+            model_info,
             block_config=block_config,
             memory_cache=memory_cache,
             update_period=update_period,
@@ -668,6 +678,7 @@ def __init__(
         module_uids: List[str],
         dht: DHT,
         server_info: ServerInfo,
+        model_info: ModelInfo,
         *,
         block_config: PretrainedConfig,
         memory_cache: MemoryCache,
@@ -680,6 +691,7 @@ def __init__(
         self.module_uids = module_uids
         self.dht = dht
         self.server_info = server_info
+        self.model_info = model_info
         self.memory_cache = memory_cache
 
         self.bytes_per_token = block_config.hidden_size * get_size_in_bytes(DTYPE_MAP[server_info.torch_dtype])
@@ -690,10 +702,10 @@ def __init__(
         self.trigger = threading.Event()
 
         self.max_pinged = max_pinged
-        dht_prefix = module_uids[0].split(UID_DELIMITER)[0]
+        self.dht_prefix = module_uids[0].split(UID_DELIMITER)[0]
         block_indices = [int(uid.split(UID_DELIMITER)[-1]) for uid in module_uids]
         start_block, end_block = min(block_indices), max(block_indices) + 1
-        self.next_uids = [f"{dht_prefix}{UID_DELIMITER}{i}" for i in range(start_block + 1, end_block + 1)]
+        self.next_uids = [f"{self.dht_prefix}{UID_DELIMITER}{i}" for i in range(start_block + 1, end_block + 1)]
         self.ping_aggregator = PingAggregator(self.dht)
 
     def run(self) -> None:
@@ -717,6 +729,13 @@ def run(self) -> None:
             )
             if self.server_info.state == ServerState.OFFLINE:
                 break
+            if not self.dht_prefix.startswith("_"):  # Not private
+                self.dht.store(
+                    key="_petals.models",
+                    subkey=self.dht_prefix,
+                    value=self.model_info.to_dict(),
+                    expiration_time=get_dht_time() + self.expiration,
+                )
 
             delay = self.update_period - (time.perf_counter() - start_time)
             if delay < 0:
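Servers with a public (non-underscore) DHT prefix now periodically publish their `ModelInfo` under the dictionary key `_petals.models`, with the prefix as the subkey. A client-side sketch of reading that record back; the return structure of `dht.get()` for dictionary-valued keys is an assumption here, not something shown in this diff:

from hivemind import DHT

from petals.constants import PUBLIC_INITIAL_PEERS
from petals.data_structures import ModelInfo

dht = DHT(initial_peers=PUBLIC_INITIAL_PEERS, client_mode=True, start=True)

entry = dht.get("_petals.models", latest=True)
if entry is not None:
    # each server stores its model under its own dht_prefix subkey
    for dht_prefix, record in entry.value.items():
        info = ModelInfo.from_dict(record.value)
        print(dht_prefix, info.num_blocks, info.repository)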

src/petals/server/task_pool.py

Lines changed: 14 additions & 29 deletions
@@ -9,7 +9,6 @@
 
 import torch
 from hivemind import get_logger
-from hivemind.moe.server.task_pool import TaskPoolBase
 from hivemind.utils.mpfuture import ALL_STATES, MPFuture
 
 logger = get_logger(__name__)
@@ -27,7 +26,7 @@ def uid(self) -> int:
         return self.future._uid
 
 
-class PrioritizedTaskPool(TaskPoolBase):
+class PrioritizedTaskPool(threading.Thread):
     """
     Aggregates requests from multiple ConnectionHandler instances, orders them for processing in Runtime, then
     returns results (or exception) to the corresponding ConnectionHandler. Runs a background process.
@@ -57,52 +56,41 @@ def __init__(
         daemon=True,
         start=False,
     ):
-        super().__init__(process_func, daemon=daemon, name=name)
+        super().__init__(daemon=daemon, name=name)
+        self.process_func = process_func
+        # the lower the priority is, the more urgent it is to process this pool
+        self._priority = mp.Value(ctypes.c_double, 1.0)
+
         self.min_batch_size, self.max_batch_size = min_batch_size, max_batch_size
         self.device = device
 
         self.submitted_tasks = mp.SimpleQueue()  # interaction with ConnectionHandlers
         self._ordered_tasks = PriorityQueue()  # interaction with Runtime - only valid inside Runtime
 
-        self._prioritizer_thread = threading.Thread(
-            name=self.name + "_prioritizer",
-            target=self._prioritize_tasks,
-            args=[self.submitted_tasks, self._ordered_tasks],
-            daemon=True,
-        )
         self._dispatched_tasks = {}
         self.batch_receiver, self.batch_sender = mp.Pipe(duplex=False)
         self._oldest_undispatched_timestamp = mp.Value(ctypes.c_double, 1.0)
         self.priority = float("inf"), float("inf")  # (first task priority, first task timestamp)
 
-        self._stop = mp.Event()
         if start:
             self.start()
 
-    @staticmethod
-    def _prioritize_tasks(submitted_tasks: mp.SimpleQueue, ordered_tasks: PriorityQueue):
+    def run(self):
         """Read tasks from incoming queue and put them into a local priority queue"""
         while True:
-            task = submitted_tasks.get()
+            task = self.submitted_tasks.get()
             if task is None:
                 logger.debug("Shutting down prioritizer thread")
                 break
 
-            ordered_tasks.put(task, block=True)
-
-    def start(self):
-        assert not self.is_alive() and not self._prioritizer_thread.is_alive()
-        self._prioritizer_thread.start()
-        super().start()
+            self._ordered_tasks.put(task, block=True)
 
-    def shutdown(self, timeout: float = 3):
-        self.submitted_tasks.put(None)  # Shuts down self._prioritizer_thread
-        self._stop.set()
+    def terminate(self):
+        """An alias for hivemind.Runtime that assumes that each TaskPool is a process"""
+        self.shutdown()
 
-        self.join(timeout)
-        if self.is_alive():
-            logger.warning(f"{self.__class__.__name__} failed to shut down gracefully, sending SIGTERM")
-            self.terminate()
+    def shutdown(self):
+        self.submitted_tasks.put(None)  # Shuts down self.run()
 
     def submit_task(self, *args: Any, priority: float = 0.0) -> MPFuture:
         """Add task to this pool's queue, return Future for its output"""
@@ -163,9 +151,6 @@ def send_exception_from_runtime(self, uid: int, exception: BaseException):
         else:
             task.future.set_exception(exception)
 
-    def run(self, *args, **kwargs):
-        self._stop.wait()
-
     @property
     def empty(self):
         return not self.batch_receiver.poll()
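`PrioritizedTaskPool` is now a plain `threading.Thread`: `run()` drains `submitted_tasks` into the local priority queue and stops at the `None` sentinel that `shutdown()` enqueues, while `terminate()` remains only as an alias for `hivemind.Runtime`. A stripped-down sketch of that drain-until-sentinel pattern (not the Petals class itself):

import multiprocessing as mp
import threading
from queue import PriorityQueue


class DrainThread(threading.Thread):
    """Move items from a cross-process queue into a local priority queue until a None sentinel arrives."""

    def __init__(self):
        super().__init__(daemon=True)
        self.submitted = mp.SimpleQueue()  # filled by other processes / handlers
        self.ordered = PriorityQueue()     # consumed locally, lowest priority value first

    def run(self):
        while True:
            item = self.submitted.get()
            if item is None:               # sentinel: exit gracefully
                break
            self.ordered.put(item, block=True)

    def shutdown(self):
        self.submitted.put(None)


pool = DrainThread()
pool.start()
pool.submitted.put((0.5, "urgent"))
pool.submitted.put((2.0, "later"))
pool.shutdown()
pool.join()
assert pool.ordered.get() == (0.5, "urgent")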

tests/test_full_model.py

Lines changed: 20 additions & 0 deletions
@@ -149,3 +149,23 @@ def test_beam_search_generation(tokenizer, model, ref_model, max_new_tokens=4, n
     outputs = make_generate_calls(model, inputs, **options)
     ref_outputs = ref_model.generate(inputs, **options)
     assert torch.allclose(outputs, ref_outputs), f"Beam search results are not identical to HF"
+
+
+@pytest.mark.forked
+def test_input_ids(tokenizer, model, ref_model, max_new_tokens=4):
+    inputs = tokenizer("A cat sat on a mat", return_tensors="pt")
+    assert inputs.keys() == {"input_ids", "attention_mask"}
+
+    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
+    ref_outputs = ref_model.generate(**inputs, max_new_tokens=max_new_tokens)
+    assert torch.allclose(outputs, ref_outputs), f"Outputs are not identical to HF"
+
+    with model.inference_session(max_length=inputs["input_ids"].shape[1] + max_new_tokens):
+        outputs = torch.cat(
+            [
+                model.generate(**inputs, max_new_tokens=2),
+                model.generate(None, max_new_tokens=max_new_tokens - 2),
+            ],
+            dim=1,
+        )
+        assert torch.allclose(outputs, ref_outputs), f"Multi-call outputs are not identical to HF"

tests/test_remote_sequential.py

Lines changed: 1 addition & 1 deletion
@@ -126,6 +126,6 @@ def test_remote_sequential_prompts(batch_size=2, seq_len=5, pre_seq_len=3):
 
     (outputs_ref * output_proj).sum().backward()
     assert input_prompts_ref.grad is not None
-    assert torch.allclose(input_prompts_ref.grad, input_prompts.grad, atol=1e-2)
+    assert torch.allclose(input_prompts_ref.grad, input_prompts.grad, atol=3e-2)
     assert intermediate_prompts_ref.grad is not None
    assert torch.allclose(intermediate_prompts_ref.grad, intermediate_prompts.grad, atol=1e-2)
