diff --git a/llm/inference/llama3/run_llama3.py b/llm/inference/llama3/run_llama3.py
index bc0d9b49f..d8144260e 100644
--- a/llm/inference/llama3/run_llama3.py
+++ b/llm/inference/llama3/run_llama3.py
@@ -1,7 +1,5 @@
 import mindspore
 from mindnlp.transformers import AutoTokenizer, AutoModelForCausalLM
-from mindspore._c_expression import _framework_profiler_step_start
-from mindspore._c_expression import _framework_profiler_step_end
 
 model_id = "LLM-Research/Meta-Llama-3-8B-Instruct"
 
@@ -28,7 +26,6 @@
     tokenizer.convert_tokens_to_ids("<|eot_id|>")
 ]
 
-# _framework_profiler_step_start()
 outputs = model.generate(
     input_ids,
     max_new_tokens=20,
@@ -38,6 +35,5 @@
     # temperature=0.6,
     # top_p=0.9,
 )
-# _framework_profiler_step_end()
 response = outputs[0][input_ids.shape[-1]:]
 print(tokenizer.decode(response, skip_special_tokens=True))
diff --git a/mindnlp/transformers/generation/utils.py b/mindnlp/transformers/generation/utils.py
index 1b94d05f7..4cf4cba3e 100644
--- a/mindnlp/transformers/generation/utils.py
+++ b/mindnlp/transformers/generation/utils.py
@@ -13,18 +13,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# pylint: disable=not-callable
+# pylint: disable=not-callable, no-name-in-module
 """generation mixin"""
 import copy
 import inspect
 import warnings
+import time
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import mindspore
+from mindspore._c_expression import _framework_profiler_step_start
+from mindspore._c_expression import _framework_profiler_step_end
+
 from mindnlp.core import nn, ops, no_grad
 from mindnlp.core.nn import functional as F
+from ...utils.testing_utils import parse_flag_from_env
 
 from ..cache_utils import (
     Cache,
@@ -156,6 +161,7 @@ class GenerateDecoderOnlyOutput(ModelOutput):
     attentions: Optional[Tuple[Tuple[mindspore.Tensor]]] = None
     hidden_states: Optional[Tuple[Tuple[mindspore.Tensor]]] = None
     past_key_values: Optional[Tuple[Tuple[Tuple[mindspore.Tensor]]]] = None
+    average_infer_time: Optional[float] = None
 
 
 @dataclass
@@ -208,6 +214,7 @@ class GenerateEncoderDecoderOutput(ModelOutput):
     cross_attentions: Optional[Tuple[Tuple[mindspore.Tensor]]] = None
     decoder_hidden_states: Optional[Tuple[Tuple[mindspore.Tensor]]] = None
     past_key_values: Optional[Tuple[Tuple[Tuple[mindspore.Tensor]]]] = None
+    average_infer_time: Optional[float] = None
 
 
 @dataclass
@@ -1642,6 +1649,7 @@ def generate(
                 - [`~generation.GenerateEncoderDecoderOutput`],
                 - [`~generation.GenerateBeamEncoderDecoderOutput`]
         """
+        _run_profiler = parse_flag_from_env('MS_ENABLE_RUNTIME_PROFILER', False)
        # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
         self._validate_model_class()
         tokenizer = kwargs.pop("tokenizer", None)  # Pull this out first, we only use it for stopping criteria
@@ -1850,6 +1858,11 @@
         prepared_stopping_criteria = self._get_stopping_criteria(
             generation_config=generation_config, stopping_criteria=stopping_criteria, tokenizer=tokenizer, **kwargs
         )
+
+        if _run_profiler:
+            _framework_profiler_step_start()
+            logger.warning('Enabling the profiler will generate large files. Please set `max_length` or `max_new_tokens` to a small value (less than 10 is recommended).')
+
         # 10. go into different generation modes
         if generation_mode == GenerationMode.ASSISTED_GENERATION:
             if generation_config.num_return_sequences > 1:
@@ -2107,6 +2120,9 @@ def typeerror():
                 **model_kwargs,
             )
 
+        if _run_profiler:
+            _framework_profiler_step_end()
+
         # Convert to legacy cache if needed
         if use_dynamic_cache_by_default and generation_config.return_legacy_cache:
             if isinstance(result, ModelOutput) and hasattr(result, "past_key_values"):
@@ -2902,9 +2918,14 @@ def _sample(
         unfinished_sequences = ops.ones(batch_size, dtype=mindspore.int64)
         model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
 
+        time_record = []
+        _record_time = parse_flag_from_env('INFERENCE_TIME_RECORD', False)
+
         while self._has_unfinished_sequences(
             this_peer_finished, synced_gpus, cur_len=cur_len, max_length=max_length
         ):
+            if _record_time:
+                infer_start = time.time()
             # prepare model inputs
             model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
 
@@ -2971,10 +2992,20 @@
             this_peer_finished = unfinished_sequences.max() == 0
             cur_len += 1
 
+            if _record_time:
+                infer_stop = time.time()
+                time_record.append(infer_stop - infer_start)
             # This is needed to properly delete outputs.logits which may be very large for first iteration
             # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration
             del outputs
 
+        average_infer_time = None
+        if time_record:
+            time_record.pop(0)
+            average_infer_time = sum(time_record) / len(time_record)
+            print(f'average inference time is: {average_infer_time}')
+            print(f'inference time record: {time_record}')
+
         if streamer is not None:
             streamer.end()
 
@@ -2990,6 +3021,7 @@
                     cross_attentions=cross_attentions,
                     decoder_hidden_states=decoder_hidden_states,
                     past_key_values=model_kwargs.get("past_key_values"),
+                    average_infer_time=average_infer_time
                 )
             else:
                 return GenerateDecoderOnlyOutput(
@@ -2999,6 +3031,7 @@
                     attentions=decoder_attentions,
                     hidden_states=decoder_hidden_states,
                     past_key_values=model_kwargs.get("past_key_values"),
+                    average_infer_time=average_infer_time
                 )
         else:
             return input_ids
@@ -3130,7 +3163,13 @@ def _beam_search(
 
         decoder_prompt_len = input_ids.shape[-1]  # record the prompt length of decoder
 
+
+        time_record = []
+        _record_time = parse_flag_from_env('INFERENCE_TIME_RECORD', False)
+
         while self._has_unfinished_sequences(this_peer_finished, synced_gpus):
+            if _record_time:
+                infer_start = time.time()
             model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
 
             # prepare variable output controls (note: some models won't accept all output controls)
@@ -3295,6 +3334,17 @@ def replace_negative_indices(next_tokens):
             if beam_scorer.is_done or all(stopping_criteria(input_ids, scores)):
                 this_peer_finished = True
 
+            if _record_time:
+                infer_stop = time.time()
+                time_record.append(infer_stop - infer_start)
+
+        average_infer_time = None
+        if time_record:
+            time_record.pop(0)
+            average_infer_time = sum(time_record) / len(time_record)
+            print(f'average inference time is: {average_infer_time}')
+            print(f'inference time record: {time_record}')
+
         sequence_outputs = beam_scorer.finalize(
             input_ids,
             beam_scores,
diff --git a/scripts/run_pynative_profile.sh b/scripts/run_pynative_profile.sh
index a2675f60f..ab7c2537e 100644
--- a/scripts/run_pynative_profile.sh
+++ b/scripts/run_pynative_profile.sh
@@ -1 +1,2 @@
-export MS_ENABLE_RUNTIME_PROFILER=1
\ No newline at end of file
+export MS_ENABLE_RUNTIME_PROFILER=1
+# export INFERENCE_TIME_RECORD=1
diff --git a/tests/ut/transformers/models/bert/test_modeling_bert.py b/tests/ut/transformers/models/bert/test_modeling_bert.py
index f3d66f80e..fdf02c882 100644
--- a/tests/ut/transformers/models/bert/test_modeling_bert.py
+++ b/tests/ut/transformers/models/bert/test_modeling_bert.py
@@ -677,4 +677,19 @@ def test_sdpa_ignored_mask(self):
             res_sdpa = model_sdpa(**inp, past_key_values=pkv)
         self.assertTrue(
             ops.allclose(res_eager.last_hidden_state, res_sdpa.last_hidden_state, atol=1e-3, rtol=1e-3)
-        )
\ No newline at end of file
+        )
+
+    @slow
+    def test_inference_time(self):
+        import time
+        model = BertModel.from_pretrained("google-bert/bert-base-uncased")
+        input_ids = mindspore.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
+        attention_mask = mindspore.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
+        infer_time = []
+        with no_grad():
+            for i in range(20):
+                s = time.time()
+                output = model(input_ids, attention_mask=attention_mask)[0]
+                t = time.time()
+                infer_time.append(t - s)
+        print(infer_time)
diff --git a/tests/ut/transformers/models/clip/test_modeling_clip.py b/tests/ut/transformers/models/clip/test_modeling_clip.py
index 495301283..63e06697f 100644
--- a/tests/ut/transformers/models/clip/test_modeling_clip.py
+++ b/tests/ut/transformers/models/clip/test_modeling_clip.py
@@ -12,18 +12,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Testing suite for the PyTorch CLIP model."""
+"""Testing suite for the MindSpore CLIP model."""
 
 import inspect
-import os
 import tempfile
 import unittest
-from typing import Optional, Tuple
 
 import numpy as np
 import requests
-from parameterized import parameterized
-from pytest import mark
 
 from mindnlp.transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
 from mindnlp.utils.testing_utils import (
@@ -647,3 +643,26 @@ def test_inference(self):
         print(outputs.logits_per_image)
 
         self.assertTrue(ops.allclose(outputs.logits_per_image, expected_logits, atol=1e-3))
+
+    @slow
+    def test_inference_time(self):
+        import time
+        model_name = "openai/clip-vit-base-patch32"
+        model = CLIPModel.from_pretrained(model_name)
+        processor = CLIPProcessor.from_pretrained(model_name)
+
+        image = prepare_img()
+        inputs = processor(
+            text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="ms"
+        )
+
+        infer_time = []
+        # forward pass
+        with no_grad():
+            for i in range(20):
+                s = time.time()
+                outputs = model(**inputs)
+                t = time.time()
+                infer_time.append(t - s)
+
+        print(infer_time)
diff --git a/tests/ut/transformers/models/mixtral/test_modeling_mixtral.py b/tests/ut/transformers/models/mixtral/test_modeling_mixtral.py
index 59c188185..03257d83a 100644
--- a/tests/ut/transformers/models/mixtral/test_modeling_mixtral.py
+++ b/tests/ut/transformers/models/mixtral/test_modeling_mixtral.py
@@ -446,3 +446,14 @@ def test_small_model_logits_batched(self):
         self.assertTrue(np.allclose(logits[0, :3, :3].half().asnumpy(), EXPECTED_LOGITS_LEFT.asnumpy(), atol=1e-3, rtol=1e-3))
         self.assertTrue(np.allclose(logits[0, -3:, -3:].half().asnumpy(), EXPECTED_LOGITS_LEFT_UNPADDED.asnumpy(), atol=1e-3, rtol=1e-3))
         self.assertTrue(np.allclose(logits[1, -3:, -3:].half().asnumpy(), EXPECTED_LOGITS_RIGHT_UNPADDED.asnumpy(), atol=1e-3, rtol=1e-3))
+
+    @slow
+    @require_mindspore
+    def test_small_model_generate_time(self):
+        model_id = "hf-internal-testing/Mixtral-tiny"
+        dummy_input = mindspore.Tensor([[0, 1, 0], [0, 1, 0]])
+
+        model = MixtralForCausalLM.from_pretrained(model_id, ms_dtype=mindspore.float16)
+        # TODO: might need to tweak it in case the logits do not match on our daily runners
+        # these logits have been obtained with the original megablocks implementation.
+        model.generate(dummy_input, max_new_tokens=20)
diff --git a/tests/ut/transformers/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/ut/transformers/models/qwen2_moe/test_modeling_qwen2_moe.py
index 09b3a4ec4..96fff4e60 100644
--- a/tests/ut/transformers/models/qwen2_moe/test_modeling_qwen2_moe.py
+++ b/tests/ut/transformers/models/qwen2_moe/test_modeling_qwen2_moe.py
@@ -553,3 +553,14 @@ def test_speculative_generation(self):
 
         del model
         gc.collect()
+
+    @slow
+    def test_model_a2_7b_generation_time(self):
+        EXPECTED_TEXT_COMPLETION = """To be or not to be, that is the question. This is the question that has been asked by many people over the"""
+        prompt = "To be or not to"
+        tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", use_fast=False, mirror='modelscope')
+        model = Qwen2MoeForCausalLM.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", mirror='modelscope')
+        input_ids = tokenizer.encode(prompt, return_tensors="ms")
+
+        # greedy generation outputs
+        generated_ids = model.generate(input_ids, max_new_tokens=10, do_sample=False)
diff --git a/tests/ut/transformers/models/t5/test_modeling_t5.py b/tests/ut/transformers/models/t5/test_modeling_t5.py
index a79da713e..f342ea513 100644
--- a/tests/ut/transformers/models/t5/test_modeling_t5.py
+++ b/tests/ut/transformers/models/t5/test_modeling_t5.py
@@ -1458,6 +1458,28 @@ def test_contrastive_search_t5(self):
         )
 
 
+    @slow
+    def test_translation_inference_time(self):
+        model = self.model  # google-t5/t5-base
+        tok = self.tokenizer
+        use_task_specific_params(model, "translation_en_to_fr")
+
+        en_text = (
+            ' This image section from an infrared recording by the Spitzer telescope shows a "family portrait" of'
+            " countless generations of stars: the oldest stars are seen as blue dots. "
+        )
+
+        input_ids = tok.encode(model.config.prefix + en_text, return_tensors="ms")
+        input_ids = input_ids
+
+        output = model.generate(
+            input_ids=input_ids,
+            max_new_tokens=50,
+            do_sample=False,
+        )
+        print(output)
+
+
 @require_mindspore
 class TestAsymmetricT5(unittest.TestCase):
     def build_model_and_check_forward_pass(self, **kwargs):
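
Usage sketch (illustrative, not part of the patch): with the diff above applied, the per-step timing added to `_sample`/`_beam_search` is switched on through the `INFERENCE_TIME_RECORD` environment variable, and the mean step latency is exposed as `average_infer_time` on the returned generate output. The checkpoint and prompt below are reused from `run_llama3.py` purely as an example, and `return_dict_in_generate=True` is assumed to be the standard switch for getting the output object back.

# Illustrative only: enable the step-timing flag before calling generate().
import os
os.environ["INFERENCE_TIME_RECORD"] = "1"        # read via parse_flag_from_env inside _sample/_beam_search
# os.environ["MS_ENABLE_RUNTIME_PROFILER"] = "1" # optional: also wraps generate() with the framework profiler

from mindnlp.transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "LLM-Research/Meta-Llama-3-8B-Instruct"   # same checkpoint as run_llama3.py; any causal LM works
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

input_ids = tokenizer.encode("To be or not to", return_tensors="ms")
outputs = model.generate(
    input_ids,
    max_new_tokens=20,
    do_sample=False,
    return_dict_in_generate=True,   # assumed: needed to get the output object carrying average_infer_time
)
# The first step is dropped as warm-up; the remaining per-step times are averaged.
print(outputs.average_infer_time)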