Fix issue #14: RAM optimization

BKDDFS · web-flow · commit 1c5e06dca604 · 2024-05-21T22:13:57.000+02:00
### Release Notes - Version 2.2.0

#### New Features
- **API Endpoint Versioning (v2)**:
  - Added a `/v2` version prefix to the API endpoints (for example `/v2/status` and `/v2/extractors/{extractor_name}`) to support multiple API versions and improve backward compatibility.

#### Improvements
- **Optimized Model Handling**:
  - Model downloading and evaluator creation are now skipped when they are not needed (that is, when `all_frames` is enabled and the evaluator would not be used), resulting in faster initialization times and reduced resource usage.

#### Bug Fixes
- **High RAM Usage Issue**:
  - Fixed an issue causing high RAM usage due to the accumulation of frames in memory before they are written to disk. Frames are now saved batch by batch instead of being collected for the whole video first. This fix improves performance and stability, especially on systems with limited memory.
- **Python Versioning Fix**:
  - Corrected the Python version constraint in `pyproject.toml` (now `>=3.10,<3.12`) required by `fcs-filesystem` (TensorFlow) to ensure compatibility and smooth deployment.
- **Manual Garbage Collection**:
  - Added explicit garbage collection (`gc.collect()`) and deletion of no-longer-needed references (`del`) after each frame batch is processed, to manage memory more efficiently.
diff --git a/common/common.py b/common/common.py
@@ -48,5 +48,6 @@ def setup_best_frames_extractor_env(files_dir, best_frames_dir) -> tuple[Path, P
         shutil.rmtree(best_frames_dir)
     assert not best_frames_dir.is_dir(), "Output directory was not removed"
     best_frames_dir.mkdir()
+    assert best_frames_dir.is_dir(), "Output dir was not created after cleaning."
 
     return files_dir, best_frames_dir, expected_video_path
diff --git a/extractor_service/app/extractors.py b/extractor_service/app/extractors.py
@@ -27,6 +27,7 @@
 from abc import ABC, abstractmethod
 import logging
 from typing import Type
+import gc
 
 import numpy as np
 
@@ -232,63 +233,56 @@ def process(self) -> None:
                     self._config.input_directory)
         videos_paths = self._list_input_directory_files(self._config.video_extensions,
                                                         self._config.processed_video_prefix)
-        self._get_image_evaluator()
+        if self._config.all_frames is False:  # evaluator won't be used if all frames
+            self._get_image_evaluator()
         for video_path in videos_paths:
-            frames = self._extract_best_frames(video_path)
-            self._save_images(frames)
+            self._extract_best_frames(video_path)
             self._add_prefix(self._config.processed_video_prefix, video_path)
             logger.info("Frames extraction has finished for video: %s", video_path)
         logger.info("Extraction process finished. All frames extracted.")
         self._signal_readiness_for_shutdown()
 
-    def _extract_best_frames(self, video_path: Path) -> list[np.ndarray]:
+    def _extract_best_frames(self, video_path: Path) -> None:
         """
         Extract best visually frames from given video.
 
         Args:
             video_path (Path): Path of the video that will be extracted.
-
-        Returns:
-            list[np.ndarray]: List of best images(frames) from the given video.
         """
-        best_frames = []
         frames_batch_generator = OpenCVVideo.get_next_frames(video_path, self._config.batch_size)
         for frames in frames_batch_generator:
             if not frames:
                 continue
             logger.debug("Frames batch generated.")
-            if self._config.all_frames:
-                best_frames.extend(frames)
-                continue
-            normalized_images = self._normalize_images(frames, self._config.target_image_size)
-            scores = self._evaluate_images(normalized_images)
-            selected_frames = self._get_best_frames(frames, scores,
-                                                    self._config.compering_group_size)
-            best_frames.extend(selected_frames)
-        return best_frames
+            if not self._config.all_frames:
+                frames = self._get_best_frames(frames)
+            self._save_images(frames)
+            del frames
+            gc.collect()
 
-    @staticmethod
-    def _get_best_frames(images: list[np.ndarray], scores: np.array,
-                         comparing_group_size: int) -> list[np.ndarray]:
+    def _get_best_frames(self, frames: list[np.ndarray]) -> list[np.ndarray]:
         """
         Splits images batch for comparing groups and select best image for each group.
 
         Args:
-            images (list[np.ndarray]): Batch of images in numpy ndarray.
-            scores (np.array): Array with images scores with images batch order.
-            comparing_group_size (int): The size of the groups into which the batch will be divided.
+            frames (list[np.ndarray]): Batch of images in numpy ndarray.
 
         Returns:
             list[np.ndarray]: Best images list.
         """
-        best_images = []
-        groups = np.array_split(scores, np.arange(comparing_group_size, len(scores), comparing_group_size))
+        normalized_images = self._normalize_images(frames, self._config.target_image_size)
+        scores = self._evaluate_images(normalized_images)
+        del normalized_images
+
+        best_frames = []
+        group_size = self._config.compering_group_size
+        groups = np.array_split(scores, np.arange(group_size, len(scores), group_size))
         for index, group in enumerate(groups):
             best_index = np.argmax(group)
-            global_index = index * comparing_group_size + best_index
-            best_images.append(images[global_index])
-        logger.info("Best frames selected(%s).", len(best_images))
-        return best_images
+            global_index = index * group_size + best_index
+            best_frames.append(frames[global_index])
+        logger.info("Best frames selected(%s).", len(best_frames))
+        return best_frames
 
 
 class TopImagesExtractor(Extractor):
diff --git a/extractor_service/app/tests/e2e/best_frames_extractor_api_test.py b/extractor_service/app/tests/e2e/best_frames_extractor_api_test.py
@@ -10,7 +10,7 @@ def test_best_frames_extractor_api(client, setup_best_frames_extractor_env):
         "output_directory": str(output_directory)
     }
 
-    response = client.post(f"/extractors/{extractor_name}", json=config)
+    response = client.post(f"/v2/extractors/{extractor_name}", json=config)
 
     assert response.status_code == 200
     assert response.json()["message"] == f"'{extractor_name}' started."
diff --git a/extractor_service/app/tests/e2e/frames_extractor_test.py b/extractor_service/app/tests/e2e/frames_extractor_test.py
@@ -11,7 +11,7 @@ def test_frames_extractor_api(client, setup_best_frames_extractor_env):
         "all_frames": True
     }
 
-    response = client.post(f"/extractors/{extractor_name}", json=config)
+    response = client.post(f"/v2/extractors/{extractor_name}", json=config)
 
     assert response.status_code == 200
     assert response.json()["message"] == f"'{extractor_name}' started."
diff --git a/extractor_service/app/tests/e2e/top_images_extractor_api_test.py b/extractor_service/app/tests/e2e/top_images_extractor_api_test.py
@@ -10,7 +10,7 @@ def test_top_images_extractor_api(client, setup_top_images_extractor_env):
         "output_directory": str(output_directory)
     }
 
-    response = client.post(f"/extractors/{extractor_name}", json=config)
+    response = client.post(f"/v2/extractors/{extractor_name}", json=config)
 
     assert response.status_code == 200
     assert response.json()["message"] == f"'{extractor_name}' started."
diff --git a/extractor_service/app/tests/integration/extractor_and_video_processor_integration_test.py b/extractor_service/app/tests/integration/extractor_and_video_processor_integration_test.py
@@ -1,18 +1,15 @@
-import numpy as np
-
-
-def test_extract_best_frames(extractor, files_dir, config):
-    entries = list(files_dir.iterdir())
+def test_extract_best_frames(extractor, config, setup_best_frames_extractor_env):
+    input_dir, output_dir, _ = setup_best_frames_extractor_env
+    entries = list(input_dir.iterdir())
     assert len(entries) > 0, "None entries in files_dir found"
     videos = [
         entry for entry in entries
         if entry.is_file() and entry.suffix in config.video_extensions
     ]
     assert len(list(videos)) > 0, "None videos in files_dir found"
+    assert not any(output_dir.iterdir()), "Output dir has entries before test"
 
     extractor._get_image_evaluator()
-    result = extractor._extract_best_frames(videos[0])
+    extractor._extract_best_frames(videos[0])
 
-    assert isinstance(result, list)
-    for frame in result:
-        assert isinstance(frame, np.ndarray)
+    assert any(output_dir.iterdir()), "Output dir is empty."
diff --git a/extractor_service/app/tests/unit/best_frames_extractor_test.py b/extractor_service/app/tests/unit/best_frames_extractor_test.py
@@ -9,6 +9,13 @@
 from app.video_processors import OpenCVVideo
 
 
+@pytest.fixture
+def all_frames_extractor(extractor):
+    extractor._config.all_frames = True
+    yield extractor
+    extractor._config.all_frames = False
+
+
 @pytest.fixture(scope="function")
 def extractor(config):
     extractor = BestFramesExtractor(config)
@@ -21,7 +28,6 @@ def test_process(extractor, caplog, config):
     extractor._list_input_directory_files = MagicMock(return_value=test_videos)
     extractor._get_image_evaluator = MagicMock()
     extractor._extract_best_frames = MagicMock(return_value=test_frames)
-    extractor._save_images = MagicMock()
     extractor._add_prefix = MagicMock()
     extractor._signal_readiness_for_shutdown = MagicMock()
 
@@ -32,92 +38,101 @@ def test_process(extractor, caplog, config):
         config.video_extensions, config.processed_video_prefix)
     extractor._get_image_evaluator.assert_called_once()
     assert extractor._extract_best_frames.call_count == len(test_videos)
-    assert extractor._save_images.call_count == len(test_videos)
     assert extractor._add_prefix.call_count == len(test_videos)
     extractor._signal_readiness_for_shutdown.assert_called_once()
     for video in test_videos:
         extractor._add_prefix.assert_any_call(config.processed_video_prefix, video)
         extractor._extract_best_frames.assert_any_call(video)
-        extractor._save_images.assert_any_call(test_frames)
         assert f"Frames extraction has finished for video: {video}" in caplog.text
     assert f"Starting frames extraction process from '{config.input_directory}'." in caplog.text
 
 
+def test_process_if_all_frames(extractor, caplog, config, all_frames_extractor):
+    test_videos = ["/fake/directory/video1.mp4", "/fake/directory/video2.mp4"]
+    test_frames = ["frame1", "frame2"]
+    extractor._list_input_directory_files = MagicMock(return_value=test_videos)
+    extractor._get_image_evaluator = MagicMock()
+    extractor._extract_best_frames = MagicMock(return_value=test_frames)
+    extractor._add_prefix = MagicMock()
+    extractor._signal_readiness_for_shutdown = MagicMock()
+
+    with caplog.at_level(logging.INFO):
+        extractor.process()
+
+    extractor._list_input_directory_files.assert_called_once_with(
+        config.video_extensions, config.processed_video_prefix)
+    extractor._get_image_evaluator.assert_not_called()
+    assert not extractor._image_evaluator
+    assert extractor._extract_best_frames.call_count == len(test_videos)
+    assert extractor._add_prefix.call_count == len(test_videos)
+    extractor._signal_readiness_for_shutdown.assert_called_once()
+    for video in test_videos:
+        extractor._add_prefix.assert_any_call(config.processed_video_prefix, video)
+        extractor._extract_best_frames.assert_any_call(video)
+        assert f"Frames extraction has finished for video: {video}" in caplog.text
+    assert f"Starting frames extraction process from '{config.input_directory}'." in caplog.text
+
+
+@patch("app.extractors.gc.collect")
+@patch.object(BestFramesExtractor, "_get_best_frames")
+@patch.object(BestFramesExtractor, "_save_images")
 @patch.object(OpenCVVideo, "get_next_frames")
-@patch.object(BestFramesExtractor, "_normalize_images")
-def test_extract_best_frames(mock_normalize, mock_get_next_frames, extractor, caplog):
-    video_path = Path("/fake/video.mp4")
-    frames_batch = [MagicMock() for _ in range(10)]
-    frames_batch_1 = frames_batch
-    frames_batch_2 = []
-    frames_batch_3 = frames_batch
-    mock_get_next_frames.return_value = iter([frames_batch_1, frames_batch_2, frames_batch_3])
-    normalized_frames_1 = MagicMock(spec=np.ndarray)
-    normalized_frames_2 = MagicMock(spec=np.ndarray)
-    mock_normalize.side_effect = [normalized_frames_1, normalized_frames_2]
-    test_ratings = [5, 6, 3, 8, 5, 2, 9, 1, 4, 7]
-    extractor._evaluate_images = MagicMock(return_value=test_ratings)
-    extractor._get_best_frames = MagicMock(
-        side_effect=lambda frames, ratings, group_size: [frames[i] for i in [3, 6]])
-
-    with caplog.at_level(logging.DEBUG):
-        best_frames = extractor._extract_best_frames(video_path)
-
-    mock_get_next_frames.assert_called_once_with(video_path, extractor._config.batch_size)
-    assert extractor._evaluate_images.call_count == 2
-    assert extractor._normalize_images.call_count == 2
-    assert extractor._get_best_frames.call_count == 2
-    assert len(best_frames) == 4
-    extractor._evaluate_images.assert_any_call(normalized_frames_1)
-    extractor._evaluate_images.assert_any_call(normalized_frames_2)
-    for batch in [frames_batch_1, frames_batch_3]:
-        extractor._get_best_frames.assert_any_call(
-            batch,
-            test_ratings,
-            extractor._config.compering_group_size
-        )
-    assert caplog.text.count("Frames batch generated.") == 2
+def test_extract_best_frames(mock_generator, mock_save, mock_get, mock_collect, extractor):
+    video_path = MagicMock(spec=Path)
 
+    batch_1 = [f"frame{i}" for i in range(5)]
+    batch_2 = []
+    batch_3 = [f"frame{i}" for i in range(5)]
+    mock_generator.return_value = iter([batch_1, batch_2, batch_3])
 
-@pytest.fixture
-def all_frames_extractor(extractor):
-    extractor._config.all_frames = True
-    yield extractor
-    extractor._config.all_frames = False
+    mock_get.side_effect = [batch_1, batch_3]
 
+    extractor._extract_best_frames(video_path)
 
-@patch.object(BestFramesExtractor, "_evaluate_images")
+    assert not extractor._config.all_frames
+    mock_generator.assert_called_once_with(video_path, extractor._config.batch_size)
+    assert mock_get.call_count == 2
+    for batch in [batch_1, batch_3]:
+        mock_save.assert_called_with(batch)
+    assert mock_collect.call_count == 2
+
+
+@patch("app.extractors.gc.collect")
 @patch.object(BestFramesExtractor, "_get_best_frames")
+@patch.object(BestFramesExtractor, "_save_images")
 @patch.object(OpenCVVideo, "get_next_frames")
+def test_extract_all_frames(mock_generator, mock_save, mock_get, mock_collect, all_frames_extractor):
+    video_path = MagicMock(spec=Path)
+
+    batch_1 = [f"frame{i}" for i in range(5)]
+    batch_2 = []
+    batch_3 = [f"frame{i}" for i in range(5)]
+    mock_generator.return_value = iter([batch_1, batch_2, batch_3])
+
+    all_frames_extractor._extract_best_frames(video_path)
+
+    assert all_frames_extractor._config.all_frames
+    mock_generator.assert_called_once_with(video_path, all_frames_extractor._config.batch_size)
+    assert mock_get.assert_not_called
+    for batch in [batch_1, batch_3]:
+        mock_save.assert_called_with(batch)
+    assert mock_collect.call_count == 2
+
+
 @patch.object(BestFramesExtractor, "_normalize_images")
-def test_extract_all_frames(mock_normalize, mock_get_next_frames,
-                            mock_get, mock_evaluate, all_frames_extractor, caplog):
-    video_path = Path("/fake/video.mp4")
-    frames_batch = [MagicMock() for _ in range(3)]
-    frames_batch_1 = frames_batch
-    frames_batch_2 = []
-    frames_batch_3 = frames_batch
-    mock_get_next_frames.return_value = iter([frames_batch_1, frames_batch_2, frames_batch_3])
-
-    with caplog.at_level(logging.DEBUG):
-        best_frames = all_frames_extractor._extract_best_frames(video_path)
-
-    mock_get_next_frames.assert_called_once_with(video_path, all_frames_extractor._config.batch_size)
-    assert len(best_frames) == 6
-    mock_evaluate.assert_not_called()
-    mock_normalize.assert_not_called()
-    mock_get.assert_not_called()
-    assert caplog.text.count("Frames batch generated.") == 2
-
-
-def test_get_best_frames(caplog, extractor):
-    images = [MagicMock(spec=np.ndarray) for _ in range(10)]
-    ratings = np.array([7, 2, 9, 3, 8, 5, 10, 1, 4, 6])
-    batch_size = 3
-    expected_best_images = [images[2], images[4], images[6], images[9]]
+@patch.object(BestFramesExtractor, "_evaluate_images")
+def test_get_best_frames(mock_evaluate, mock_normalize, caplog, extractor, config):
+    frames = [f"frames{i}" for i in range(10)]
+    scores = np.array([7, 2, 9, 3, 8, 5, 10, 1, 4, 6])
+    normalized_images = [MagicMock() for _ in range(10)]
+    mock_normalize.return_value = normalized_images
+    mock_evaluate.return_value = scores
+    expected_best_images = [frames[2], frames[6]]
 
     with caplog.at_level(logging.INFO):
-        best_images = extractor._get_best_frames(images, ratings, batch_size)
+        best_images = extractor._get_best_frames(frames)
 
+    mock_evaluate.assert_called_once_with(normalized_images)
+    mock_normalize.assert_called_once_with(frames, config.target_image_size)
     assert best_images == expected_best_images
     assert f"Best frames selected({len(expected_best_images)})." in caplog.text
diff --git a/extractor_service/main.py b/extractor_service/main.py
@@ -41,7 +41,7 @@
 app = FastAPI()
 
 
-@app.get("/status")
+@app.get("/v2/status")
 def get_extractors_status() -> ExtractorStatus:
     """
     Checks is some extractor already running on service.
@@ -52,7 +52,7 @@ def get_extractors_status() -> ExtractorStatus:
     return ExtractorStatus(active_extractor=ExtractorManager.get_active_extractor())
 
 
-@app.post("/extractors/{extractor_name}")
+@app.post("/v2/extractors/{extractor_name}")
 def run_extractor(background_tasks: BackgroundTasks, extractor_name: str,
                   config: ExtractorConfig = ExtractorConfig()) -> Message:
     """
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,13 +1,13 @@
 [tool.poetry]
 name = "PerfectFrameAI"
-version = "2.1.0"
+version = "2.2.0"
 description = "AI tool for finding the most aesthetic frames in a video. 🎞️➜🖼️"
 authors = ["Bartłomiej Flis <Bartekdawidflis@gmail.com>"]
 license = "GPL-3.0 license"
 readme = "README.md"
 
 [tool.poetry.dependencies]
-python = "^3.10"
+python = ">=3.10,<3.12"
 fastapi = "^0.111.0"
 uvicorn = "^0.29.0"
 numpy = "^1.26.4"
diff --git a/service_manager/service_initializer.py b/service_manager/service_initializer.py
@@ -66,7 +66,7 @@ def _check_directory(directory: str) -> Path:
     def run_extractor(self, extractor_url: Union[str, None] = None) -> None:
         """Send POST request to local port extractor service to start chosen extractor."""
         if not extractor_url:
-            extractor_url = f"http://localhost:{self._port}/extractors/{self._extractor_name}"
+            extractor_url = f"http://localhost:{self._port}/v2/extractors/{self._extractor_name}"
         json_data = {"all_frames": self._all_frames}
         req = Request(
             extractor_url, method="POST",
diff --git a/service_manager/tests/unit/service_initializer_test.py b/service_manager/tests/unit/service_initializer_test.py
@@ -72,7 +72,7 @@ def test_check_valid_directory():
 
 @patch.object(time, "time")
 def test_run_extractor_post_request(mock_time, service):
-    test_url = f"http://localhost:{service._port}/extractors/{service._extractor_name}"
+    test_url = f"http://localhost:{service._port}/v2/extractors/{service._extractor_name}"
     test_method = "POST"
     start_time = 100
     mock_time.side_effect = [start_time, start_time + 1, start_time + 2, start_time + 3]

Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,7 @@ def test_best_frames_extractor_api(client, setup_best_frames_extractor_env):`
`10`	`10`	`"output_directory": str(output_directory)`
`11`	`11`	`}`
`12`	`12`
`13`		`- response = client.post(f"/extractors/{extractor_name}", json=config)`
	`13`	`+ response = client.post(f"/v2/extractors/{extractor_name}", json=config)`
`14`	`14`
`15`	`15`	`assert response.status_code == 200`
`16`	`16`	`assert response.json()["message"] == f"'{extractor_name}' started."`
Original file line number	Diff line number	Diff line change
`@@ -11,7 +11,7 @@ def test_frames_extractor_api(client, setup_best_frames_extractor_env):`
`11`	`11`	`"all_frames": True`
`12`	`12`	`}`
`13`	`13`
`14`		`- response = client.post(f"/extractors/{extractor_name}", json=config)`
	`14`	`+ response = client.post(f"/v2/extractors/{extractor_name}", json=config)`
`15`	`15`
`16`	`16`	`assert response.status_code == 200`
`17`	`17`	`assert response.json()["message"] == f"'{extractor_name}' started."`
Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,7 @@ def test_top_images_extractor_api(client, setup_top_images_extractor_env):`
`10`	`10`	`"output_directory": str(output_directory)`
`11`	`11`	`}`
`12`	`12`
`13`		`- response = client.post(f"/extractors/{extractor_name}", json=config)`
	`13`	`+ response = client.post(f"/v2/extractors/{extractor_name}", json=config)`
`14`	`14`
`15`	`15`	`assert response.status_code == 200`
`16`	`16`	`assert response.json()["message"] == f"'{extractor_name}' started."`