From d56154d33a0746f6434bcf347e728c882bfc7106 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Tue, 13 May 2025 10:18:57 +0100
Subject: [PATCH 01/79] CU-8699049kf: Bump requirement to v2 (0.3.3)

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index a14b289..5ce162b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 spacy<3.8.0
-medcat~=1.12.0
+medcat2[meta-cat] @ git+https://github.com/CogStack/MedCAT2@v0.3.3
 plotly~=5.19.0
 eland==8.12.1
 en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl

From dcdd984e75c46a88d682b4e88712088731e3399d Mon Sep 17 00:00:00 2001
From: mart-r
Date: Tue, 13 May 2025 14:16:53 +0100
Subject: [PATCH 02/79] CU-8698up3x0: Add optional extras needed for WWC

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 5ce162b..9b48f7f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 spacy<3.8.0
-medcat2[meta-cat] @ git+https://github.com/CogStack/MedCAT2@v0.3.3
+medcat2[meta-cat,spacy,deid] @ git+https://github.com/CogStack/MedCAT2@v0.3.3
 plotly~=5.19.0
 eland==8.12.1
 en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl

From cf3cfbcc281607fa823c33282cea6013283188cd Mon Sep 17 00:00:00 2001
From: mart-r
Date: Tue, 13 May 2025 15:22:35 +0100
Subject: [PATCH 03/79] CU-8699049kf: Update requirements to work with newer packages.

There doesn't exist a version of eland that supports numpy 2, so we can't
install it alongside medcat v2. Spacy 3.8+ is needed for pydantic 2 support.
---
 requirements.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 9b48f7f..89ee849 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
-spacy<3.8.0
+spacy>=3.8.0,<4.0
 medcat2[meta-cat,spacy,deid] @ git+https://github.com/CogStack/MedCAT2@v0.3.3
 plotly~=5.19.0
-eland==8.12.1
-en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl
+# eland~=8.18.1 # NOTE: there is no numpy2-compatible eland release as of 2025-05-13
+en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl
 ipyfilechooser
 jupyter_contrib_nbextensions

From 03e9814cf2c137117bbf9d27df543d881a6b96c7 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Tue, 13 May 2025 15:24:14 +0100
Subject: [PATCH 04/79] CU-8699049kf: Add compatibility layer as a package

---
 compatibility_package/medcat.py | 29 +++++++++++++++++++++++++++++
 compatibility_package/setup.py  |  9 +++++++++
 2 files changed, 38 insertions(+)
 create mode 100644 compatibility_package/medcat.py
 create mode 100644 compatibility_package/setup.py

diff --git a/compatibility_package/medcat.py b/compatibility_package/medcat.py
new file mode 100644
index 0000000..9cee220
--- /dev/null
+++ b/compatibility_package/medcat.py
@@ -0,0 +1,29 @@
+import sys
+import importlib
+import medcat2
+
+# Copy all attributes from medcat2 to this module
+for attr in dir(medcat2):
+    if not attr.startswith('__'):
+        globals()[attr] = getattr(medcat2, attr)
+
+
+# Set up submodule redirections
+class SubmoduleProxy:
+    def __init__(self, target_module_name):
+        self.target_module_name = target_module_name
+
+    def __getattr__(self, name):
+        return 
getattr(importlib.import_module(self.target_module_name), name)
+
+
+# For each submodule in medcat2, create a proxy in sys.modules
+for module_name in list(sys.modules.keys()):
+    if (module_name.startswith('medcat2.') and
+            not module_name.startswith('medcat.')):
+        submodule_name = module_name.replace('medcat2.', 'medcat.', 1)
+    elif module_name == 'medcat2':
+        submodule_name = 'medcat'
+    else:
+        continue
+    sys.modules[submodule_name] = SubmoduleProxy(module_name)
diff --git a/compatibility_package/setup.py b/compatibility_package/setup.py
new file mode 100644
index 0000000..12f2205
--- /dev/null
+++ b/compatibility_package/setup.py
@@ -0,0 +1,9 @@
+from setuptools import setup
+
+setup(
+    name="medcat",
+    version="2.0.0-beta",
+    description="Compatibility layer for medcat2",
+    py_modules=["medcat"],
+    install_requires=["medcat2"],
+)

From 0e47df42f16f849bf0afa9cbc127859dc8c3cfa9 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Tue, 13 May 2025 15:30:17 +0100
Subject: [PATCH 05/79] CU-8699049kf: Install compatibility layer with requirements

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 89ee849..eb25b48 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ plotly~=5.19.0
 en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl
 ipyfilechooser
 jupyter_contrib_nbextensions
+-e ./compatibility_package

From 4001e88414defcf85f8333f0806f32feaaaeaa75 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Tue, 13 May 2025 15:33:55 +0100
Subject: [PATCH 06/79] CU-8699049kf: Run workflow on Ubuntu 24.04 instead of EoL 20.04

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index d7fa37f..46fa004 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -9,7 +9,7 @@ on:
 jobs:
   native-py:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-24.04
     strategy:
       matrix:
         python-version: [ '3.8', '3.9', '3.10', '3.11' ]

From 7690236646cbcf4b65046531dded14fe1de5fd3d Mon Sep 17 00:00:00 2001
From: mart-r
Date: Tue, 13 May 2025 15:34:21 +0100
Subject: [PATCH 07/79] CU-8699049kf: Run workflow on 3.9<=python<=3.12

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 46fa004..c90fb5c 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-24.04
     strategy:
       matrix:
-        python-version: [ '3.8', '3.9', '3.10', '3.11' ]
+        python-version: [ '3.9', '3.10', '3.11', "3.12" ]
       max-parallel: 4
     steps:

From a8983cb0a97f35466bec77bfa5b51ea516830bec Mon Sep 17 00:00:00 2001
From: mart-r
Date: Tue, 13 May 2025 16:09:48 +0100
Subject: [PATCH 08/79] CU-8699049kf: Add type ignoring in compatibility package

---
 compatibility_package/medcat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compatibility_package/medcat.py b/compatibility_package/medcat.py
index 9cee220..46974e9 100644
--- a/compatibility_package/medcat.py
+++ b/compatibility_package/medcat.py
@@ -26,4 +26,4 @@ def __getattr__(self, name):
         submodule_name = 'medcat'
     else:
         continue
-    sys.modules[submodule_name] = SubmoduleProxy(module_name)
+    sys.modules[submodule_name] = SubmoduleProxy(module_name)  # type: ignore

From 2ea56500e96a3b81d90274e13802fe2b9016eff4 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Tue, 13 May 2025 16:24:33 +0100
Subject: [PATCH 
09/79] CU-8699049kf: Add custom test runner --- tests/custom_test_runner.py | 63 +++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 tests/custom_test_runner.py diff --git a/tests/custom_test_runner.py b/tests/custom_test_runner.py new file mode 100644 index 0000000..7effaa4 --- /dev/null +++ b/tests/custom_test_runner.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python +""" +Custom test runner that ensures the compatibility layer is loaded +before unittest discovers and runs tests. +""" +import sys +import unittest +import importlib + +# First, ensure medcat compatibility is set up +try: + import medcat +except ImportError: + try: + import medcat2 + + # Manually set up the redirection + print("Setting up medcat compatibility layer...") + + # Create module for medcat + import types + medcat_module = types.ModuleType('medcat') + sys.modules['medcat'] = medcat_module + + # Copy attributes + for attr in dir(medcat2): + if not attr.startswith('__'): + setattr(medcat_module, attr, getattr(medcat2, attr)) + + # Set up submodule proxies + class SubmoduleProxy: + def __init__(self, target_module_name): + self.target_module_name = target_module_name + + def __getattr__(self, name): + return getattr(importlib.import_module(self.target_module_name), name) + + # Add proxies for submodules + for module_name in list(sys.modules.keys()): + if module_name.startswith('medcat2.'): + submodule_name = module_name.replace('medcat2.', 'medcat.', 1) + elif module_name == 'medcat2': + submodule_name = 'medcat' + else: + continue + sys.modules[submodule_name] = SubmoduleProxy(module_name) # type: ignore + except ImportError: + print("Warning: Neither medcat nor medcat2 could be imported") + +# Now run the tests +if __name__ == '__main__': + # Get all tests + test_loader = unittest.TestLoader() + + # You can customize the test discovery path here + test_suite = test_loader.discover('tests') + + # Run the tests + test_runner = unittest.TextTestRunner(verbosity=2) + result = test_runner.run(test_suite) + + # Return non-zero exit code if tests failed + sys.exit(not result.wasSuccessful()) From 232cebd18098780fb3c975c27455f379779fb54f Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 13 May 2025 16:33:04 +0100 Subject: [PATCH 10/79] CU-8699049kf: Update custom runner to allow for specific test locations --- tests/custom_test_runner.py | 44 +++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/tests/custom_test_runner.py b/tests/custom_test_runner.py index 7effaa4..2d7c3a1 100644 --- a/tests/custom_test_runner.py +++ b/tests/custom_test_runner.py @@ -4,8 +4,10 @@ before unittest discovers and runs tests. 
""" import sys +import os import unittest import importlib +import argparse # First, ensure medcat compatibility is set up try: @@ -13,20 +15,20 @@ except ImportError: try: import medcat2 - + # Manually set up the redirection print("Setting up medcat compatibility layer...") - + # Create module for medcat import types medcat_module = types.ModuleType('medcat') sys.modules['medcat'] = medcat_module - + # Copy attributes for attr in dir(medcat2): if not attr.startswith('__'): setattr(medcat_module, attr, getattr(medcat2, attr)) - + # Set up submodule proxies class SubmoduleProxy: def __init__(self, target_module_name): @@ -34,7 +36,7 @@ def __init__(self, target_module_name): def __getattr__(self, name): return getattr(importlib.import_module(self.target_module_name), name) - + # Add proxies for submodules for module_name in list(sys.modules.keys()): if module_name.startswith('medcat2.'): @@ -49,14 +51,34 @@ def __getattr__(self, name): # Now run the tests if __name__ == '__main__': - # Get all tests + # Parse arguments to mimic unittest discover behavior + parser = argparse.ArgumentParser(description='Run tests with compatibility layer') + parser.add_argument('-s', '--start-directory', default='tests', + help='Directory to start discovery (default: tests)') + parser.add_argument('-p', '--pattern', default='test*.py', + help='Pattern to match test files (default: test*.py)') + parser.add_argument('-t', '--top-level-directory', default=None, + help='Top level directory of project (default: None)') + parser.add_argument('--verbosity', '-v', type=int, default=2, + help='Verbosity level (default: 2)') + + args = parser.parse_args() + + # Ensure the start directory exists + if not os.path.isdir(args.start_directory): + print(f"Error: Start directory '{args.start_directory}' does not exist") + sys.exit(1) + + # Get all tests using the specified parameters test_loader = unittest.TestLoader() - - # You can customize the test discovery path here - test_suite = test_loader.discover('tests') - + test_suite = test_loader.discover( + start_dir=args.start_directory, + pattern=args.pattern, + top_level_dir=args.top_level_directory + ) + # Run the tests - test_runner = unittest.TextTestRunner(verbosity=2) + test_runner = unittest.TextTestRunner(verbosity=args.verbosity) result = test_runner.run(test_suite) # Return non-zero exit code if tests failed From 027fc926753977c31f5c3b735c5c97063cc53a49 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 13 May 2025 16:33:38 +0100 Subject: [PATCH 11/79] CU-8699049kf: Use custom test runner in workflow --- .github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c90fb5c..dc2355b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -34,8 +34,8 @@ jobs: python -m mypy `git ls-tree --full-tree --name-only -r HEAD | grep ".py$" | grep -v "tests/"` --explicit-package-bases --follow-imports=normal - name: Test run: | - python -m unittest discover - python -m unittest discover -s medcat/compare_models + python tests/run_tests.py + python tests/run_tests.py -s medcat/compare_models # TODO - in the future, we might want to add automated tests for notebooks as well # though it's not really possible right now since the notebooks are designed # in a way that assumes interaction (i.e specifying model pack names) From 126831e598c4ab11649c6d7f004fe3f68759693a Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 13 May 2025 16:39:30 +0100 Subject: [PATCH 12/79] 
CU-8699049kf: Fix custom test runner name in workflow --- .github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index dc2355b..4b6c311 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -34,8 +34,8 @@ jobs: python -m mypy `git ls-tree --full-tree --name-only -r HEAD | grep ".py$" | grep -v "tests/"` --explicit-package-bases --follow-imports=normal - name: Test run: | - python tests/run_tests.py - python tests/run_tests.py -s medcat/compare_models + python tests/custom_test_runner.py + python tests/custom_test_runner.py -s medcat/compare_models # TODO - in the future, we might want to add automated tests for notebooks as well # though it's not really possible right now since the notebooks are designed # in a way that assumes interaction (i.e specifying model pack names) From 2bde16f8c51a64586a58bb9055655e11c0b3ec05 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 13 May 2025 17:14:28 +0100 Subject: [PATCH 13/79] CU-8699049kf: Move test runner to a different folder to fix issues --- tests/{ => runner}/custom_test_runner.py | 43 ++++-------------------- 1 file changed, 6 insertions(+), 37 deletions(-) rename tests/{ => runner}/custom_test_runner.py (55%) diff --git a/tests/custom_test_runner.py b/tests/runner/custom_test_runner.py similarity index 55% rename from tests/custom_test_runner.py rename to tests/runner/custom_test_runner.py index 2d7c3a1..f12424f 100644 --- a/tests/custom_test_runner.py +++ b/tests/runner/custom_test_runner.py @@ -10,44 +10,13 @@ import argparse # First, ensure medcat compatibility is set up -try: - import medcat -except ImportError: - try: - import medcat2 +import medcat +print("medcat?", medcat, ":", dir(medcat)) +import medcat.cat +import medcat.vocab +import medcat.cdb +print("Loaded medcat") - # Manually set up the redirection - print("Setting up medcat compatibility layer...") - - # Create module for medcat - import types - medcat_module = types.ModuleType('medcat') - sys.modules['medcat'] = medcat_module - - # Copy attributes - for attr in dir(medcat2): - if not attr.startswith('__'): - setattr(medcat_module, attr, getattr(medcat2, attr)) - - # Set up submodule proxies - class SubmoduleProxy: - def __init__(self, target_module_name): - self.target_module_name = target_module_name - - def __getattr__(self, name): - return getattr(importlib.import_module(self.target_module_name), name) - - # Add proxies for submodules - for module_name in list(sys.modules.keys()): - if module_name.startswith('medcat2.'): - submodule_name = module_name.replace('medcat2.', 'medcat.', 1) - elif module_name == 'medcat2': - submodule_name = 'medcat' - else: - continue - sys.modules[submodule_name] = SubmoduleProxy(module_name) # type: ignore - except ImportError: - print("Warning: Neither medcat nor medcat2 could be imported") # Now run the tests if __name__ == '__main__': From 8edb65cce48113867f0e1c73084ac8c10e66c205 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 13 May 2025 17:16:52 +0100 Subject: [PATCH 14/79] CU-8699049kf: Remove redundant code / imports --- tests/runner/custom_test_runner.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/runner/custom_test_runner.py b/tests/runner/custom_test_runner.py index f12424f..fc71cc9 100644 --- a/tests/runner/custom_test_runner.py +++ b/tests/runner/custom_test_runner.py @@ -10,12 +10,7 @@ import argparse # First, ensure medcat compatibility is set up -import medcat 
-print("medcat?", medcat, ":", dir(medcat)) -import medcat.cat -import medcat.vocab -import medcat.cdb -print("Loaded medcat") +import medcat # noqa # Now run the tests From 7f0380ad58d5efb4134cf62ac8aeba657546a3eb Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 11:33:43 +0100 Subject: [PATCH 15/79] CU-8699049kf: Move tests to a different folder for namespaces reasons --- .../{medcat => tmedcat}/1_create_model/__init__.py | 0 .../1_create_model/create_cdb/__init__.py | 0 .../1_create_model/create_cdb/test_create_cdb.py | 0 .../1_create_model/create_modelpack/__init__.py | 0 .../create_modelpack/test_create_modelpack.py | 0 .../1_create_model/create_vocab/__init__.py | 0 .../create_vocab/test_create_vocab.py | 0 .../1_unsupervised_training/__init__.py | 0 .../1_unsupervised_training/test_splitter.py | 0 tests/{medcat => tmedcat}/2_train_model/__init__.py | 0 tests/{medcat => tmedcat}/__init__.py | 0 .../evaluate_mct_export/__init__.py | 0 .../offline_test_mct_analysis.py | 0 .../evaluate_mct_export/test_mct_analysis.py | 0 .../resources/MCT_export_example.json | 0 tests/{medcat => tmedcat}/resources/cdb.dat | Bin .../resources/example_cdb_input_snomed.csv | 0 .../resources/example_cdb_input_umls.csv | 0 .../resources/example_file_to_split.csv | 0 tests/{medcat => tmedcat}/resources/vocab.dat | Bin 20 files changed, 0 insertions(+), 0 deletions(-) rename tests/{medcat => tmedcat}/1_create_model/__init__.py (100%) rename tests/{medcat => tmedcat}/1_create_model/create_cdb/__init__.py (100%) rename tests/{medcat => tmedcat}/1_create_model/create_cdb/test_create_cdb.py (100%) rename tests/{medcat => tmedcat}/1_create_model/create_modelpack/__init__.py (100%) rename tests/{medcat => tmedcat}/1_create_model/create_modelpack/test_create_modelpack.py (100%) rename tests/{medcat => tmedcat}/1_create_model/create_vocab/__init__.py (100%) rename tests/{medcat => tmedcat}/1_create_model/create_vocab/test_create_vocab.py (100%) rename tests/{medcat => tmedcat}/2_train_model/1_unsupervised_training/__init__.py (100%) rename tests/{medcat => tmedcat}/2_train_model/1_unsupervised_training/test_splitter.py (100%) rename tests/{medcat => tmedcat}/2_train_model/__init__.py (100%) rename tests/{medcat => tmedcat}/__init__.py (100%) rename tests/{medcat => tmedcat}/evaluate_mct_export/__init__.py (100%) rename tests/{medcat => tmedcat}/evaluate_mct_export/offline_test_mct_analysis.py (100%) rename tests/{medcat => tmedcat}/evaluate_mct_export/test_mct_analysis.py (100%) rename tests/{medcat => tmedcat}/resources/MCT_export_example.json (100%) rename tests/{medcat => tmedcat}/resources/cdb.dat (100%) rename tests/{medcat => tmedcat}/resources/example_cdb_input_snomed.csv (100%) rename tests/{medcat => tmedcat}/resources/example_cdb_input_umls.csv (100%) rename tests/{medcat => tmedcat}/resources/example_file_to_split.csv (100%) rename tests/{medcat => tmedcat}/resources/vocab.dat (100%) diff --git a/tests/medcat/1_create_model/__init__.py b/tests/tmedcat/1_create_model/__init__.py similarity index 100% rename from tests/medcat/1_create_model/__init__.py rename to tests/tmedcat/1_create_model/__init__.py diff --git a/tests/medcat/1_create_model/create_cdb/__init__.py b/tests/tmedcat/1_create_model/create_cdb/__init__.py similarity index 100% rename from tests/medcat/1_create_model/create_cdb/__init__.py rename to tests/tmedcat/1_create_model/create_cdb/__init__.py diff --git a/tests/medcat/1_create_model/create_cdb/test_create_cdb.py b/tests/tmedcat/1_create_model/create_cdb/test_create_cdb.py 
similarity index 100% rename from tests/medcat/1_create_model/create_cdb/test_create_cdb.py rename to tests/tmedcat/1_create_model/create_cdb/test_create_cdb.py diff --git a/tests/medcat/1_create_model/create_modelpack/__init__.py b/tests/tmedcat/1_create_model/create_modelpack/__init__.py similarity index 100% rename from tests/medcat/1_create_model/create_modelpack/__init__.py rename to tests/tmedcat/1_create_model/create_modelpack/__init__.py diff --git a/tests/medcat/1_create_model/create_modelpack/test_create_modelpack.py b/tests/tmedcat/1_create_model/create_modelpack/test_create_modelpack.py similarity index 100% rename from tests/medcat/1_create_model/create_modelpack/test_create_modelpack.py rename to tests/tmedcat/1_create_model/create_modelpack/test_create_modelpack.py diff --git a/tests/medcat/1_create_model/create_vocab/__init__.py b/tests/tmedcat/1_create_model/create_vocab/__init__.py similarity index 100% rename from tests/medcat/1_create_model/create_vocab/__init__.py rename to tests/tmedcat/1_create_model/create_vocab/__init__.py diff --git a/tests/medcat/1_create_model/create_vocab/test_create_vocab.py b/tests/tmedcat/1_create_model/create_vocab/test_create_vocab.py similarity index 100% rename from tests/medcat/1_create_model/create_vocab/test_create_vocab.py rename to tests/tmedcat/1_create_model/create_vocab/test_create_vocab.py diff --git a/tests/medcat/2_train_model/1_unsupervised_training/__init__.py b/tests/tmedcat/2_train_model/1_unsupervised_training/__init__.py similarity index 100% rename from tests/medcat/2_train_model/1_unsupervised_training/__init__.py rename to tests/tmedcat/2_train_model/1_unsupervised_training/__init__.py diff --git a/tests/medcat/2_train_model/1_unsupervised_training/test_splitter.py b/tests/tmedcat/2_train_model/1_unsupervised_training/test_splitter.py similarity index 100% rename from tests/medcat/2_train_model/1_unsupervised_training/test_splitter.py rename to tests/tmedcat/2_train_model/1_unsupervised_training/test_splitter.py diff --git a/tests/medcat/2_train_model/__init__.py b/tests/tmedcat/2_train_model/__init__.py similarity index 100% rename from tests/medcat/2_train_model/__init__.py rename to tests/tmedcat/2_train_model/__init__.py diff --git a/tests/medcat/__init__.py b/tests/tmedcat/__init__.py similarity index 100% rename from tests/medcat/__init__.py rename to tests/tmedcat/__init__.py diff --git a/tests/medcat/evaluate_mct_export/__init__.py b/tests/tmedcat/evaluate_mct_export/__init__.py similarity index 100% rename from tests/medcat/evaluate_mct_export/__init__.py rename to tests/tmedcat/evaluate_mct_export/__init__.py diff --git a/tests/medcat/evaluate_mct_export/offline_test_mct_analysis.py b/tests/tmedcat/evaluate_mct_export/offline_test_mct_analysis.py similarity index 100% rename from tests/medcat/evaluate_mct_export/offline_test_mct_analysis.py rename to tests/tmedcat/evaluate_mct_export/offline_test_mct_analysis.py diff --git a/tests/medcat/evaluate_mct_export/test_mct_analysis.py b/tests/tmedcat/evaluate_mct_export/test_mct_analysis.py similarity index 100% rename from tests/medcat/evaluate_mct_export/test_mct_analysis.py rename to tests/tmedcat/evaluate_mct_export/test_mct_analysis.py diff --git a/tests/medcat/resources/MCT_export_example.json b/tests/tmedcat/resources/MCT_export_example.json similarity index 100% rename from tests/medcat/resources/MCT_export_example.json rename to tests/tmedcat/resources/MCT_export_example.json diff --git a/tests/medcat/resources/cdb.dat b/tests/tmedcat/resources/cdb.dat 
similarity index 100% rename from tests/medcat/resources/cdb.dat rename to tests/tmedcat/resources/cdb.dat diff --git a/tests/medcat/resources/example_cdb_input_snomed.csv b/tests/tmedcat/resources/example_cdb_input_snomed.csv similarity index 100% rename from tests/medcat/resources/example_cdb_input_snomed.csv rename to tests/tmedcat/resources/example_cdb_input_snomed.csv diff --git a/tests/medcat/resources/example_cdb_input_umls.csv b/tests/tmedcat/resources/example_cdb_input_umls.csv similarity index 100% rename from tests/medcat/resources/example_cdb_input_umls.csv rename to tests/tmedcat/resources/example_cdb_input_umls.csv diff --git a/tests/medcat/resources/example_file_to_split.csv b/tests/tmedcat/resources/example_file_to_split.csv similarity index 100% rename from tests/medcat/resources/example_file_to_split.csv rename to tests/tmedcat/resources/example_file_to_split.csv diff --git a/tests/medcat/resources/vocab.dat b/tests/tmedcat/resources/vocab.dat similarity index 100% rename from tests/medcat/resources/vocab.dat rename to tests/tmedcat/resources/vocab.dat From 6d138e9c9fac6e63c0d9db18357cad63cc26f6fc Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 11:39:22 +0100 Subject: [PATCH 16/79] CU-8699049kf: Fix test-data location --- .../2_train_model/1_unsupervised_training/test_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tmedcat/2_train_model/1_unsupervised_training/test_splitter.py b/tests/tmedcat/2_train_model/1_unsupervised_training/test_splitter.py index f336674..73dae89 100644 --- a/tests/tmedcat/2_train_model/1_unsupervised_training/test_splitter.py +++ b/tests/tmedcat/2_train_model/1_unsupervised_training/test_splitter.py @@ -19,7 +19,7 @@ import splitter -FILE_TO_SPLIT = os.path.join(_WWC_BASE_FOLDER, "tests", "medcat", "resources", "example_file_to_split.csv") +FILE_TO_SPLIT = os.path.join(_WWC_BASE_FOLDER, "tests", "tmedcat", "resources", "example_file_to_split.csv") NR_OF_LINES_IN_FILE = 125 NR_OF_COLUMNS_IN_FILE = 20 From a02dbc0b4ab24571f98e883f5e14f4dfbb4aa121 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 11:49:04 +0100 Subject: [PATCH 17/79] CU-8699049kf: Fix test-data location (CDB creation) --- tests/tmedcat/1_create_model/create_cdb/test_create_cdb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tmedcat/1_create_model/create_cdb/test_create_cdb.py b/tests/tmedcat/1_create_model/create_cdb/test_create_cdb.py index cd734c7..911cccd 100644 --- a/tests/tmedcat/1_create_model/create_cdb/test_create_cdb.py +++ b/tests/tmedcat/1_create_model/create_cdb/test_create_cdb.py @@ -18,8 +18,8 @@ from unittest.mock import patch # SNOMED pre-cdb csv -PRE_CDB_CSV_PATH_SNOMED = os.path.join(_WWC_BASE_FOLDER, "tests", "medcat", "resources", "example_cdb_input_snomed.csv") -PRE_CDB_CSV_PATH_UMLS = os.path.join(_WWC_BASE_FOLDER, "tests", "medcat", "resources", "example_cdb_input_umls.csv") +PRE_CDB_CSV_PATH_SNOMED = os.path.join(_WWC_BASE_FOLDER, "tests", "tmedcat", "resources", "example_cdb_input_snomed.csv") +PRE_CDB_CSV_PATH_UMLS = os.path.join(_WWC_BASE_FOLDER, "tests", "tmedcat", "resources", "example_cdb_input_umls.csv") def get_mock_input(output: str): From f07ac28e07390c34045479bcf76ec94f86a54464 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 11:50:30 +0100 Subject: [PATCH 18/79] CU-8699049kf: Add manual relocation of packages --- compatibility_package/medcat.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/compatibility_package/medcat.py 
b/compatibility_package/medcat.py index 46974e9..32cb016 100644 --- a/compatibility_package/medcat.py +++ b/compatibility_package/medcat.py @@ -17,6 +17,15 @@ def __getattr__(self, name): return getattr(importlib.import_module(self.target_module_name), name) +manual_changes = { + "medcat.tokenizers.meta_cat_tokenizers": "medcat2.components.addons.meta_cat.mctokenizers.tokenizers", + "medcat.cdb_maker": "medcat2.model_creation.cdb_maker", + "medcat.utils.meta_cat": "medcat2.components.addons.meta_cat", + "medcat.meta_cat": "medcat2.components.addons.meta_cat.meta_cat", + "medcat.config_meta_cat": "medcat2.config.config_meta_cat", +} + + # For each submodule in medcat2, create a proxy in sys.modules for module_name in list(sys.modules.keys()): if (module_name.startswith('medcat2.') and @@ -27,3 +36,6 @@ def __getattr__(self, name): else: continue sys.modules[submodule_name] = SubmoduleProxy(module_name) # type: ignore + +for module_name, replacement_module_name in manual_changes.items(): + sys.modules[module_name] = SubmoduleProxy(replacement_module_name) # type: ignore From 3f9399d30cb7f3de843975f1d78353020b90b51c Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 13:21:35 +0100 Subject: [PATCH 19/79] CU-8699049kf: Fix test resources path (create modelpack) --- .../1_create_model/create_modelpack/test_create_modelpack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tmedcat/1_create_model/create_modelpack/test_create_modelpack.py b/tests/tmedcat/1_create_model/create_modelpack/test_create_modelpack.py index 6e789a1..7a25204 100644 --- a/tests/tmedcat/1_create_model/create_modelpack/test_create_modelpack.py +++ b/tests/tmedcat/1_create_model/create_modelpack/test_create_modelpack.py @@ -21,7 +21,7 @@ import create_modelpack -RESOURCES_FOLDER = os.path.join(_WWC_BASE_FOLDER, "tests", "medcat", "resources") +RESOURCES_FOLDER = os.path.join(_WWC_BASE_FOLDER, "tests", "tmedcat", "resources") DEFAULT_CDB_PATH = os.path.join(RESOURCES_FOLDER, "cdb.dat") DEFAULT_VOCAB_PATH = os.path.join(RESOURCES_FOLDER, "vocab.dat") From eadcdc968db651771215bca65fa18749bc6df461 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 13:22:05 +0100 Subject: [PATCH 20/79] CU-8699049kf: Use medcat2-based modelpack load/save code --- medcat/1_create_model/create_modelpack/create_modelpack.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/medcat/1_create_model/create_modelpack/create_modelpack.py b/medcat/1_create_model/create_modelpack/create_modelpack.py index 949e681..d13cb76 100644 --- a/medcat/1_create_model/create_modelpack/create_modelpack.py +++ b/medcat/1_create_model/create_modelpack/create_modelpack.py @@ -3,6 +3,7 @@ from medcat.vocab import Vocab from medcat.cdb import CDB from medcat.cat import CAT +from medcat.storage.serialisers import deserialise # relative to file path _FILE_DIR = os.path.dirname(__file__) @@ -39,7 +40,7 @@ def load_cdb_and_save_modelpack(cdb_path: str, str: The model pack path. 
""" # Load cdb - cdb = CDB.load(cdb_path) + cdb: CDB = deserialise(cdb_path) # Set cdb configuration # technically we already created this during the cdb creation @@ -53,7 +54,7 @@ def load_cdb_and_save_modelpack(cdb_path: str, cdb.config.general['full_unlink'] = True # Load vocab - vocab = Vocab.load(vocab_path) + vocab: Vocab = deserialise(vocab_path) # Initialise the model cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab) From 7e9a20def35fed7a227bb2bb4c240542aace30d8 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 13:23:08 +0100 Subject: [PATCH 21/79] CU-8699049kf: Use medcat2-based Vocab load/save code --- medcat/1_create_model/create_vocab/create_vocab.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/medcat/1_create_model/create_vocab/create_vocab.py b/medcat/1_create_model/create_vocab/create_vocab.py index 0d74668..75ff1d9 100644 --- a/medcat/1_create_model/create_vocab/create_vocab.py +++ b/medcat/1_create_model/create_vocab/create_vocab.py @@ -1,4 +1,5 @@ from medcat.vocab import Vocab +from medcat.storage.serialisers import serialise, AvailableSerialisers import os vocab = Vocab() @@ -17,5 +18,5 @@ # embeddings of 300 dimensions is standard vocab.add_words(os.path.join(vocab_dir, 'vocab_data.txt'), replace=True) -vocab.make_unigram_table() -vocab.save(os.path.join(vocab_dir, "vocab.dat")) +serialise(AvailableSerialisers.dill, vocab, + os.path.join(vocab_dir, "vocab.dat")) From 5aa4928f76253f2a2d1d24779969cad5525a5c3a Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 13:38:36 +0100 Subject: [PATCH 22/79] CU-8699049kf: Make sure to create directory before saving in it (Vocab) --- medcat/1_create_model/create_vocab/create_vocab.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/medcat/1_create_model/create_vocab/create_vocab.py b/medcat/1_create_model/create_vocab/create_vocab.py index 75ff1d9..5487622 100644 --- a/medcat/1_create_model/create_vocab/create_vocab.py +++ b/medcat/1_create_model/create_vocab/create_vocab.py @@ -18,5 +18,6 @@ # embeddings of 300 dimensions is standard vocab.add_words(os.path.join(vocab_dir, 'vocab_data.txt'), replace=True) -serialise(AvailableSerialisers.dill, vocab, - os.path.join(vocab_dir, "vocab.dat")) +vocab_folder = os.path.join(vocab_dir, "vocab.dat") +os.makedirs(vocab_folder, exist_ok=True) +serialise(AvailableSerialisers.dill, vocab, vocab_folder) From 5d163d6c014d1ac68d671e6a749518b367f0b5ee Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 13:39:35 +0100 Subject: [PATCH 23/79] CU-8699049kf: Add automatic legacy conversion of CDB and Vocab to compatibility layer --- compatibility_package/medcat.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/compatibility_package/medcat.py b/compatibility_package/medcat.py index 32cb016..5b1a918 100644 --- a/compatibility_package/medcat.py +++ b/compatibility_package/medcat.py @@ -1,6 +1,11 @@ import sys +import os import importlib + import medcat2 +import medcat2.storage.serialisers +import medcat2.utils.legacy.convert_cdb as convert_cdb +import medcat2.utils.legacy.convert_vocab as convert_vocab # Copy all attributes from medcat2 to this module for attr in dir(medcat2): @@ -39,3 +44,25 @@ def __getattr__(self, name): for module_name, replacement_module_name in manual_changes.items(): sys.modules[module_name] = SubmoduleProxy(replacement_module_name) # type: ignore + +# add automatic vocab / CDB conversion +_orig_deserialise = medcat2.storage.serialisers.deserialise + + +def 
deserialise_with_legacy_conversion( + folder_path: str, + ignore_folders_prefix: set[str] = set(), + ignore_folders_suffix: set[str] = set(), + **init_kwargs): + if not os.path.isdir(folder_path): + if folder_path.endswith("cdb.dat"): + print("Trying to legacy convert CDB from", folder_path) + return convert_cdb.get_cdb_from_old(folder_path) + elif folder_path.endswith("vocab.dat"): + print("Trying to legacy convert Vocab from", folder_path) + return convert_vocab.get_vocab_from_old(folder_path) + return _orig_deserialise( + folder_path, ignore_folders_prefix, ignore_folders_suffix, **init_kwargs) + + +medcat2.storage.serialisers.deserialise = deserialise_with_legacy_conversion From e9bd3fbfa8898b856d6223cba58d4f43e9930dc2 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 13:40:23 +0100 Subject: [PATCH 24/79] CU-8699049kf: Treat saved paths as folders during tests (Vocab) --- .../1_create_model/create_vocab/test_create_vocab.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/tmedcat/1_create_model/create_vocab/test_create_vocab.py b/tests/tmedcat/1_create_model/create_vocab/test_create_vocab.py index b2b358c..e7bed71 100644 --- a/tests/tmedcat/1_create_model/create_vocab/test_create_vocab.py +++ b/tests/tmedcat/1_create_model/create_vocab/test_create_vocab.py @@ -1,5 +1,6 @@ import os import sys +import shutil import medcat.vocab @@ -40,16 +41,19 @@ class CreateVocabTest(unittest.TestCase): def setUp(self) -> None: if os.path.exists(VOCAB_OUTPUT_PATH): - os.rename(VOCAB_OUTPUT_PATH, self.temp_vocab_path) + # NOTE: this is a folder in v2 + shutil.move(VOCAB_OUTPUT_PATH, self.temp_vocab_path) self.moved = True else: self.moved = False def tearDown(self) -> None: if os.path.exists(VOCAB_OUTPUT_PATH): - os.remove(VOCAB_OUTPUT_PATH) + # NOTE: this is a folder in v2 + shutil.rmtree(VOCAB_OUTPUT_PATH) if self.moved: - os.rename(self.temp_vocab_path, VOCAB_OUTPUT_PATH) + # NOTE: this is a folder in v2 + shutil.move(self.temp_vocab_path, VOCAB_OUTPUT_PATH) def test_creating_vocab(self): with patch('builtins.open', side_effect=custom_open): From a579a0b5d802c4cbe59985e64fe994b7465e1b6d Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 13:57:23 +0100 Subject: [PATCH 25/79] CU-8699049kf: Adapt CDB creation to v2 paths (config) and serialising --- medcat/1_create_model/create_cdb/create_cdb.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/medcat/1_create_model/create_cdb/create_cdb.py b/medcat/1_create_model/create_cdb/create_cdb.py index b163422..548da65 100644 --- a/medcat/1_create_model/create_cdb/create_cdb.py +++ b/medcat/1_create_model/create_cdb/create_cdb.py @@ -2,6 +2,7 @@ import pandas as pd from medcat.config import Config from medcat.cdb_maker import CDBMaker +from medcat.storage.serialisers import serialise, AvailableSerialisers pd.options.mode.chained_assignment = None # type: ignore @@ -24,6 +25,7 @@ model_dir = os.path.join(BASE_PATH, "models", "cdb") output_cdb = os.path.join(model_dir, f"{release}_SNOMED_cdb.dat") +os.mkdir(output_cdb) csv = pd.read_csv(csv_path) # Remove null values @@ -50,9 +52,9 @@ # Setup config config = Config() -config.general['spacy_model'] = 'en_core_web_md' -config.cdb_maker['remove_parenthesis'] = 1 -config.general['cdb_source_name'] = f'SNOMED_{release}' +config.general.nlp.modelname = 'en_core_web_md' +config.cdb_maker.remove_parenthesis = 1 +# config.general.cdb_source_name = f'SNOMED_{release}' maker = CDBMaker(config) @@ -64,8 +66,8 @@ # Add type_id pretty names to 
cdb cdb.addl_info['type_id2name'] = pd.Series(csv.description_type_ids.values, index=csv.type_ids.astype(str)).to_dict() -cdb.config.linking['filters']['cuis'] = set(csv['cui'].tolist()) # Add all cuis to filter out legacy terms. +cdb.config.components.linking.filters.cuis = set(csv['cui'].tolist()) # Add all cuis to filter out legacy terms. # save model -cdb.save(output_cdb) +serialise(AvailableSerialisers.dill, cdb, output_cdb) print(f"CDB Model saved successfully as: {output_cdb}") From 732163d00e4aef310005e52ad6c26b8e335b15d2 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 13:57:41 +0100 Subject: [PATCH 26/79] CU-8699049kf: Adapt UMLS CDB creation to v2 paths (config) and serialising --- medcat/1_create_model/create_cdb/create_umls_cdb.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/medcat/1_create_model/create_cdb/create_umls_cdb.py b/medcat/1_create_model/create_cdb/create_umls_cdb.py index f692024..95b1722 100644 --- a/medcat/1_create_model/create_cdb/create_umls_cdb.py +++ b/medcat/1_create_model/create_cdb/create_umls_cdb.py @@ -2,6 +2,7 @@ import pandas as pd from medcat.config import Config from medcat.cdb_maker import CDBMaker +from medcat.storage.serialisers import serialise, AvailableSerialisers pd.options.mode.chained_assignment = None # type: ignore @@ -28,6 +29,7 @@ model_dir = os.path.join(BASE_PATH, "models", "cdb") output_cdb = os.path.join(model_dir, f"{release}_UMLS_cdb.dat") +os.mkdir(output_cdb) csv = pd.read_csv(csv_path) # Remove null values @@ -39,9 +41,9 @@ # Setup config config = Config() -config.general['spacy_model'] = 'en_core_web_md' -config.cdb_maker['remove_parenthesis'] = 1 -config.general['cdb_source_name'] = f'UMLS_{release}' +config.general.nlp.modelname = 'en_core_web_md' +config.cdb_maker.remove_parenthesis = 1 +# config.general.cdb_source_name = f'UMLS_{release}' maker = CDBMaker(config) @@ -52,8 +54,8 @@ cdb = maker.prepare_csvs(csv_paths, full_build=True) # Add type_id pretty names to cdb -cdb.config.linking['filters']['cuis'] = set(csv['cui'].tolist()) # Add all cuis to filter out legacy terms. +cdb.config.components.linking.filters.cuis = set(csv['cui'].tolist()) # Add all cuis to filter out legacy terms. 
# save model -cdb.save(output_cdb) +serialise(AvailableSerialisers.dill, cdb, output_cdb) print(f"CDB Model saved successfully as: {output_cdb}") From 4a6f50995ad516594be4fad426223c36da8f8e5d Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 13:58:12 +0100 Subject: [PATCH 27/79] CU-8699049kf: Adapt model pack creation to v2 paths (config) and serialising --- .../create_modelpack/create_modelpack.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/medcat/1_create_model/create_modelpack/create_modelpack.py b/medcat/1_create_model/create_modelpack/create_modelpack.py index d13cb76..da9b5eb 100644 --- a/medcat/1_create_model/create_modelpack/create_modelpack.py +++ b/medcat/1_create_model/create_modelpack/create_modelpack.py @@ -44,14 +44,14 @@ def load_cdb_and_save_modelpack(cdb_path: str, # Set cdb configuration # technically we already created this during the cdb creation - cdb.config.ner['min_name_len'] = 2 - cdb.config.ner['upper_case_limit_len'] = 3 - cdb.config.general['spell_check'] = True - cdb.config.linking['train_count_threshold'] = 10 - cdb.config.linking['similarity_threshold'] = 0.3 - cdb.config.linking['train'] = True - cdb.config.linking['disamb_length_limit'] = 4 - cdb.config.general['full_unlink'] = True + cdb.config.components.ner.min_name_len = 2 + cdb.config.components.ner.upper_case_limit_len = 3 + cdb.config.general.spell_check = True + cdb.config.components.linking.train_count_threshold = 10 + cdb.config.components.linking.similarity_threshold = 0.3 + cdb.config.components.linking.train = True + cdb.config.components.linking.disamb_length_limit = 4 + cdb.config.general.full_unlink = True # Load vocab vocab: Vocab = deserialise(vocab_path) @@ -60,7 +60,7 @@ def load_cdb_and_save_modelpack(cdb_path: str, cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab) # Create and save model pack - return cat.create_model_pack(save_dir_path=modelpack_path, model_pack_name=modelpack_name) + return cat.save_model_pack(modelpack_path, pack_name=modelpack_name) def load_cdb_and_save_modelpack_in_def_location(cdb_name: str, From 2ade2fdf2e4d4ea8acccb29a1d4f44681d700624 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 13:58:46 +0100 Subject: [PATCH 28/79] CU-8699049kf: Adapt Vocab test-time serialising to v2 methods --- tests/tmedcat/1_create_model/create_vocab/test_create_vocab.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/tmedcat/1_create_model/create_vocab/test_create_vocab.py b/tests/tmedcat/1_create_model/create_vocab/test_create_vocab.py index e7bed71..7fea449 100644 --- a/tests/tmedcat/1_create_model/create_vocab/test_create_vocab.py +++ b/tests/tmedcat/1_create_model/create_vocab/test_create_vocab.py @@ -3,6 +3,7 @@ import shutil import medcat.vocab +from medcat.storage.serialisers import deserialise _FILE_DIR = os.path.dirname(__file__) @@ -61,5 +62,5 @@ def test_creating_vocab(self): vocab_path = os.path.join(create_vocab.vocab_dir, "vocab.dat") self.assertEqual(os.path.abspath(vocab_path), VOCAB_OUTPUT_PATH) self.assertTrue(os.path.exists(vocab_path)) - vocab = medcat.vocab.Vocab.load(vocab_path) + vocab: medcat.vocab.Vocab = deserialise(vocab_path) self.assertIsInstance(vocab, medcat.vocab.Vocab) From fb0b5ac8f2bb52b690e8b59086c8ff18c080a110 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 13:59:16 +0100 Subject: [PATCH 29/79] CU-8699049kf: Adap model pack creation tests to v2 standards (basename vs full path) --- .../1_create_model/create_modelpack/test_create_modelpack.py | 3 
++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/tmedcat/1_create_model/create_modelpack/test_create_modelpack.py b/tests/tmedcat/1_create_model/create_modelpack/test_create_modelpack.py
index 7a25204..89dae1a 100644
--- a/tests/tmedcat/1_create_model/create_modelpack/test_create_modelpack.py
+++ b/tests/tmedcat/1_create_model/create_modelpack/test_create_modelpack.py
@@ -39,9 +39,10 @@ def tearDownClass(cls):
         cls.tempfolder.cleanup()

     def test_a(self):
-        model_pack_name = create_modelpack.load_cdb_and_save_modelpack(
+        model_pack_name_full = create_modelpack.load_cdb_and_save_modelpack(
             DEFAULT_CDB_PATH, self.model_pack_name,
             self.tempfolder.name, DEFAULT_VOCAB_PATH)
+        model_pack_name = os.path.basename(model_pack_name_full)
         self.assertTrue(model_pack_name.startswith(self.model_pack_name))
         model_pack_path = os.path.join(self.tempfolder.name, model_pack_name)
         self.assertTrue(os.path.exists(model_pack_path))

From 23a19e4f76a38f7e41b5a024da10b0731288f0ba Mon Sep 17 00:00:00 2001
From: mart-r
Date: Wed, 14 May 2025 14:02:47 +0100
Subject: [PATCH 30/79] CU-8699049kf: Allow overwriting existing models for test-time purposes

---
 medcat/1_create_model/create_cdb/create_cdb.py      | 7 +++++--
 medcat/1_create_model/create_cdb/create_umls_cdb.py | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/medcat/1_create_model/create_cdb/create_cdb.py b/medcat/1_create_model/create_cdb/create_cdb.py
index 548da65..e761e19 100644
--- a/medcat/1_create_model/create_cdb/create_cdb.py
+++ b/medcat/1_create_model/create_cdb/create_cdb.py
@@ -25,7 +25,10 @@
 model_dir = os.path.join(BASE_PATH, "models", "cdb")
 output_cdb = os.path.join(model_dir, f"{release}_SNOMED_cdb.dat")
-os.mkdir(output_cdb)
+os.makedirs(output_cdb, exist_ok=True)
+# NOTE: by default, new models created at the same location will not be saved,
+# so here we allow overwriting
+allow_overwrite = True
 csv = pd.read_csv(csv_path)
 # Remove null values
@@ -69,5 +72,5 @@
 cdb.config.components.linking.filters.cuis = set(csv['cui'].tolist())  # Add all cuis to filter out legacy terms.
 # save model
-serialise(AvailableSerialisers.dill, cdb, output_cdb)
+serialise(AvailableSerialisers.dill, cdb, output_cdb, overwrite=allow_overwrite)
 print(f"CDB Model saved successfully as: {output_cdb}")
diff --git a/medcat/1_create_model/create_cdb/create_umls_cdb.py b/medcat/1_create_model/create_cdb/create_umls_cdb.py
index 95b1722..939a1c4 100644
--- a/medcat/1_create_model/create_cdb/create_umls_cdb.py
+++ b/medcat/1_create_model/create_cdb/create_umls_cdb.py
@@ -29,7 +29,10 @@
 model_dir = os.path.join(BASE_PATH, "models", "cdb")
 output_cdb = os.path.join(model_dir, f"{release}_UMLS_cdb.dat")
-os.mkdir(output_cdb)
+os.makedirs(output_cdb, exist_ok=True)
+# NOTE: by default, new models created at the same location will not be saved,
+# so here we allow overwriting
+allow_overwrite = True
 csv = pd.read_csv(csv_path)
 # Remove null values
@@ -57,5 +60,5 @@
 cdb.config.components.linking.filters.cuis = set(csv['cui'].tolist())  # Add all cuis to filter out legacy terms.
# save model -serialise(AvailableSerialisers.dill, cdb, output_cdb) +serialise(AvailableSerialisers.dill, cdb, output_cdb, overwrite=allow_overwrite) print(f"CDB Model saved successfully as: {output_cdb}") From ef7c6f77551f6b7b68a21e910945285bee72310a Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 14:05:05 +0100 Subject: [PATCH 31/79] CU-8699049kf: Move to v2-type (de)serialising in CDB creation tests (along with cleanup) --- tests/tmedcat/1_create_model/create_cdb/test_create_cdb.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/tmedcat/1_create_model/create_cdb/test_create_cdb.py b/tests/tmedcat/1_create_model/create_cdb/test_create_cdb.py index 911cccd..b115f52 100644 --- a/tests/tmedcat/1_create_model/create_cdb/test_create_cdb.py +++ b/tests/tmedcat/1_create_model/create_cdb/test_create_cdb.py @@ -1,6 +1,9 @@ import os import sys +import shutil + import medcat.cdb +from medcat.storage.serialisers import deserialise _FILE_DIR = os.path.dirname(__file__) @@ -35,12 +38,12 @@ def setUp(self) -> None: def tearDown(self) -> None: if self.output_cdb is not None and os.path.exists(self.output_cdb): - os.remove(self.output_cdb) + shutil.rmtree(self.output_cdb) def assertHasCDB(self, path: str): self.assertTrue(os.path.exists(path)) self.assertTrue(path.endswith(".dat")) - cdb = medcat.cdb.CDB.load(path) + cdb: CDB = deserialise(path) self.assertIsInstance(cdb, medcat.cdb.CDB) def test_snomed_cdb_creation(self): From a333084c0c648e527badf08e08c07326b656893b Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 14:16:45 +0100 Subject: [PATCH 32/79] CU-8699049kf: Fix test runner path in workflow --- .github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4b6c311..374fc13 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -34,8 +34,8 @@ jobs: python -m mypy `git ls-tree --full-tree --name-only -r HEAD | grep ".py$" | grep -v "tests/"` --explicit-package-bases --follow-imports=normal - name: Test run: | - python tests/custom_test_runner.py - python tests/custom_test_runner.py -s medcat/compare_models + python tests/runner/custom_test_runner.py + python tests/runner/custom_test_runner.py -s medcat/compare_models # TODO - in the future, we might want to add automated tests for notebooks as well # though it's not really possible right now since the notebooks are designed # in a way that assumes interaction (i.e specifying model pack names) From f1249fc8c4f62d224f819e7f75956f387352cf4b Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 14:28:39 +0100 Subject: [PATCH 33/79] CU-8699049kf: Move model compare closer to v2 format --- medcat/compare_models/compare.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/medcat/compare_models/compare.py b/medcat/compare_models/compare.py index 5bdce6e..06f3a73 100644 --- a/medcat/compare_models/compare.py +++ b/medcat/compare_models/compare.py @@ -1,6 +1,7 @@ -from typing import List, Tuple, Dict, Set, Optional, Union, Iterator +from typing import Tuple, Dict, Set, Optional, Union, Iterator from functools import partial import glob +import json from medcat.cat import CAT @@ -34,11 +35,9 @@ def do_counting(cat1: CAT, cat2: CAT, ann_diffs: PerAnnotationDifferences, doc_limit: int = -1) -> ResultsTally: def cui2name(cat, cui): - if cui in cat.cdb.cui2preferred_name: - return cat.cdb.cui2preferred_name[cui] - all_names = cat.cdb.cui2names[cui] 
- # longest anme - return sorted(all_names, key=lambda name: len(name), reverse=True)[0] + ci = cat.cdb.cui2info[cui] + # longest name + return ci['preferred_name'] or sorted(ci['names'], key=lambda name: len(name), reverse=True)[0] res1 = ResultsTally(pt2ch=_get_pt2ch(cat1), cat_data=cat1.cdb.make_stats(), cui2name=partial(cui2name, cat1)) res2 = ResultsTally(pt2ch=_get_pt2ch(cat2), cat_data=cat2.cdb.make_stats(), @@ -99,10 +98,14 @@ def load_and_train(model_pack_path: str, mct_export_path: str) -> CAT: # NOTE: Allowing mct_export_path to contain wildcat ("*"). # And in such a case, iterating over all matching files if "*" not in mct_export_path: - cat.train_supervised_from_json(mct_export_path) + with open(mct_export_path) as f: + mct_export = json.load(f) + cat.trainer.train_supervised_raw(mct_export) else: for file in glob.glob(mct_export_path): - cat.train_supervised_from_json(file) + with open(file) as f: + mct_export = json.load(f) + cat.trainer.train_supervised_raw(mct_export) return cat From 5677fb25ead63cd4af9386e3bd748d6632686a0b Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 14:34:22 +0100 Subject: [PATCH 34/79] CU-8699049kf: Fix typing for ResultsTally model --- medcat/compare_models/compare_annotations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat/compare_models/compare_annotations.py b/medcat/compare_models/compare_annotations.py index 768bbb6..af7e252 100644 --- a/medcat/compare_models/compare_annotations.py +++ b/medcat/compare_models/compare_annotations.py @@ -14,7 +14,7 @@ class ResultsTally(BaseModel): pt2ch: Optional[Dict[str, Set[str]]] cat_data: dict cui2name: Callable[[str], str] - total_count = 0 + total_count: int = 0 per_cui_count: Dict[str, int] = {} per_cui_acc: Dict[str, float] = {} per_cui_forms: Dict[str, Set[str]] = {} From 0e6cef959b1cdb4625f0707e8ae96350c8835b2f Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 14:35:01 +0100 Subject: [PATCH 35/79] CU-8699049kf: Fix test-time mocked method for training --- medcat/compare_models/tests/test_compare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat/compare_models/tests/test_compare.py b/medcat/compare_models/tests/test_compare.py index 79f53e9..b359fb2 100644 --- a/medcat/compare_models/tests/test_compare.py +++ b/medcat/compare_models/tests/test_compare.py @@ -77,7 +77,7 @@ class TrainAndCompareTests(unittest.TestCase): # this tests that the training is called @classmethod - @unittest.mock.patch("medcat.cat.CAT.train_supervised_from_json") + @unittest.mock.patch("medcat.trainer.Trainer.train_supervised_raw") def _get_diffs(cls, mct_export_path: str, method): diffs = get_diffs_for(cls.cat_path, mct_export_path, cls.docs_file, supervised_train_comparison_model=True) From 53ed1b4b1d8891fd37d1d8b27d9d7f950fc24a7f Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 14:35:33 +0100 Subject: [PATCH 36/79] CU-8699049kf: Fix missing keyword argument when initialising results during tests --- medcat/compare_models/tests/test_compare_annotations.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/medcat/compare_models/tests/test_compare_annotations.py b/medcat/compare_models/tests/test_compare_annotations.py index 55f19e1..5fdf747 100644 --- a/medcat/compare_models/tests/test_compare_annotations.py +++ b/medcat/compare_models/tests/test_compare_annotations.py @@ -38,8 +38,9 @@ def _cui2name(self, cui: str) -> str: return self.cui2name[cui] def setUp(self) -> None: - self.res = 
compare_annotations.ResultsTally(cat_data={"stats": "don't matter"}, - cui2name=self._cui2name) + self.res = compare_annotations.ResultsTally( + pt2ch=None, cat_data={"stats": "don't matter"}, + cui2name=self._cui2name) for entities in self.entities: self.res.count(entities['entities']) From b26fbef1f0b2a006e6ce1d385d8931c56677e92f Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 15:10:51 +0100 Subject: [PATCH 37/79] CU-8699049kf: Fix v2-specific cui set in comparison --- medcat/compare_models/compare.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat/compare_models/compare.py b/medcat/compare_models/compare.py index 06f3a73..4ce7728 100644 --- a/medcat/compare_models/compare.py +++ b/medcat/compare_models/compare.py @@ -64,8 +64,8 @@ def get_per_annotation_diffs(cat1: CAT, cat2: CAT, documents: Iterator[Tuple[str save_opts = SaveOptions(use_db=True, db_file_name=temp_file.name, clean_callback=temp_file.close) pad = PerAnnotationDifferences(pt2ch1=pt2ch1, pt2ch2=pt2ch2, - model1_cuis=set(cat1.cdb.cui2names), - model2_cuis=set(cat2.cdb.cui2names), + model1_cuis=set(cat1.cdb.cui2info), + model2_cuis=set(cat2.cdb.cui2info), keep_raw=keep_raw, save_options=save_opts) total = doc_limit if doc_limit != -1 else None From 439d58d6d9bd231567eafaeb763eee278d45dce4 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 15:13:18 +0100 Subject: [PATCH 38/79] CU-8699049kf: Fix v2-specific CDB stats/info in comparison --- medcat/compare_models/compare.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat/compare_models/compare.py b/medcat/compare_models/compare.py index 4ce7728..e6ad450 100644 --- a/medcat/compare_models/compare.py +++ b/medcat/compare_models/compare.py @@ -38,9 +38,9 @@ def cui2name(cat, cui): ci = cat.cdb.cui2info[cui] # longest name return ci['preferred_name'] or sorted(ci['names'], key=lambda name: len(name), reverse=True)[0] - res1 = ResultsTally(pt2ch=_get_pt2ch(cat1), cat_data=cat1.cdb.make_stats(), + res1 = ResultsTally(pt2ch=_get_pt2ch(cat1), cat_data=cat1.cdb.get_basic_info(), cui2name=partial(cui2name, cat1)) - res2 = ResultsTally(pt2ch=_get_pt2ch(cat2), cat_data=cat2.cdb.make_stats(), + res2 = ResultsTally(pt2ch=_get_pt2ch(cat2), cat_data=cat2.cdb.get_basic_info(), cui2name=partial(cui2name, cat2)) total = doc_limit if doc_limit != -1 else None for per_doc in tqdm.tqdm(ann_diffs.per_doc_results.values(), total=total): From da791507225cbc445302b9c7695cf0bcc9f970eb Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 15:17:27 +0100 Subject: [PATCH 39/79] CU-8699049kf: Use v2-specific version fo CDB comparison --- medcat/compare_models/compare_cdb.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/medcat/compare_models/compare_cdb.py b/medcat/compare_models/compare_cdb.py index 5f99574..bdaf890 100644 --- a/medcat/compare_models/compare_cdb.py +++ b/medcat/compare_models/compare_cdb.py @@ -1,6 +1,7 @@ from typing import Dict, Set, Tuple from medcat.cdb import CDB +from medcat.cdb.concepts import CUIInfo import tqdm from itertools import chain @@ -96,7 +97,7 @@ class DictComparisonResults(BaseModel): values: DictCompareValues @classmethod - def get(cls, d1: dict, d2: dict, progress: bool = True) -> "DictComparisonResults": + def get(cls, d1: dict[str, CUIInfo], d2: dict[str, CUIInfo], progress: bool = True) -> "DictComparisonResults": return cls(keys=DictCompareKeys.get(d1, d2), values=DictCompareValues.get(d1, d2, progress=progress)) @@ -119,6 +120,6 @@ def compare(cdb1: 
CDB, Returns: CDBCompareResults: _description_ """ - reg = DictComparisonResults.get(cdb1.cui2names, cdb2.cui2names, progress=show_progress) - snames = DictComparisonResults.get(cdb1.cui2snames, cdb2.cui2snames, progress=show_progress) + reg = DictComparisonResults.get(cdb1.cui2info, cdb2.cui2info, progress=show_progress) + snames = DictComparisonResults.get(cdb1.cui2info, cdb2.cui2info, progress=show_progress) return CDBCompareResults(names=reg, snames=snames) From 54fa6c70615aaed8c5061ea61f8522fade8c795f Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 15:42:33 +0100 Subject: [PATCH 40/79] CU-8699049kf: Update dependency to v0.3.4 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index eb25b48..920a672 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ spacy>=3.8.0,<4.0 -medcat2[meta-cat,spacy,deid] @ git+https://github.com/CogStack/MedCAT2@v0.3.3 +medcat2[meta-cat,spacy,deid] @ git+https://github.com/CogStack/MedCAT2@v0.3.4 plotly~=5.19.0 # eland~=8.18.1 # NOTE: there is no numpy2-compatible eland release as of 2025-05-13 en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl From 01c8e1af8c17a13690720a91122c837aa214fd25 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 15:50:18 +0100 Subject: [PATCH 41/79] CU-8699049kf: Add message to problematic assert call in tests --- medcat/compare_models/tests/test_compare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat/compare_models/tests/test_compare.py b/medcat/compare_models/tests/test_compare.py index b359fb2..fc72080 100644 --- a/medcat/compare_models/tests/test_compare.py +++ b/medcat/compare_models/tests/test_compare.py @@ -81,7 +81,7 @@ class TrainAndCompareTests(unittest.TestCase): def _get_diffs(cls, mct_export_path: str, method): diffs = get_diffs_for(cls.cat_path, mct_export_path, cls.docs_file, supervised_train_comparison_model=True) - cls.assertTrue(cls, method.called) + cls.assertTrue(cls, method.called, "Expected method to be called") return diffs From 9adf59d5820a85a947282c21d416473b1889e021 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 15:57:38 +0100 Subject: [PATCH 42/79] CU-8699049kf: Change way of asserting mock method call --- medcat/compare_models/tests/test_compare.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat/compare_models/tests/test_compare.py b/medcat/compare_models/tests/test_compare.py index fc72080..30c051b 100644 --- a/medcat/compare_models/tests/test_compare.py +++ b/medcat/compare_models/tests/test_compare.py @@ -78,10 +78,10 @@ class TrainAndCompareTests(unittest.TestCase): # this tests that the training is called @classmethod @unittest.mock.patch("medcat.trainer.Trainer.train_supervised_raw") - def _get_diffs(cls, mct_export_path: str, method): + def _get_diffs(cls, mct_export_path: str, method: unittest.mock.MagicMock): diffs = get_diffs_for(cls.cat_path, mct_export_path, cls.docs_file, supervised_train_comparison_model=True) - cls.assertTrue(cls, method.called, "Expected method to be called") + method.assert_called() return diffs From 38b61f2df855f4f7f52946181c61f965c04ff8ff Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 14 May 2025 17:08:49 +0100 Subject: [PATCH 43/79] CU-8699049kf: Patch the train method fo instance as well --- medcat/compare_models/tests/test_compare.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git 
a/medcat/compare_models/tests/test_compare.py b/medcat/compare_models/tests/test_compare.py index 30c051b..4a9f975 100644 --- a/medcat/compare_models/tests/test_compare.py +++ b/medcat/compare_models/tests/test_compare.py @@ -79,6 +79,12 @@ class TrainAndCompareTests(unittest.TestCase): @classmethod @unittest.mock.patch("medcat.trainer.Trainer.train_supervised_raw") def _get_diffs(cls, mct_export_path: str, method: unittest.mock.MagicMock): + orig_load_method = CAT.load_model_pack + def _wrapped_load_method(*args, **kwargs): + cat = orig_load_method(*args, **kwargs) + cat.trainer.train_supervised_raw = method + return cat + CAT.load_model_pack = _wrapped_load_method diffs = get_diffs_for(cls.cat_path, mct_export_path, cls.docs_file, supervised_train_comparison_model=True) method.assert_called() From 5fd419fc3ee7529eb514f7c6405b67cf9aad5257 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 15 May 2025 11:21:06 +0100 Subject: [PATCH 44/79] CU-8699049kf: Update unsupervised training script to v2 --- .../unsupervised training.ipynb | 29 +++++-------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/medcat/2_train_model/1_unsupervised_training/unsupervised training.ipynb b/medcat/2_train_model/1_unsupervised_training/unsupervised training.ipynb index 77cd181..44f9771 100644 --- a/medcat/2_train_model/1_unsupervised_training/unsupervised training.ipynb +++ b/medcat/2_train_model/1_unsupervised_training/unsupervised training.ipynb @@ -55,7 +55,7 @@ "metadata": {}, "outputs": [], "source": [ - "cat.cdb.print_stats()" + "cat.cdb.get_basic_info()" ] }, { @@ -88,21 +88,12 @@ "outputs": [], "source": [ "# Print statistics on the CDB before training\n", - "cat.cdb.print_stats()\n", + "cat.cdb.get_basic_info()\n", "\n", "# Run the annotation procedure over all the documents we have,\n", "# given that we have a large number of documents this can take quite some time.\n", "\n", - "for i, text in enumerate(data['text'].values):\n", - " # This will now run the training in the background \n", - " try:\n", - " _ = cat(text, do_train=True)\n", - " except TypeError:\n", - " pass\n", - " \n", - " # So we know how things are moving\n", - " if i % 10000 == 0:\n", - " print(\"Finished {} - text blocks\".format(i))\n" + "cat.trainer.train_unsupervised(data.text)\n" ] }, { @@ -112,7 +103,7 @@ "outputs": [], "source": [ "# Print statistics on the CDB after training\n", - "cat.cdb.print_stats()" + "cat.cdb.get_basic_info()" ] }, { @@ -122,7 +113,8 @@ "outputs": [], "source": [ "# save modelpack\n", - "cat.create_model_pack(save_dir_path=model_dir, model_pack_name=output_modelpack)\n" + "\n", + "cat.save_model_pack(model_dir, pack_name=output_modelpack)\n" ] }, { @@ -135,7 +127,7 @@ ], "metadata": { "kernelspec": { - "display_name": "medcat", + "display_name": "venv_v2", "language": "python", "name": "python3" }, @@ -149,12 +141,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8 (main, Nov 24 2022, 08:08:27) [Clang 14.0.6 ]" - }, - "vscode": { - "interpreter": { - "hash": "4e4ccc64ca47f932c34194843713e175cf3a19af3798844e4190152d16ba61ca" - } + "version": "3.10.13" } }, "nbformat": 4, From c4fe9b8b3db345c04c3c578676e36280e0cdd033 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 15 May 2025 11:40:47 +0100 Subject: [PATCH 45/79] CU-8699049kf: Update supervised training script to v2 --- .../supervised training.ipynb | 47 ++++++++----------- 1 file changed, 19 insertions(+), 28 deletions(-) diff --git 
a/medcat/2_train_model/2_supervised_training/supervised training.ipynb b/medcat/2_train_model/2_supervised_training/supervised training.ipynb index 3d634ca..2a1ac10 100644 --- a/medcat/2_train_model/2_supervised_training/supervised training.ipynb +++ b/medcat/2_train_model/2_supervised_training/supervised training.ipynb @@ -10,7 +10,8 @@ "import json\n", "import pandas as pd\n", "from datetime import date\n", - "from medcat.cat import CAT" + "from medcat.cat import CAT\n", + "from medcat.stats.stats import get_stats" ] }, { @@ -52,7 +53,7 @@ "source": [ "# Create CAT - the main class from medcat used for concept annotation\n", "cat = CAT.load_model_pack(model_pack_path)\n", - "cat.config.linking['filters'] = {'cuis':set()} # To remove exisitng filters" + "cat.config.components.linking.filters.cuis = set() # To remove exisitng filters" ] }, { @@ -74,7 +75,7 @@ "if snomed_filter_path:\n", " snomed_filter = set(json.load(open(snomed_filter_path)))\n", "else:\n", - " snomed_filter = set(cat.cdb.cui2preferred_name.keys())\n" + " snomed_filter = set(cat.cdb.cui2info.keys())\n" ] }, { @@ -90,13 +91,17 @@ "metadata": {}, "outputs": [], "source": [ - "cat.train_supervised_from_json(data_path=mctrainer_export_path, \n", - " nepochs=3,\n", - " reset_cui_count=False,\n", - " print_stats=True,\n", - " use_filters=True,\n", - " extra_cui_filter=snomed_filter, # If not filter is set remove this line\n", - " )\n" + "import json\n", + "with open(mctrainer_export_path) as f:\n", + " data = json.load(f)\n", + "cat.trainer.train_supervised_raw(\n", + " data=data, \n", + " nepochs=3,\n", + " reset_cui_count=False,\n", + " print_stats=True,\n", + " use_filters=True,\n", + " extra_cui_filter=snomed_filter, # If not filter is set remove this line\n", + ")\n" ] }, { @@ -106,15 +111,6 @@ "# Stats" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = json.load(open(mctrainer_export_path))" - ] - }, { "cell_type": "code", "execution_count": null, @@ -123,7 +119,7 @@ }, "outputs": [], "source": [ - "fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples = cat._print_stats(data, use_filters=True)" + "fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples = get_stats(cat, data, use_project_filters=True)" ] }, { @@ -159,7 +155,7 @@ "outputs": [], "source": [ "# save modelpack\n", - "cat.create_model_pack(os.path.join(model_dir, output_modelpack))" + "cat.save_model_pack(os.path.join(model_dir, output_modelpack))" ] }, { @@ -198,7 +194,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "venv_v2", "language": "python", "name": "python3" }, @@ -212,12 +208,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6 (default, Sep 26 2022, 11:37:49) \n[Clang 14.0.0 (clang-1400.0.29.202)]" - }, - "vscode": { - "interpreter": { - "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" - } + "version": "3.10.13" } }, "nbformat": 4, From b4434a4a7c9df79180968e971e4742503130403f Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 15 May 2025 14:54:53 +0100 Subject: [PATCH 46/79] CU-8699049kf: Update MetaCAT notebook to v2 --- .../meta_annotation_training.ipynb | 93 ++++++++++++++----- 1 file changed, 70 insertions(+), 23 deletions(-) diff --git a/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb b/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb index 46bbca8..e54254d 100644 --- 
a/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb +++ b/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "d58c720d", "metadata": {}, "outputs": [], @@ -11,15 +11,19 @@ "import os\n", "from datetime import date\n", "from medcat.cat import CAT\n", - "from medcat.meta_cat import MetaCAT\n", - "from medcat.config_meta_cat import ConfigMetaCAT\n", - "from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT\n", + "from medcat.components.addons.meta_cat.meta_cat import MetaCATAddon\n", + "from medcat.config.config_meta_cat import ConfigMetaCAT\n", + "from medcat.components.addons.meta_cat.mctokenizers.bpe_tokenizer import TokenizerWrapperBPE\n", + "from medcat.components.addons.meta_cat.mctokenizers.bert_tokenizer import TokenizerWrapperBERT\n", + "from medcat.utils.legacy.identifier import is_legacy_model_pack\n", + "from medcat.storage.serialisers import deserialise\n", + "from medcat.tokenizing.tokenizers import create_tokenizer\n", "from tokenizers import ByteLevelBPETokenizer" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "ca80af0e", "metadata": {}, "outputs": [], @@ -39,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "dd7a2e97", "metadata": {}, "outputs": [], @@ -71,10 +75,19 @@ "\n", "# Iterate through the meta_models contained in the model\n", "meta_model_names = [] # These Meta_annotation tasks should correspond to the ones labelled in the mcttrainer export\n", + "model_is_legacy = is_legacy_model_pack(base_dir_meta_models)\n", + "if model_is_legacy:\n", + " # NOTE: when loaded, will be auto-converted\n", + " exp_start = \"meta_\"\n", + " config_path = [\"config.json\"]\n", + "else:\n", + " exp_start = \"addon_meta_cat\"\n", + " base_dir_meta_models = os.path.join(base_dir_meta_models, \"saved_components\")\n", + " config_path = [\"meta_cat\", \"config\"]\n", "for dirpath, dirnames, filenames in os.walk(base_dir_meta_models):\n", " for dirname in dirnames:\n", - " if dirname.startswith('meta_'):\n", - " meta_model_names.append(dirname[5:])" + " if dirname.startswith(exp_start):\n", + " meta_model_names.append(dirname[len(exp_start):])" ] }, { @@ -103,10 +116,14 @@ "outputs": [], "source": [ "for meta_model in meta_model_names:\n", - " config_file = os.path.join(base_dir_meta_models,\"meta_\"+meta_model,\"config.json\")\n", - " with open(config_file, 'r') as jfile:\n", - " config_dict = json.load(jfile)\n", - " print(f\"Model used for meta_{meta_model}:\",config_dict['model']['model_name'])" + " config_path = os.path.join(base_dir_meta_models, exp_start + meta_model, *config_path)\n", + " if model_is_legacy:\n", + " with open(config_path, 'r') as jfile:\n", + " config_dict = json.load(jfile)\n", + " print(f\"Model used for meta_{meta_model}:\", config_dict['model']['model_name'])\n", + " else:\n", + " cnf: ConfigMetaCAT = deserialise(config_path)\n", + " print(f\"Model used for meta_{meta_model}:\", config_dict.model.model_name)" ] }, { @@ -124,13 +141,34 @@ "metadata": {}, "outputs": [], "source": [ + "# NOTE: we need to provide a BaseTokenizer to add the relevant additional data paths\n", + "# to the relevant Entity/Span and Document implementation\n", + "# we'll use the regex tokenizer here for example since it's easier to initialise\n", + "# but you can use a spacy-based one, you just need to also pass:\n", + "# 
- the model name (e.g 'en_core_web_md')\n", + "# - the names of the disabled components (e.g ['ner', 'parser', 'vectors', 'textcat',\n", + "# 'entity_linker', 'sentencizer', 'entity_ruler', 'merge_noun_chunks', 'merge_entities', 'merge_subtokens'])\n", + "# - whether diacritics should be used\n", + "# - max document length (e.g 1_000_000)\n", + "base_tokenizer = create_tokenizer(\"regex\")\n", "for meta_model in meta_model_names:\n", - " \n", - " # load the meta_model\n", - " mc = MetaCAT.load(save_dir_path=os.path.join(base_dir_meta_models,\"meta_\"+meta_model))\n", + " meta_cat_path = os.path.join(base_dir_meta_models, exp_start + meta_model)\n", + " if model_is_legacy:\n", + " from medcat.utils.legacy.convert_meta_cat import get_meta_cat_from_old\n", + " meta_cat: MetaCATAddon = get_meta_cat_from_old(meta_cat_path, base_tokenizer)\n", + " else:\n", + " # NOTE: the expected workflow when loading the model\n", + " # is one where the config is stored as part of the overall config\n", + " # and thus using it for loading is trivial\n", + " # but here we need to manually load the config from disk\n", + " cnf_path = os.path.join(meta_cat_path, \"config\")\n", + " cnf: ConfigMetaCAT = deserialise(cnf_path)\n", + " # load the meta_model\n", + " meta_cat = MetaCATAddon.load_existing(cnf, base_tokenizer, os.path.join(base_dir_meta_models, exp_start + meta_model))\n", + " mc = meta_cat.mc\n", "\n", " # changing parameters\n", - " mc.config.train['nepochs'] = 15\n", + " mc.config.train.nepochs = 15\n", "\n", " save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n", " #Ideally this should replace the meta_models inside the modelpack\n", @@ -163,16 +201,25 @@ "# config.general['category_name']\n", "\n", "# change model name if training BERT for the first time\n", - "config.model['model_name'] = 'bert'\n", - "\n", - "tokenizer = TokenizerWrapperBERT.load(\"\", config.model['model_variant'])\n", + "config.model.model_name = 'bert'\n", "\n", "save_dir_path= \"test_meta\" # Where to save the meta_model and results. 
\n", "#Ideally this should replace the meta_models inside the modelpack\n", "\n", + "# NOTE: we need to provide a BaseTokenizer to add the relevant additional data paths\n", + "# to the relevant Entity/Span and Document implementation\n", + "# we'll use the regex tokenizer here for example since it's easier to initialise\n", + "# but you can use a spacy-based one, you just need to also pass:\n", + "# - the model name (e.g 'en_core_web_md')\n", + "# - the names of the disabled components (e.g ['ner', 'parser', 'vectors', 'textcat',\n", + "# 'entity_linker', 'sentencizer', 'entity_ruler', 'merge_noun_chunks', 'merge_entities', 'merge_subtokens'])\n", + "# - whether diacritics should be used\n", + "# - max document length (e.g 1_000_000)\n", + "base_tokenizer = create_tokenizer(\"regex\")\n", + "\n", "# Initialise and train meta_model\n", - "mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n", - "results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n", + "mc = MetaCATAddon.create_new(config, base_tokenizer)\n", + "results = mc.mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n", "\n", "# Save results\n", "json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results.json'), 'w'))" @@ -181,7 +228,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "venv_v2", "language": "python", "name": "python3" }, @@ -195,7 +242,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.10.13" } }, "nbformat": 4, From 40fc8387b5a238033c648ac854136f63999d175d Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 15 May 2025 15:13:13 +0100 Subject: [PATCH 47/79] CU-8699049kf: Update 2-phase learning MetaCAT notebook to v2 to the best of my ability --- .../meta_annotation_training_advanced.ipynb | 136 +++++++++++++----- 1 file changed, 101 insertions(+), 35 deletions(-) diff --git a/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb b/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb index 7266056..c9abe82 100644 --- a/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb +++ b/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "d58c720d", "metadata": {}, "outputs": [], @@ -19,9 +19,13 @@ "import os\n", "from datetime import date\n", "from medcat.cat import CAT\n", - "from medcat.meta_cat import MetaCAT\n", - "from medcat.config_meta_cat import ConfigMetaCAT\n", - "from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT\n", + "from medcat.components.addons.meta_cat.meta_cat import MetaCATAddon, MetaCAT\n", + "from medcat.config.config_meta_cat import ConfigMetaCAT\n", + "from medcat.components.addons.meta_cat.mctokenizers.bpe_tokenizer import TokenizerWrapperBPE\n", + "from medcat.components.addons.meta_cat.mctokenizers.bert_tokenizer import TokenizerWrapperBERT\n", + "from medcat.utils.legacy.identifier import is_legacy_model_pack\n", + "from medcat.storage.serialisers import deserialise\n", + "from medcat.tokenizing.tokenizers import create_tokenizer\n", "from tokenizers import ByteLevelBPETokenizer" ] }, @@ -48,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "dd7a2e97", "metadata": {}, "outputs": [], @@ -80,10 +84,46 @@ 
"\n", "# Iterate through the meta_models contained in the model\n", "meta_model_names = [] # These Meta_annotation tasks should correspond to the ones labelled in the mcttrainer export\n", + "model_is_legacy = is_legacy_model_pack(base_dir_meta_models)\n", + "if model_is_legacy:\n", + " # NOTE: when loaded, will be auto-converted\n", + " exp_start = \"meta_\"\n", + " config_path = [\"config.json\"]\n", + "else:\n", + " exp_start = \"addon_meta_cat\"\n", + " base_dir_meta_models = os.path.join(base_dir_meta_models, \"saved_components\")\n", + " config_path = [\"meta_cat\", \"config\"]\n", "for dirpath, dirnames, filenames in os.walk(base_dir_meta_models):\n", " for dirname in dirnames:\n", - " if dirname.startswith('meta_'):\n", - " meta_model_names.append(dirname[5:])" + " if dirname.startswith(exp_start):\n", + " meta_model_names.append(dirname[len(exp_start):])" + ] + }, + { + "cell_type": "markdown", + "id": "0b763d35", + "metadata": {}, + "source": [ + "Run this before continuing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08f8d879", + "metadata": {}, + "outputs": [], + "source": [ + "# NOTE: we need to provide a BaseTokenizer to add the relevant additional data paths\n", + "# to the relevant Entity/Span and Document implementation\n", + "# we'll use the regex tokenizer here for example since it's easier to initialise\n", + "# but you can use a spacy-based one, you just need to also pass:\n", + "# - the model name (e.g 'en_core_web_md')\n", + "# - the names of the disabled components (e.g ['ner', 'parser', 'vectors', 'textcat',\n", + "# 'entity_linker', 'sentencizer', 'entity_ruler', 'merge_noun_chunks', 'merge_entities', 'merge_subtokens'])\n", + "# - whether diacritics should be used\n", + "# - max document length (e.g 1_000_000)\n", + "base_tokenizer = create_tokenizer(\"regex\")" ] }, { @@ -115,19 +155,20 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "id": "dc91f7d6", "metadata": {}, + "outputs": [], "source": [ - "\n", - "#option 1
\n", - "mc.config.train['class_weights'] = []
\n", - "mc.config.train['compute_class_weights'] = True
\n", - "#NOTE: this will only be applicable if mc.config.train.class_weights is empty
\n", - "
\n", - "#2nd option
\n", - "#using specified class weights
\n", - "mc.config.train['class_weights'] = [0.4,0.3,0.1]
" + "mc: MetaCAT\n", + "#option 1\n", + "mc.config.train['class_weights'] = []\n", + "mc.config.train['compute_class_weights'] = True\n", + "#NOTE: this will only be applicable if mc.config.train.class_weights is empty\n", + "#2nd option\n", + "#using specified class weights\n", + "mc.config.train['class_weights'] = [0.4,0.3,0.1]" ] }, { @@ -135,7 +176,7 @@ "id": "c217762f", "metadata": {}, "source": [ - "NOTE: Make sure to correctly map the class weights to their corresponding class index (ID).
To check the index assigned to the classes, use:
`print(mc.config.general['category_value2id'])`\n", + "NOTE: Make sure to correctly map the class weights to their corresponding class index (ID).
To check the index assigned to the classes, use:
`print(mc.config.general.category_value2id)`\n", "
This will print a dictionary where the class names and their corresponding IDs (indices) are displayed.
\n", "The first position in the class weight list corresponds to the class with ID 0 in the dictionary, and so on." ] @@ -171,19 +212,40 @@ "metadata": {}, "outputs": [], "source": [ + "def load_meta_cat_from_file(meta_cat_path: str, config_name: str = 'config.json') -> MetaCATAddon:\n", + " config_path = os.path.join(meta_cat_path, config_name)\n", + " with open(config_path) as f:\n", + " config_dict = json.load(f)\n", + "\n", + " meta_cat_path = os.path.join(base_dir_meta_models, exp_start + meta_model)\n", + " if model_is_legacy:\n", + " from medcat.utils.legacy.convert_meta_cat import get_meta_cat_from_old\n", + " meta_cat: MetaCATAddon = get_meta_cat_from_old(meta_cat_path, base_tokenizer)\n", + " else:\n", + " # NOTE: the expected workflow when loading the model\n", + " # is one where the config is stored as part of the overall config\n", + " # and thus using it for loading is trivial\n", + " # but here we need to manually load the config from disk\n", + " cnf_path = os.path.join(meta_cat_path, \"config\")\n", + " cnf: ConfigMetaCAT = deserialise(cnf_path)\n", + " # load the meta_model\n", + " meta_cat = MetaCATAddon.load_existing(cnf, base_tokenizer, os.path.join(base_dir_meta_models, exp_start + meta_model))\n", + " return meta_cat\n", "#--------------------------------Phase 1--------------------------------\n", - "def run_phase_1(meta_model,class_wt_phase1 = None):\n", + "def run_phase_1(meta_model, class_wt_phase1 = None):\n", " #Loading the pre-defined config for phase 1\n", - " config_ph_1_path = os.path.join(base_dir_meta_models,\"meta_\"+meta_model,\"config_ph1.json\")\n", - " with open(config_ph_1_path) as f:\n", - " config_ph1 = json.load(f)\n", - "\n", - " mc = MetaCAT.load(save_dir_path=os.path.join(base_dir_meta_models,\"meta_\"+meta_model),config_dict = config_ph1)\n", + " # NOTE: the original (v1) version contained loading a different config\n", + " # for each phase, but I do not know how these files would have been saved there\n", + " # and thus don't know what the indent was\n", + " # config_ph_1_path = os.path.join(base_dir_meta_models,\"meta_\"+meta_model,\"config_ph1.json\")\n", + " meta_cat_path = os.path.join(base_dir_meta_models, exp_start + meta_model)\n", + " meta_cat = load_meta_cat_from_file(meta_cat_path)\n", + " mc = meta_cat.mc\n", "\n", " if class_wt_phase1:\n", - " mc.config.train['class_weights'] = class_wt_phase1\n", + " mc.config.train.class_weights = class_wt_phase1\n", "\n", - " mc.config.train['nepochs'] = 30 #You can change the number of epochs, remember to keep them higher for phase 1\n", + " mc.config.train.nepochs = 30 #You can change the number of epochs, remember to keep them higher for phase 1\n", "\n", " save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. 
\n", " results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n", @@ -191,18 +253,20 @@ " json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results_phase1.json'), 'w'))\n", "\n", "#--------------------------------Phase 2--------------------------------\n", - "def run_phase_2(meta_model,class_wt_phase2 = None): \n", + "def run_phase_2(meta_model, class_wt_phase2 = None): \n", " #Loading the pre-defined config for phase 2\n", - " config_ph_2_path = os.path.join(base_dir_meta_models,\"meta_\"+meta_model,\"config_ph2.json\")\n", - " with open(config_ph_2_path) as f:\n", - " config_ph2 = json.load(f)\n", - "\n", - " mc = MetaCAT.load(save_dir_path=os.path.join(base_dir_meta_models,\"meta_\"+meta_model),config_dict = config_ph2)\n", + " # NOTE: the original (v1) version contained loading a different config\n", + " # for each phase, but I do not know how these files would have been saved there\n", + " # and thus don't know what the indent was\n", + " # config_ph_2_path = os.path.join(base_dir_meta_models,\"meta_\"+meta_model,\"config_ph2.json\")\n", + " meta_cat_path = os.path.join(base_dir_meta_models, exp_start + meta_model)\n", + " meta_cat = load_meta_cat_from_file(meta_cat_path)\n", + " mc = meta_cat.mc\n", "\n", " if class_wt_phase2:\n", - " mc.config.train['class_weights'] = class_wt_phase2\n", + " mc.config.train.class_weights = class_wt_phase2\n", "\n", - " mc.config.train['nepochs'] = 15\n", + " mc.config.train.nepochs = 15\n", "\n", " save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. Ensure to keep this same as Phase 1\n", " results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n", @@ -256,8 +320,10 @@ "# To run the training with original + synthetic data\n", "# Follow all the same steps till initializing the metacat model\n", "\n", + "config = ConfigMetaCAT()\n", + "\n", "# Initialise and train meta_model\n", - "mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n", + "mc = MetaCATAddon.create_new(config, base_tokenizer)\n", "\n", "# the format expected is [[['text','of','the','document'], [index of medical entity], \"label\" ],\n", "# ['text','of','the','document'], [index of medical entity], \"label\" ]]\n", From ab45464d6793dcb438d9687605c7ff4297593d3f Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 15 May 2025 15:15:47 +0100 Subject: [PATCH 48/79] CU-8699049kf: Update run_model script for v2 --- medcat/3_run_model/run_model.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/medcat/3_run_model/run_model.py b/medcat/3_run_model/run_model.py index 2a840ab..44e802c 100644 --- a/medcat/3_run_model/run_model.py +++ b/medcat/3_run_model/run_model.py @@ -57,9 +57,9 @@ if snomed_filter_path: snomed_filter = set(json.load(open(snomed_filter_path))) else: - snomed_filter = set(cat.cdb.cui2preferred_name.keys()) + snomed_filter = set(cat.cdb.cui2info.keys()) -cat.config.linking['filters']['cuis'] = snomed_filter +cat.config.linking.filters.cuis = snomed_filter del snomed_filter # build query, change as appropriate @@ -82,14 +82,16 @@ def relevant_text_gen(generator, doc_id = '_id', text_col='body_analysed'): batch_char_size = 500000 # Batch size (BS) in number of characters -cat.multiprocessing_batch_char_size(relevant_text_gen(search_gen), - batch_size_chars=batch_char_size, - only_cui=False, - nproc=8, # Number of processors - out_split_size_chars=20*batch_char_size, - save_dir_path=ann_folder_path, - 
min_free_memory=0.1, - ) +# NOTE: no multiprocessing in v2 right now +for text in relevant_text_gen(search_gen): + cat.get_entities(text, + batch_size_chars=batch_char_size, + only_cui=False, + nproc=8, # Number of processors + out_split_size_chars=20*batch_char_size, + save_dir_path=ann_folder_path, + min_free_memory=0.1, + ) medcat_logger.warning(f'Annotation process complete!') From 1755848d798fe8907b19c698595f945427cf9fe2 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 15 May 2025 15:19:51 +0100 Subject: [PATCH 49/79] CU-8699049kf: Update run_model script for v2 (remove unavailable keyword arguments) --- medcat/3_run_model/run_model.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/medcat/3_run_model/run_model.py b/medcat/3_run_model/run_model.py index 44e802c..0ee31e2 100644 --- a/medcat/3_run_model/run_model.py +++ b/medcat/3_run_model/run_model.py @@ -85,12 +85,12 @@ def relevant_text_gen(generator, doc_id = '_id', text_col='body_analysed'): # NOTE: no multiprocessing in v2 right now for text in relevant_text_gen(search_gen): cat.get_entities(text, - batch_size_chars=batch_char_size, - only_cui=False, - nproc=8, # Number of processors - out_split_size_chars=20*batch_char_size, - save_dir_path=ann_folder_path, - min_free_memory=0.1, + # batch_size_chars=batch_char_size, + # only_cui=False, + # nproc=8, # Number of processors + # out_split_size_chars=20*batch_char_size, + # save_dir_path=ann_folder_path, + # min_free_memory=0.1, ) medcat_logger.warning(f'Annotation process complete!') From 3cb57cddd7956c9208cd3f6584cc6f6f11c4dfa8 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 15 May 2025 15:28:11 +0100 Subject: [PATCH 50/79] CU-8699049kf: Update run_model notebook for v2 as best as possible --- medcat/3_run_model/run_model.ipynb | 34 +++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/medcat/3_run_model/run_model.ipynb b/medcat/3_run_model/run_model.ipynb index 23964ed..54daa10 100755 --- a/medcat/3_run_model/run_model.ipynb +++ b/medcat/3_run_model/run_model.ipynb @@ -120,9 +120,9 @@ " snomed_filter = set(json.load(open(snomed_filter_path)))\n", "else:\n", " print('There is no concept filter set')\n", - " snomed_filter = set(cat.cdb.cui2preferred_name.keys())\n", + " snomed_filter = set(cat.cdb.cui2info.keys())\n", "\n", - "cat.config.linking['filters']['cuis'] = snomed_filter \n" + "cat.config.linking.filters.cuis = snomed_filter \n" ] }, { @@ -155,14 +155,14 @@ "outputs": [], "source": [ "batch_char_size = 50000 # Batch size (BS) in number of characters\n", - "cat.multiprocessing_batch_char_size(data_iterator(df, doc_id_column, doc_text_column),\n", - " batch_size_chars=batch_char_size,\n", - " only_cui=False,\n", - " nproc=8, # Number of processors\n", - " out_split_size_chars=20*batch_char_size,\n", - " save_dir_path=ann_folder_path,\n", - " min_free_memory=0.1,\n", - " )\n", + "for text_id, text in data_iterator(df, doc_id_column, doc_text_column):\n", + " cat.get_entities(text,\n", + " only_cui=False,\n", + " # nproc=8, # Number of processors\n", + " # out_split_size_chars=20*batch_char_size,\n", + " # save_dir_path=ann_folder_path,\n", + " # min_free_memory=0.1,\n", + " )\n", "\n", "medcat_logger.warning(f'Annotation process complete!')\n" ] @@ -225,7 +225,7 @@ "source": [ "text = \"He was diagnosed with heart failure\"\n", "doc = cat(text)\n", - "print(doc.ents)" + "print(doc.final_ents)" ] }, { @@ -235,8 +235,8 @@ "outputs": [], "source": [ "# Display Snomed codes\n", - "for ent in doc.ents:\n", - " 
print(ent, \" - \", ent._.cui, \" - \", cat.cdb.cui2preferred_name[ent._.cui])" + "for ent in doc.final_ents:\n", + " print(ent, \" - \", ent.cui, \" - \", cat.cdb.cui2info[ent.cui]['preferred_name'])" ] }, { @@ -246,8 +246,8 @@ "outputs": [], "source": [ "# To show semantic types for each entity\n", - "for ent in doc.ents:\n", - " print(ent, \" - \", cat.cdb.cui2type_ids.get(ent._.cui))" + "for ent in doc.final_ents:\n", + " print(ent, \" - \", cat.cdb.cui2info[ent.cui]['type_ids'])" ] }, { @@ -258,7 +258,7 @@ "source": [ "# Display\n", "from spacy import displacy\n", - "displacy.render(doc, style='ent', jupyter=True)" + "displacy.render(doc._delegate, style='ent', jupyter=True)" ] }, { @@ -298,7 +298,7 @@ "metadata": {}, "outputs": [], "source": [ - "cat.cdb.print_stats()" + "cat.cdb.get_basic_info()" ] }, { From 149f5b1dd67f7456b8dc23e6a5824510d394f004 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 15 May 2025 15:43:18 +0100 Subject: [PATCH 51/79] CU-8699049kf: Update (most of) mct_analysis to v2 --- medcat/evaluate_mct_export/mct_analysis.py | 41 +++++++++++++++++----- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/medcat/evaluate_mct_export/mct_analysis.py b/medcat/evaluate_mct_export/mct_analysis.py index cc35707..83d2ef2 100644 --- a/medcat/evaluate_mct_export/mct_analysis.py +++ b/medcat/evaluate_mct_export/mct_analysis.py @@ -3,6 +3,7 @@ from medcat.cat import CAT from datetime import date +import os import json import torch import math @@ -15,9 +16,13 @@ from medcat.utils.meta_cat.ml_utils import create_batch_piped_data -from medcat.meta_cat import MetaCAT -from medcat.config_meta_cat import ConfigMetaCAT -from medcat.utils.meta_cat.data_utils import prepare_from_json, encode_category_values +from medcat.components.addons.meta_cat.meta_cat import MetaCATAddon, MetaCAT +from medcat.stats.stats import get_stats +from medcat.utils.legacy.identifier import is_legacy_model_pack +from medcat.utils.legacy.convert_meta_cat import get_meta_cat_from_old +from medcat.config.config_meta_cat import ConfigMetaCAT +from medcat.components.addons.meta_cat.data_utils import prepare_from_json, encode_category_values +from medcat.storage.serialisers import deserialise import warnings @@ -35,8 +40,11 @@ def __init__(self, mct_export_paths: List[str], model_pack_path: Optional[str] = :param model_pack_path: Path to medcat modelpack """ self.cat: Optional[CAT] = None + self.is_legacy_model_pack = False if model_pack_path: self.cat = CAT.load_model_pack(model_pack_path) + mpp = model_pack_path.removesuffix(".zip") + self.is_legacy_model_pack = is_legacy_model_pack(mpp) self.mct_export_paths = mct_export_paths self.mct_export = self._load_mct_exports(self.mct_export_paths) self.project_names: List[str] = [] @@ -103,7 +111,7 @@ def annotation_df(self) -> pd.DataFrame: """ annotation_df = pd.DataFrame(self.annotations) if self.cat: - annotation_df.insert(5, 'concept_name', annotation_df['cui'].map(self.cat.cdb.cui2preferred_name)) + annotation_df.insert(5, 'concept_name', annotation_df['cui'].map(lambda cui: self.cat.cdb.get_name(cui))) exceptions: List[ValueError] = [] # try the default format as well as the format specified above for format in [None, DATETIME_FORMAT]: @@ -137,9 +145,10 @@ def concept_summary(self, extra_cui_filter: Optional[str] = None) -> pd.DataFram concept_count_df['count_variations_ratio'] = round(concept_count_df['concept_count'] / concept_count_df['variations'], 3) if self.cat: - fps,fns,tps,cui_prec,cui_rec,cui_f1,cui_counts,examples = 
self.cat._print_stats(data=self.mct_export, - use_project_filters=True, - extra_cui_filter=extra_cui_filter) + fps,fns,tps,cui_prec,cui_rec,cui_f1,cui_counts,examples = get_stats(self.cat, + data=self.mct_export, + use_project_filters=True, + extra_cui_filter=extra_cui_filter) concept_count_df['fps'] = concept_count_df['cui'].map(fps) concept_count_df['fns'] = concept_count_df['cui'].map(fns) concept_count_df['tps'] = concept_count_df['cui'].map(tps) @@ -339,7 +348,21 @@ def full_annotation_df(self) -> pd.DataFrame: for meta_model_card in self.cat.get_model_card(as_dict=True)['MetaCAT models']: meta_model = meta_model_card['Category Name'] print(f'Checking metacat model: {meta_model}') - _meta_model = MetaCAT.load(self.model_pack_path + '/meta_' + meta_model) + if self.is_legacy_model_pack: + meta_cat = get_meta_cat_from_old( + self.model_pack_path + '/meta_' + meta_model, self.cat._pipeline._tokenizer) + else: + meta_model_path = os.path.join( + self.model_pack_path, "saved_components", f"addon_meta_cat.{meta_model}") + # NOTE: the expected workflow when loading the model + # is one where the config is stored as part of the overall config + # and thus using it for loading is trivial + # but here we need to manually load the config from disk + config_path = os.path.join(meta_model_path, "meta_cat", "config") + cnf: ConfigMetaCAT = deserialise(config_path) + _meta_model = MetaCATAddon.load_existing( + cnf, self.cat._pipeline._tokenizer, meta_model_path) + _meta_model = meta_cat.mc meta_results = self._eval(_meta_model, self.mct_export) _meta_values = {v: k for k, v in meta_results['meta_values'].items()} pred_meta_values = [] @@ -409,7 +432,7 @@ def meta_anns_concept_summary(self) -> pd.DataFrame: meta_anns_df['total_anns'] = meta_anns_df[col_lst].sum(axis=1) meta_anns_df = meta_anns_df.sort_values(by='total_anns', ascending=False) meta_anns_df = meta_anns_df.rename_axis('cui').reset_index(drop=False) - meta_anns_df.insert(1, 'concept_name', meta_anns_df['cui'].map(self.cat.cdb.cui2preferred_name)) + meta_anns_df.insert(1, 'concept_name', meta_anns_df['cui'].map(lambda cui: self.cat.cdb.get_name(cui))) return meta_anns_df def generate_report(self, path: str = 'mct_report.xlsx', meta_ann=False, concept_filter: Optional[List] = None): From 58aaebc220b927b69c8d98fa0c393472053d4da8 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 15 May 2025 15:51:44 +0100 Subject: [PATCH 52/79] CU-8699049kf: Fix a few minor typing issues --- medcat/evaluate_mct_export/mct_analysis.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/medcat/evaluate_mct_export/mct_analysis.py b/medcat/evaluate_mct_export/mct_analysis.py index 83d2ef2..7e974ab 100644 --- a/medcat/evaluate_mct_export/mct_analysis.py +++ b/medcat/evaluate_mct_export/mct_analysis.py @@ -2,6 +2,7 @@ import plotly.graph_objects as go from medcat.cat import CAT from datetime import date +from typing import cast import os import json @@ -111,7 +112,7 @@ def annotation_df(self) -> pd.DataFrame: """ annotation_df = pd.DataFrame(self.annotations) if self.cat: - annotation_df.insert(5, 'concept_name', annotation_df['cui'].map(lambda cui: self.cat.cdb.get_name(cui))) + annotation_df.insert(5, 'concept_name', annotation_df['cui'].map(lambda cui: cast(CAT, self.cat).cdb.get_name(cui))) exceptions: List[ValueError] = [] # try the default format as well as the format specified above for format in [None, DATETIME_FORMAT]: @@ -432,7 +433,7 @@ def meta_anns_concept_summary(self) -> pd.DataFrame: meta_anns_df['total_anns'] = 
meta_anns_df[col_lst].sum(axis=1) meta_anns_df = meta_anns_df.sort_values(by='total_anns', ascending=False) meta_anns_df = meta_anns_df.rename_axis('cui').reset_index(drop=False) - meta_anns_df.insert(1, 'concept_name', meta_anns_df['cui'].map(lambda cui: self.cat.cdb.get_name(cui))) + meta_anns_df.insert(1, 'concept_name', meta_anns_df['cui'].map(lambda cui: cast(CAT, self.cat).cdb.get_name(cui))) return meta_anns_df def generate_report(self, path: str = 'mct_report.xlsx', meta_ann=False, concept_filter: Optional[List] = None): From 09ca315adeafbc19f117ff313bbfc5f2602c89f9 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 9 Jun 2025 10:41:23 +0100 Subject: [PATCH 53/79] CU-8699049kf: Update to latest v2 release (0.5.0) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 920a672..ba54974 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ spacy>=3.8.0,<4.0 -medcat2[meta-cat,spacy,deid] @ git+https://github.com/CogStack/MedCAT2@v0.3.4 +medcat[meta-cat,spacy,deid,rel-cat] @ git+https://github.com/CogStack/MedCAT2@v0.5.0 plotly~=5.19.0 # eland~=8.18.1 # NOTE: there is no numpy2-compatible eland release as of 2025-05-13 en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl From 05c8e644c9cf10bd9c53e3d6e3cc3d5147614bb7 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 9 Jun 2025 10:43:03 +0100 Subject: [PATCH 54/79] CU-8699049kf: Remove compatibility layer (no longer needed) --- compatibility_package/medcat.py | 68 --------------------------------- compatibility_package/setup.py | 9 ----- requirements.txt | 1 - 3 files changed, 78 deletions(-) delete mode 100644 compatibility_package/medcat.py delete mode 100644 compatibility_package/setup.py diff --git a/compatibility_package/medcat.py b/compatibility_package/medcat.py deleted file mode 100644 index 5b1a918..0000000 --- a/compatibility_package/medcat.py +++ /dev/null @@ -1,68 +0,0 @@ -import sys -import os -import importlib - -import medcat2 -import medcat2.storage.serialisers -import medcat2.utils.legacy.convert_cdb as convert_cdb -import medcat2.utils.legacy.convert_vocab as convert_vocab - -# Copy all attributes from medcat2 to this module -for attr in dir(medcat2): - if not attr.startswith('__'): - globals()[attr] = getattr(medcat2, attr) - - -# Set up submodule redirections -class SubmoduleProxy: - def __init__(self, target_module_name): - self.target_module_name = target_module_name - - def __getattr__(self, name): - return getattr(importlib.import_module(self.target_module_name), name) - - -manual_changes = { - "medcat.tokenizers.meta_cat_tokenizers": "medcat2.components.addons.meta_cat.mctokenizers.tokenizers", - "medcat.cdb_maker": "medcat2.model_creation.cdb_maker", - "medcat.utils.meta_cat": "medcat2.components.addons.meta_cat", - "medcat.meta_cat": "medcat2.components.addons.meta_cat.meta_cat", - "medcat.config_meta_cat": "medcat2.config.config_meta_cat", -} - - -# For each submodule in medcat2, create a proxy in sys.modules -for module_name in list(sys.modules.keys()): - if (module_name.startswith('medcat2.') and - not module_name.startswith('medcat.')): - submodule_name = module_name.replace('medcat2.', 'medcat.', 1) - elif module_name == 'medcat2': - submodule_name = 'medcat' - else: - continue - sys.modules[submodule_name] = SubmoduleProxy(module_name) # type: ignore - -for module_name, replacement_module_name in manual_changes.items(): - 
sys.modules[module_name] = SubmoduleProxy(replacement_module_name) # type: ignore - -# add automatic vocab / CDB conversion -_orig_deserialise = medcat2.storage.serialisers.deserialise - - -def deserialise_with_legacy_conversion( - folder_path: str, - ignore_folders_prefix: set[str] = set(), - ignore_folders_suffix: set[str] = set(), - **init_kwargs): - if not os.path.isdir(folder_path): - if folder_path.endswith("cdb.dat"): - print("Trying to legacy convert CDB from", folder_path) - return convert_cdb.get_cdb_from_old(folder_path) - elif folder_path.endswith("vocab.dat"): - print("Trying to legacy convert Vocab from", folder_path) - return convert_vocab.get_vocab_from_old(folder_path) - return _orig_deserialise( - folder_path, ignore_folders_prefix, ignore_folders_suffix, **init_kwargs) - - -medcat2.storage.serialisers.deserialise = deserialise_with_legacy_conversion diff --git a/compatibility_package/setup.py b/compatibility_package/setup.py deleted file mode 100644 index 12f2205..0000000 --- a/compatibility_package/setup.py +++ /dev/null @@ -1,9 +0,0 @@ -from setuptools import setup - -setup( - name="medcat", - version="2.0.0-beta", - description="Compatibility layer for medcat2", - py_modules=["medcat"], - install_requires=["medcat2"], -) diff --git a/requirements.txt b/requirements.txt index ba54974..d4e9cae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,3 @@ plotly~=5.19.0 en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl ipyfilechooser jupyter_contrib_nbextensions --e ./compatibility_package From 2ec14aeba701559b2a7fc13ca74326d2b6163a60 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 9 Jun 2025 11:39:01 +0100 Subject: [PATCH 55/79] CU-8699049kf: Update for legacy CDB/Vocab load --- .../create_modelpack/create_modelpack.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/medcat/1_create_model/create_modelpack/create_modelpack.py b/medcat/1_create_model/create_modelpack/create_modelpack.py index da9b5eb..c7193e6 100644 --- a/medcat/1_create_model/create_modelpack/create_modelpack.py +++ b/medcat/1_create_model/create_modelpack/create_modelpack.py @@ -40,7 +40,12 @@ def load_cdb_and_save_modelpack(cdb_path: str, str: The model pack path. 
""" # Load cdb - cdb: CDB = deserialise(cdb_path) + cdb: CDB + try: + cdb = deserialise(cdb_path) + except NotADirectoryError: + from medcat.utils.legacy.convert_cdb import get_cdb_from_old + cdb = get_cdb_from_old(cdb_path) # Set cdb configuration # technically we already created this during the cdb creation @@ -55,6 +60,11 @@ def load_cdb_and_save_modelpack(cdb_path: str, # Load vocab vocab: Vocab = deserialise(vocab_path) + try: + vocab = deserialise(vocab_path) + except NotADirectoryError: + from medcat.utils.legacy.convert_vocab import get_vocab_from_old + vocab = get_vocab_from_old(cdb_path) # Initialise the model cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab) From 2249cd205e6e18cfe6ff296c3a8421c5624b42f3 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 9 Jun 2025 11:40:38 +0100 Subject: [PATCH 56/79] CU-8699049kf: Fix imports when creating CDBs --- medcat/1_create_model/create_cdb/create_cdb.py | 2 +- medcat/1_create_model/create_cdb/create_umls_cdb.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat/1_create_model/create_cdb/create_cdb.py b/medcat/1_create_model/create_cdb/create_cdb.py index e761e19..11b895b 100644 --- a/medcat/1_create_model/create_cdb/create_cdb.py +++ b/medcat/1_create_model/create_cdb/create_cdb.py @@ -1,7 +1,7 @@ import os import pandas as pd from medcat.config import Config -from medcat.cdb_maker import CDBMaker +from medcat.model_creation.cdb_maker import CDBMaker from medcat.storage.serialisers import serialise, AvailableSerialisers pd.options.mode.chained_assignment = None # type: ignore diff --git a/medcat/1_create_model/create_cdb/create_umls_cdb.py b/medcat/1_create_model/create_cdb/create_umls_cdb.py index 939a1c4..73314d6 100644 --- a/medcat/1_create_model/create_cdb/create_umls_cdb.py +++ b/medcat/1_create_model/create_cdb/create_umls_cdb.py @@ -1,7 +1,7 @@ import os import pandas as pd from medcat.config import Config -from medcat.cdb_maker import CDBMaker +from medcat.model_creation.cdb_maker import CDBMaker from medcat.storage.serialisers import serialise, AvailableSerialisers pd.options.mode.chained_assignment = None # type: ignore From 29e3c9d059c77ad389540fb1c3a0c5ff12a247ab Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 9 Jun 2025 11:43:40 +0100 Subject: [PATCH 57/79] CU-8699049kf: Fix imports in MCT analysis --- medcat/evaluate_mct_export/mct_analysis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat/evaluate_mct_export/mct_analysis.py b/medcat/evaluate_mct_export/mct_analysis.py index 7e974ab..008474e 100644 --- a/medcat/evaluate_mct_export/mct_analysis.py +++ b/medcat/evaluate_mct_export/mct_analysis.py @@ -13,9 +13,9 @@ import pandas as pd from collections import Counter from typing import List, Dict, Iterator, Tuple, Optional, Union -from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBase +from medcat.components.addons.meta_cat.mctokenizers.tokenizers import TokenizerWrapperBase -from medcat.utils.meta_cat.ml_utils import create_batch_piped_data +from medcat.components.addons.meta_cat.ml_utils import create_batch_piped_data from medcat.components.addons.meta_cat.meta_cat import MetaCATAddon, MetaCAT from medcat.stats.stats import get_stats From 595f30df4a594d4aca40e4a72d068e030f4bb2b2 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 9 Jun 2025 11:52:03 +0100 Subject: [PATCH 58/79] CU-8699049kf: Remove accidental force-deserialisation --- medcat/1_create_model/create_modelpack/create_modelpack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/medcat/1_create_model/create_modelpack/create_modelpack.py b/medcat/1_create_model/create_modelpack/create_modelpack.py index c7193e6..8cc2382 100644 --- a/medcat/1_create_model/create_modelpack/create_modelpack.py +++ b/medcat/1_create_model/create_modelpack/create_modelpack.py @@ -59,7 +59,7 @@ def load_cdb_and_save_modelpack(cdb_path: str, cdb.config.general.full_unlink = True # Load vocab - vocab: Vocab = deserialise(vocab_path) + vocab: Vocab try: vocab = deserialise(vocab_path) except NotADirectoryError: From 3e0b7eccd38c52044863ddc8797d0532f698b2e7 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 9 Jun 2025 12:44:24 +0100 Subject: [PATCH 59/79] CU-8699049kf: Fix typo in Vocab load path --- medcat/1_create_model/create_modelpack/create_modelpack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat/1_create_model/create_modelpack/create_modelpack.py b/medcat/1_create_model/create_modelpack/create_modelpack.py index 8cc2382..ec2f603 100644 --- a/medcat/1_create_model/create_modelpack/create_modelpack.py +++ b/medcat/1_create_model/create_modelpack/create_modelpack.py @@ -64,7 +64,7 @@ def load_cdb_and_save_modelpack(cdb_path: str, vocab = deserialise(vocab_path) except NotADirectoryError: from medcat.utils.legacy.convert_vocab import get_vocab_from_old - vocab = get_vocab_from_old(cdb_path) + vocab = get_vocab_from_old(vocab_path) # Initialise the model cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab) From 3d2269dd6d78bde1fab7e5ae233ebaac03a6f1d4 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 9 Jun 2025 12:53:40 +0100 Subject: [PATCH 60/79] CU-8699049kf: Bump to latest v2 release (v0.5.1) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d4e9cae..ea00bce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ spacy>=3.8.0,<4.0 -medcat[meta-cat,spacy,deid,rel-cat] @ git+https://github.com/CogStack/MedCAT2@v0.5.0 +medcat[meta-cat,spacy,deid,rel-cat] @ git+https://github.com/CogStack/MedCAT2@v0.5.1 plotly~=5.19.0 # eland~=8.18.1 # NOTE: there is no numpy2-compatible eland release as of 2025-05-13 en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl From d0813e5227f96f3abd24f167641d4b83edda164f Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 10 Jun 2025 16:45:17 +0100 Subject: [PATCH 61/79] CU-8699049kf: Update requirements to v0.6.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ea00bce..73d5da1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ spacy>=3.8.0,<4.0 -medcat[meta-cat,spacy,deid,rel-cat] @ git+https://github.com/CogStack/MedCAT2@v0.5.1 +medcat[meta-cat,spacy,deid,rel-cat] @ git+https://github.com/CogStack/MedCAT2@v0.6.0 plotly~=5.19.0 # eland~=8.18.1 # NOTE: there is no numpy2-compatible eland release as of 2025-05-13 en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl From 322de496c6ea68c15cf2cfa73e93386b2ae82502 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 11 Jun 2025 15:37:58 +0100 Subject: [PATCH 62/79] CU-8699049kf: Update requirements to v0.6.1 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 73d5da1..d884c29 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ spacy>=3.8.0,<4.0 
-medcat[meta-cat,spacy,deid,rel-cat] @ git+https://github.com/CogStack/MedCAT2@v0.6.0 +medcat[meta-cat,spacy,deid,rel-cat] @ git+https://github.com/CogStack/MedCAT2@v0.6.1 plotly~=5.19.0 # eland~=8.18.1 # NOTE: there is no numpy2-compatible eland release as of 2025-05-13 en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl From 3b84efa903a45eb54d9466e3a601c339ad2cb3f8 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 12 Jun 2025 20:47:38 +0100 Subject: [PATCH 63/79] CU-8699049kf: Update dependency to 0.7.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d884c29..b673ae9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ spacy>=3.8.0,<4.0 -medcat[meta-cat,spacy,deid,rel-cat] @ git+https://github.com/CogStack/MedCAT2@v0.6.1 +medcat[meta-cat,spacy,deid,rel-cat] @ git+https://github.com/CogStack/MedCAT2@v0.7.0 plotly~=5.19.0 # eland~=8.18.1 # NOTE: there is no numpy2-compatible eland release as of 2025-05-13 en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl From c88dbe50e4fb9ca278efe4e455037584b8540036 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 12 Jun 2025 20:49:08 +0100 Subject: [PATCH 64/79] CU-8699049kf: Update ents property after name change --- medcat/3_run_model/run_model.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/medcat/3_run_model/run_model.ipynb b/medcat/3_run_model/run_model.ipynb index 54daa10..18352b0 100755 --- a/medcat/3_run_model/run_model.ipynb +++ b/medcat/3_run_model/run_model.ipynb @@ -225,7 +225,7 @@ "source": [ "text = \"He was diagnosed with heart failure\"\n", "doc = cat(text)\n", - "print(doc.final_ents)" + "print(doc.linked_ents)" ] }, { @@ -235,7 +235,7 @@ "outputs": [], "source": [ "# Display Snomed codes\n", - "for ent in doc.final_ents:\n", + "for ent in doc.linked_ents:\n", " print(ent, \" - \", ent.cui, \" - \", cat.cdb.cui2info[ent.cui]['preferred_name'])" ] }, @@ -246,7 +246,7 @@ "outputs": [], "source": [ "# To show semantic types for each entity\n", - "for ent in doc.final_ents:\n", + "for ent in doc.linked_ents:\n", " print(ent, \" - \", cat.cdb.cui2info[ent.cui]['type_ids'])" ] }, From de0be557ec08da248ee01a7e3443371beeb35cf5 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 12 Jun 2025 20:50:48 +0100 Subject: [PATCH 65/79] CU-8699049kf: Update for use of convenience method for CDB/Vocab loading --- medcat/1_create_model/create_modelpack/create_modelpack.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/medcat/1_create_model/create_modelpack/create_modelpack.py b/medcat/1_create_model/create_modelpack/create_modelpack.py index ec2f603..750af4b 100644 --- a/medcat/1_create_model/create_modelpack/create_modelpack.py +++ b/medcat/1_create_model/create_modelpack/create_modelpack.py @@ -3,7 +3,6 @@ from medcat.vocab import Vocab from medcat.cdb import CDB from medcat.cat import CAT -from medcat.storage.serialisers import deserialise # relative to file path _FILE_DIR = os.path.dirname(__file__) @@ -42,7 +41,7 @@ def load_cdb_and_save_modelpack(cdb_path: str, # Load cdb cdb: CDB try: - cdb = deserialise(cdb_path) + cdb = CDB.load(cdb_path) except NotADirectoryError: from medcat.utils.legacy.convert_cdb import get_cdb_from_old cdb = get_cdb_from_old(cdb_path) @@ -61,7 +60,7 @@ def load_cdb_and_save_modelpack(cdb_path: str, # Load vocab 
vocab: Vocab
 try:
- vocab = deserialise(vocab_path)
+ vocab = Vocab.load(vocab_path)
 except NotADirectoryError:
 from medcat.utils.legacy.convert_vocab import get_vocab_from_old
 vocab = get_vocab_from_old(vocab_path)

From 26ee9f256165874c93a6329d3fa753da89cc9eb8 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Fri, 13 Jun 2025 10:46:07 +0100
Subject: [PATCH 66/79] CU-8699049kf: Update to v0.8.0

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index b673ae9..fa8556b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 spacy>=3.8.0,<4.0
-medcat[meta-cat,spacy,deid,rel-cat] @ git+https://github.com/CogStack/MedCAT2@v0.7.0
+medcat[meta-cat,spacy,deid,rel-cat] @ git+https://github.com/CogStack/MedCAT2@v0.8.0
 plotly~=5.19.0
 # eland~=8.18.1 # NOTE: there is no numpy2-compatible eland release as of 2025-05-13
 en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl
 ipyfilechooser
 jupyter_contrib_nbextensions

From a2e972649ee6cc83aa09c39839a93db06339d468 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Wed, 16 Jul 2025 13:43:40 +0100
Subject: [PATCH 67/79] CU-8699049kf: Update dependency to latest (PyPI beta) release

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index fa8556b..8824952 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 spacy>=3.8.0,<4.0
-medcat[meta-cat,spacy,deid,rel-cat] @ git+https://github.com/CogStack/MedCAT2@v0.8.0
+medcat[meta-cat,spacy,deid,rel-cat]~=2.0.0b3
 plotly~=5.19.0
 # eland~=8.18.1 # NOTE: there is no numpy2-compatible eland release as of 2025-05-13
 en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl
 ipyfilechooser
 jupyter_contrib_nbextensions

From 423dab9902a83c0a0f70faf33d6d40bcb1050b3b Mon Sep 17 00:00:00 2001
From: mart-r
Date: Wed, 16 Jul 2025 13:54:28 +0100
Subject: [PATCH 68/79] CU-8699049kf: Fix a few v1 vs v2 access issues

---
 .../unsupervised_medcattraining.py | 17 +++++++++--------
 .../unsupervised_training.py | 13 +++++++------
 medcat/3_run_model/run_model.py | 2 +-
 medcat/compare_models/compare.py | 4 ++--
 4 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/medcat/2_train_model/1_unsupervised_training/unsupervised_medcattraining.py b/medcat/2_train_model/1_unsupervised_training/unsupervised_medcattraining.py
index 89c9b51..561465c 100644
--- a/medcat/2_train_model/1_unsupervised_training/unsupervised_medcattraining.py
+++ b/medcat/2_train_model/1_unsupervised_training/unsupervised_medcattraining.py
@@ -28,13 +28,14 @@
 df = cs.DataFrame(index=cogstack_indices, columns=text_columns) # type: ignore
 cat = CAT.load_model_pack(model_pack_path+model_pack_name)
-cat.cdb.print_stats()
-cat.train(data_iterator=df[text_columns].iterrows(),
- nepochs=1,
- fine_tune=True,
- progress_print=10000,
- is_resumed=False)
+print(cat.cdb.get_basic_info())
+cat.trainer.train_unsupervised(
+ data_iterator=df[text_columns].iterrows(),
+ nepochs=1,
+ fine_tune=True,
+ progress_print=10000,
+ is_resumed=False)
-cat.cdb.print_stats()
+print(cat.cdb.get_basic_info())
-cat.create_model_pack(save_dir_path=model_pack_path, model_pack_name=output_modelpack_name)
+cat.save_model_pack(save_dir_path=model_pack_path, model_pack_name=output_modelpack_name)
diff --git a/medcat/2_train_model/1_unsupervised_training/unsupervised_training.py b/medcat/2_train_model/1_unsupervised_training/unsupervised_training.py
index 9a202e0..1d2d567
100644 --- a/medcat/2_train_model/1_unsupervised_training/unsupervised_training.py +++ b/medcat/2_train_model/1_unsupervised_training/unsupervised_training.py @@ -1,3 +1,4 @@ +from medcat.cat import logger as cat_logger from medcat.cat import CAT import pandas as pd import os @@ -44,10 +45,10 @@ # Load modelpack print('Loading modelpack') cat = CAT.load_model_pack(model_pack_path) -cat.log.addHandler(logging.StreamHandler()) # add console output +cat_logger.addHandler(logging.StreamHandler()) # add console output print('STATS:') -cat.cdb.print_stats() +print(cat.cdb.get_basic_info()) # CHANGE AS NEEDED - if the number of spligt files is different all_data_files = [f'split_notes_5M_{nr}.csv' for nr in range(1, 20)] # file containing training material. @@ -55,14 +56,14 @@ # Load training data print('Load data for', i, 'from', data_file) data = pd.read_csv(os.path.join(data_dir, data_file)) - cat.train(data.text.values, progress_print=100) + cat.trainer.train_unsupervised(data.text.values, progress_print=100) print('Stats now, after', i) - cat.cdb.print_stats() + print(cat.cdb.get_basic_info()) # save modelpack - cat.create_model_pack(save_dir_path=model_dir, model_pack_name=f"{output_modelpack}_{i}") + cat.save_model_pack(save_dir_path=model_dir, model_pack_name=f"{output_modelpack}_{i}") # save modelpack - ALL -cat.create_model_pack(save_dir_path=model_dir, model_pack_name=output_modelpack) +cat.save_model_pack(save_dir_path=model_dir, model_pack_name=output_modelpack) diff --git a/medcat/3_run_model/run_model.py b/medcat/3_run_model/run_model.py index 0ee31e2..5e9ab0d 100644 --- a/medcat/3_run_model/run_model.py +++ b/medcat/3_run_model/run_model.py @@ -59,7 +59,7 @@ else: snomed_filter = set(cat.cdb.cui2info.keys()) -cat.config.linking.filters.cuis = snomed_filter +cat.config.components.linking.filters.cuis = snomed_filter del snomed_filter # build query, change as appropriate diff --git a/medcat/compare_models/compare.py b/medcat/compare_models/compare.py index e6ad450..5110c92 100644 --- a/medcat/compare_models/compare.py +++ b/medcat/compare_models/compare.py @@ -154,8 +154,8 @@ def get_diffs_for(model_pack_path_1: str, if show_progress: print("After adding children from 2nd model have a total of", len(cui_filter), "CUIs") - cat1.config.linking.filters.cuis = cui_filter - cat2.config.linking.filters.cuis = cui_filter + cat1.config.components.linking.filters.cuis = cui_filter + cat2.config.components.linking.filters.cuis = cui_filter ann_diffs = get_per_annotation_diffs(cat1, cat2, documents, keep_raw=keep_raw, doc_limit=doc_limit) if show_progress: From 73d343f30a56c2b08813b32cadebc678afba0c89 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 16 Jul 2025 13:59:30 +0100 Subject: [PATCH 69/79] CU-8699049kf: Fix a few more v2 vs v2 access issues --- medcat/evaluate_mct_export/mct_analysis.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/medcat/evaluate_mct_export/mct_analysis.py b/medcat/evaluate_mct_export/mct_analysis.py index 008474e..e8bade4 100644 --- a/medcat/evaluate_mct_export/mct_analysis.py +++ b/medcat/evaluate_mct_export/mct_analysis.py @@ -147,9 +147,10 @@ def concept_summary(self, extra_cui_filter: Optional[str] = None) -> pd.DataFram concept_count_df['variations'], 3) if self.cat: fps,fns,tps,cui_prec,cui_rec,cui_f1,cui_counts,examples = get_stats(self.cat, - data=self.mct_export, + data=self.mct_export, # type: ignore use_project_filters=True, - extra_cui_filter=extra_cui_filter) + # extra_cui_filter=extra_cui_filter + ) 
concept_count_df['fps'] = concept_count_df['cui'].map(fps) concept_count_df['fns'] = concept_count_df['cui'].map(fns) concept_count_df['tps'] = concept_count_df['cui'].map(tps) @@ -262,11 +263,11 @@ def rename_meta_anns(self, meta_anns2rename: dict = dict(), meta_ann_values2rena return def _eval_model(self, model: nn.Module, data: List, config: ConfigMetaCAT, tokenizer: TokenizerWrapperBase) -> Dict: - device = torch.device(config.general['device']) # Create a torch device - batch_size_eval = config.general['batch_size_eval'] - pad_id = config.model['padding_idx'] - ignore_cpos = config.model['ignore_cpos'] - class_weights = config.train['class_weights'] + device = torch.device(config.general.device) # Create a torch device + batch_size_eval = config.general.batch_size_eval + pad_id = config.model.padding_idx + ignore_cpos = config.model.ignore_cpos + class_weights = config.train.class_weights if class_weights is not None: class_weights = torch.FloatTensor(class_weights).to(device) @@ -360,11 +361,11 @@ def full_annotation_df(self) -> pd.DataFrame: # and thus using it for loading is trivial # but here we need to manually load the config from disk config_path = os.path.join(meta_model_path, "meta_cat", "config") - cnf: ConfigMetaCAT = deserialise(config_path) + cnf: ConfigMetaCAT = deserialise(config_path) # type: ignore _meta_model = MetaCATAddon.load_existing( cnf, self.cat._pipeline._tokenizer, meta_model_path) - _meta_model = meta_cat.mc - meta_results = self._eval(_meta_model, self.mct_export) + meta_cat = _meta_model.mc + meta_results = self._eval(meta_cat, self.mct_export) _meta_values = {v: k for k, v in meta_results['meta_values'].items()} pred_meta_values = [] counter = 0 From 10e4bb34015e8999b7a3bdff2e9f81e1a78c48d8 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 16 Jul 2025 14:32:53 +0100 Subject: [PATCH 70/79] CU-8699049kf: Fix a few more v2 vs v2 access issues (unsup training) --- .../1_unsupervised_training/unsupervised_medcattraining.py | 2 +- .../1_unsupervised_training/unsupervised_training.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/medcat/2_train_model/1_unsupervised_training/unsupervised_medcattraining.py b/medcat/2_train_model/1_unsupervised_training/unsupervised_medcattraining.py index 561465c..d82d7f6 100644 --- a/medcat/2_train_model/1_unsupervised_training/unsupervised_medcattraining.py +++ b/medcat/2_train_model/1_unsupervised_training/unsupervised_medcattraining.py @@ -38,4 +38,4 @@ print(cat.cdb.get_basic_info()) -cat.save_model_pack(save_dir_path=model_pack_path, model_pack_name=output_modelpack_name) +cat.save_model_pack(target_folder=model_pack_path, pack_name=output_modelpack_name) diff --git a/medcat/2_train_model/1_unsupervised_training/unsupervised_training.py b/medcat/2_train_model/1_unsupervised_training/unsupervised_training.py index 1d2d567..b62967d 100644 --- a/medcat/2_train_model/1_unsupervised_training/unsupervised_training.py +++ b/medcat/2_train_model/1_unsupervised_training/unsupervised_training.py @@ -62,8 +62,8 @@ print(cat.cdb.get_basic_info()) # save modelpack - cat.save_model_pack(save_dir_path=model_dir, model_pack_name=f"{output_modelpack}_{i}") + cat.save_model_pack(target_folder=model_dir, pack_name=f"{output_modelpack}_{i}") # save modelpack - ALL -cat.save_model_pack(save_dir_path=model_dir, model_pack_name=output_modelpack) +cat.save_model_pack(target_folder=model_dir, pack_name=output_modelpack) From 6ed23e0a63ee88b95edc0e42d5f8e899bb15248c Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 16 Jul 2025 
14:41:43 +0100 Subject: [PATCH 71/79] CU-8699049kf: Fix a few more v2 vs v2 access issues (mct_analysis) --- medcat/evaluate_mct_export/mct_analysis.py | 24 +++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/medcat/evaluate_mct_export/mct_analysis.py b/medcat/evaluate_mct_export/mct_analysis.py index e8bade4..f4ea8be 100644 --- a/medcat/evaluate_mct_export/mct_analysis.py +++ b/medcat/evaluate_mct_export/mct_analysis.py @@ -347,11 +347,11 @@ def full_annotation_df(self) -> pd.DataFrame: & (anns_df['irrelevant'] != True)] meta_df = meta_df.reset_index(drop=True) - for meta_model_card in self.cat.get_model_card(as_dict=True)['MetaCAT models']: - meta_model = meta_model_card['Category Name'] + for meta_model_category in self.cat.get_model_card(as_dict=True)['MetaCAT models']: + meta_model = meta_model_category print(f'Checking metacat model: {meta_model}') if self.is_legacy_model_pack: - meta_cat = get_meta_cat_from_old( + _meta_model = get_meta_cat_from_old( self.model_pack_path + '/meta_' + meta_model, self.cat._pipeline._tokenizer) else: meta_model_path = os.path.join( @@ -364,7 +364,7 @@ def full_annotation_df(self) -> pd.DataFrame: cnf: ConfigMetaCAT = deserialise(config_path) # type: ignore _meta_model = MetaCATAddon.load_existing( cnf, self.cat._pipeline._tokenizer, meta_model_path) - meta_cat = _meta_model.mc + meta_cat = _meta_model.mc meta_results = self._eval(meta_cat, self.mct_export) _meta_values = {v: k for k, v in meta_results['meta_values'].items()} pred_meta_values = [] @@ -393,12 +393,22 @@ def meta_anns_concept_summary(self) -> pd.DataFrame: for cui in meta_df.cui.unique(): temp_meta_df = meta_df[meta_df['cui'] == cui] meta_task_results = {} - for meta_model_card in self.cat.get_model_card(as_dict=True)['MetaCAT models']: - meta_task = meta_model_card['Category Name'] + for meta_task in self.cat.get_model_card(as_dict=True)['MetaCAT models']: list_meta_anns = list(zip(temp_meta_df[meta_task], temp_meta_df['predict_' + meta_task])) counter_meta_anns = Counter(list_meta_anns) meta_value_results: Dict[Tuple[Dict, str, str], Union[int, float]] = {} - for meta_value in meta_model_card['Classes'].keys(): + meta_cats: list[MetaCATAddon] = [ + addon for addon in + self.cat._pipeline.iter_addons() + if (isinstance(addon, MetaCATAddon) and + addon.config.comp_name == meta_task) + ] + if len(meta_cats) != 1: + raise ValueError( + f"Unable to uniquely identify meta task {meta_task}. 
" + f"Found {len(meta_cats)} options") + meta_cat = meta_cats[0] + for meta_value in meta_cat.config.general.category_value2id.keys(): total = 0 fp = 0 fn = 0 From 474c8c1a43be9b34addfa9f6862cede0221cdd6a Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 16 Jul 2025 14:55:23 +0100 Subject: [PATCH 72/79] CU-8699049kf: Fix a few more type issues in mct_analysis --- medcat/evaluate_mct_export/mct_analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat/evaluate_mct_export/mct_analysis.py b/medcat/evaluate_mct_export/mct_analysis.py index f4ea8be..e31150b 100644 --- a/medcat/evaluate_mct_export/mct_analysis.py +++ b/medcat/evaluate_mct_export/mct_analysis.py @@ -396,7 +396,7 @@ def meta_anns_concept_summary(self) -> pd.DataFrame: for meta_task in self.cat.get_model_card(as_dict=True)['MetaCAT models']: list_meta_anns = list(zip(temp_meta_df[meta_task], temp_meta_df['predict_' + meta_task])) counter_meta_anns = Counter(list_meta_anns) - meta_value_results: Dict[Tuple[Dict, str, str], Union[int, float]] = {} + meta_value_results: Dict[Tuple[str, str, str], Union[int, float]] = {} meta_cats: list[MetaCATAddon] = [ addon for addon in self.cat._pipeline.iter_addons() From f0a387487a08ea34c2014f20a8e1272b23c1eb1d Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 16 Jul 2025 14:56:11 +0100 Subject: [PATCH 73/79] CU-8699049kf: Add TODO comment --- medcat/evaluate_mct_export/mct_analysis.py | 1 + 1 file changed, 1 insertion(+) diff --git a/medcat/evaluate_mct_export/mct_analysis.py b/medcat/evaluate_mct_export/mct_analysis.py index e31150b..a28cb1c 100644 --- a/medcat/evaluate_mct_export/mct_analysis.py +++ b/medcat/evaluate_mct_export/mct_analysis.py @@ -397,6 +397,7 @@ def meta_anns_concept_summary(self) -> pd.DataFrame: list_meta_anns = list(zip(temp_meta_df[meta_task], temp_meta_df['predict_' + meta_task])) counter_meta_anns = Counter(list_meta_anns) meta_value_results: Dict[Tuple[str, str, str], Union[int, float]] = {} + # TODO: maybe make this easier? 
meta_cats: list[MetaCATAddon] = [ addon for addon in self.cat._pipeline.iter_addons() From cabcfee531518803fe9bbc184bcf88b64a03cd24 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 16 Jul 2025 15:07:38 +0100 Subject: [PATCH 74/79] CU-8699049kf: Fix MetaCAT related notebooks --- .../meta_annotation_training.ipynb | 14 +++++--------- .../meta_annotation_training_advanced.ipynb | 14 +++++--------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb b/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb index e54254d..68c4f91 100644 --- a/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb +++ b/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb @@ -144,13 +144,9 @@ "# NOTE: we need to provide a BaseTokenizer to add the relevant additional data paths\n", "# to the relevant Entity/Span and Document implementation\n", "# we'll use the regex tokenizer here for example since it's easier to initialise\n", - "# but you can use a spacy-based one, you just need to also pass:\n", - "# - the model name (e.g 'en_core_web_md')\n", - "# - the names of the disabled components (e.g ['ner', 'parser', 'vectors', 'textcat',\n", - "# 'entity_linker', 'sentencizer', 'entity_ruler', 'merge_noun_chunks', 'merge_entities', 'merge_subtokens'])\n", - "# - whether diacritics should be used\n", - "# - max document length (e.g 1_000_000)\n", - "base_tokenizer = create_tokenizer(\"regex\")\n", + "# but you can use a spacy-based one, you just need to also pass the appropriate config\n", + "from medcat.config import Config\n", + "base_tokenizer = create_tokenizer(\"regex\", Config())\n", "for meta_model in meta_model_names:\n", " meta_cat_path = os.path.join(base_dir_meta_models, exp_start + meta_model)\n", " if model_is_legacy:\n", @@ -228,7 +224,7 @@ ], "metadata": { "kernelspec": { - "display_name": "venv_v2", + "display_name": "venv_v2_311", "language": "python", "name": "python3" }, @@ -242,7 +238,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.12" } }, "nbformat": 4, diff --git a/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb b/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb index c9abe82..d7bf05b 100644 --- a/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb +++ b/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb @@ -117,13 +117,9 @@ "# NOTE: we need to provide a BaseTokenizer to add the relevant additional data paths\n", "# to the relevant Entity/Span and Document implementation\n", "# we'll use the regex tokenizer here for example since it's easier to initialise\n", - "# but you can use a spacy-based one, you just need to also pass:\n", - "# - the model name (e.g 'en_core_web_md')\n", - "# - the names of the disabled components (e.g ['ner', 'parser', 'vectors', 'textcat',\n", - "# 'entity_linker', 'sentencizer', 'entity_ruler', 'merge_noun_chunks', 'merge_entities', 'merge_subtokens'])\n", - "# - whether diacritics should be used\n", - "# - max document length (e.g 1_000_000)\n", - "base_tokenizer = create_tokenizer(\"regex\")" + "# but you can use a spacy-based one, you just need to also pass the appropraite config\n", + "from medcat.config import Config\n", + "base_tokenizer = create_tokenizer(\"regex\", Config())" ] }, { @@ -339,7 +335,7 @@ ], 
"metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "venv_v2_311", "language": "python", "name": "python3" }, @@ -353,7 +349,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.11.12" } }, "nbformat": 4, From 95911cc9a8063c73069142e4a9762bdb4ad8f14d Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 16 Jul 2025 15:45:43 +0100 Subject: [PATCH 75/79] CU-8699049kf: Update MetaCAT notebooks with overhaul by Shubham --- .../meta_annotation_training.ipynb | 337 ++++++++++++------ .../meta_annotation_training_advanced.ipynb | 279 +++++---------- 2 files changed, 311 insertions(+), 305 deletions(-) diff --git a/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb b/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb index 68c4f91..d0ed2f0 100644 --- a/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb +++ b/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb @@ -11,19 +11,14 @@ "import os\n", "from datetime import date\n", "from medcat.cat import CAT\n", - "from medcat.components.addons.meta_cat.meta_cat import MetaCATAddon\n", + "from medcat.components.addons.meta_cat import MetaCAT, MetaCATAddon\n", "from medcat.config.config_meta_cat import ConfigMetaCAT\n", - "from medcat.components.addons.meta_cat.mctokenizers.bpe_tokenizer import TokenizerWrapperBPE\n", - "from medcat.components.addons.meta_cat.mctokenizers.bert_tokenizer import TokenizerWrapperBERT\n", - "from medcat.utils.legacy.identifier import is_legacy_model_pack\n", - "from medcat.storage.serialisers import deserialise\n", - "from medcat.tokenizing.tokenizers import create_tokenizer\n", - "from tokenizers import ByteLevelBPETokenizer" + "from medcat.components.addons.meta_cat.mctokenizers.tokenizers import TokenizerWrapperBERT" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "ca80af0e", "metadata": {}, "outputs": [], @@ -35,10 +30,10 @@ }, { "cell_type": "markdown", - "id": "5d0606ec", + "id": "f310cef3", "metadata": {}, "source": [ - "# Set parameters" + "### Load the model pack with MetaCATs\n" ] }, { @@ -48,82 +43,226 @@ "metadata": {}, "outputs": [], "source": [ - "# relative path to working_with_cogstack folder\n", - "_rel_path = os.path.join(\"..\", \"..\", \"..\")\n", - "# absolute path to working_with_cogstack folder\n", - "base_path = os.path.abspath(_rel_path)\n", - "# Load mct export\n", - "ann_dir = os.path.join(base_path, \"data\", \"medcattrainer_export\")\n", - "\n", - "mctrainer_export_path = ann_dir + \"\" # name of your mct export\n", - "\n", + "model_pack = '' # .zip model pack location \n", + "mctrainer_export = \"\" # name of your mct export" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "921d5e9e", + "metadata": {}, + "outputs": [], + "source": [ "# Load model\n", - "model_dir = os.path.join(base_path, \"models\", \"modelpack\")\n", - "modelpack = '' # name of modelpack\n", - "model_pack_path = os.path.join(model_dir, modelpack)\n", - " #output_modelpack = model_dir + f\"{today}_trained_model\"\n", + "cat = CAT.load_model_pack(model_pack)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b205d51b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are: 3 meta cat models in this model pack.\n" + ] + } + ], + "source": [ + "def get_meta_cats(cat: CAT) -> list[MetaCATAddon]:\n", + " return [\n", + " 
addon for addon in cat._pipeline.iter_addons()\n", + " if isinstance(addon, MetaCATAddon)\n", + " ]\n", + "meta_cats = get_meta_cats(cat)\n", + "# Check what meta cat models are in this model pack.\n", + "print(f'There are: {len(meta_cats)} meta cat models in this model pack.')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31d7632a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"Category Name\": \"Temporality\",\n", + " \"Description\": \"No description\",\n", + " \"Classes\": {\n", + " \"Past\": 0,\n", + " \"Recent\": 1,\n", + " \"Future\": 2\n", + " },\n", + " \"Model\": \"bert\"\n", + "}\n" + ] + } + ], + "source": [ + "print(meta_cats[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9180c4c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"Category Name\": \"Presence\",\n", + " \"Description\": \"No description\",\n", + " \"Classes\": {\n", + " \"Hypothetical (N/A)\": 1,\n", + " \"Not present (False)\": 0,\n", + " \"Present (True)\": 2\n", + " },\n", + " \"Model\": \"bert\"\n", + "}\n" + ] + } + ], + "source": [ + "print(meta_cats[1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "275ca9ff", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"Category Name\": \"Experiencer\",\n", + " \"Description\": \"No description\",\n", + " \"Classes\": {\n", + " \"Family\": 1,\n", + " \"Other\": 0,\n", + " \"Patient\": 2\n", + " },\n", + " \"Model\": \"bert\"\n", + "}\n" + ] + } + ], + "source": [ + "print(meta_cats[2])" + ] + }, + { + "cell_type": "markdown", + "id": "3047b1d9", + "metadata": {}, + "source": [ + " NOTE: \n", + " The name for the classification task can vary. E.g: The Category Name for 'Experiencer' can be 'Subject', as it has been configured an annoated in MedCATTrainer this way, but the model expects 'Experiencer'\n", + " \n", + " To accomodate for this, we have a list that stores the variations for the alternate names. This attribute can be found under `mc.config.general.alternative_category_names`\n", "\n", - "# will be used to date the trained model\n", - "today = str(date.today())\n", - "today = today.replace(\"-\",\"\")\n", + "E.g. for Experiencer, it will be pre-loaded as alternative_category_names = ['Experiencer','Subject']\n", "\n", - "# Initialise meta_ann models\n", - "if model_pack_path[-4:] == '.zip':\n", - " base_dir_meta_models = model_pack_path[:-4]\n", - "else:\n", - " base_dir_meta_models = model_pack_path\n", + "Set this list to ensure during training / fine-tuning the model is aware of alternative names for classes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ca00fb0", + "metadata": {}, + "outputs": [], + "source": [ + "print(meta_cats[0].config.general.alternative_category_names)" + ] + }, + { + "cell_type": "markdown", + "id": "5dba296c", + "metadata": {}, + "source": [ + "💡 In case you are using older modelpacks, the above field will be empty. 
In that case, " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92e41964", + "metadata": {}, + "outputs": [], + "source": [ + "# Only run in case the above output is an empty list\n", + "category_name_mapping = [[\"Presence\"],[\"Temporality\",\"Time\"],[\"Experiencer\",\"Subject\"]]\n", + "lookup = {item: group for group in category_name_mapping for item in group}\n", "\n", - "# Iterate through the meta_models contained in the model\n", - "meta_model_names = [] # These Meta_annotation tasks should correspond to the ones labelled in the mcttrainer export\n", - "model_is_legacy = is_legacy_model_pack(base_dir_meta_models)\n", - "if model_is_legacy:\n", - " # NOTE: when loaded, will be auto-converted\n", - " exp_start = \"meta_\"\n", - " config_path = [\"config.json\"]\n", - "else:\n", - " exp_start = \"addon_meta_cat\"\n", - " base_dir_meta_models = os.path.join(base_dir_meta_models, \"saved_components\")\n", - " config_path = [\"meta_cat\", \"config\"]\n", - "for dirpath, dirnames, filenames in os.walk(base_dir_meta_models):\n", - " for dirname in dirnames:\n", - " if dirname.startswith(exp_start):\n", - " meta_model_names.append(dirname[len(exp_start):])" + "for meta_model in range(len(meta_cats)):\n", + " meta_cats[meta_model].config.general.alternative_category_names = lookup.get(meta_cats[meta_model].config.general.category_name)" ] }, { "cell_type": "markdown", - "id": "35aa5605", + "id": "12e91f77", "metadata": {}, "source": [ - "Before you run the next section please double check that the model meta_annotation names matches to those specified in the mct export.\n", - "\n" + " NOTE: \n", + " The name for the classes can vary too. Some sites may have trained a MetaCAT model for the same task, but called a class value a slightly different name.\n", + " \n", + " E.g: For the Presence task, the class name can be 'Not present (False)' or 'False'\n", + " \n", + " To accomodate for this, we have a mapping that stores the variations for the alternate names. This attribute can be found under `mc.config.general.alternative_class_names`\n", + "\n", + " E.g. for Presence, it will be pre-loaded as alternative_class_names = [[\"Hypothetical (N/A)\",\"Hypothetical\"],[\"Not present (False)\",\"False\"],[\"Present (True)\",\"True\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f6b06e2", + "metadata": {}, + "outputs": [], + "source": [ + "print(meta_cats[0].config.general.alternative_class_names)" ] }, { "cell_type": "markdown", - "id": "8bf6f5c3", + "id": "3c97c986", "metadata": {}, "source": [ - "Depending on the model pack you have, please run the LSTM model or BERT model section.
\n", - "If you are unsure, use this section to check the model type." + "💡 In case you are using older modelpacks, the above field will be empty. In that case, please run the following code:" ] }, { "cell_type": "code", "execution_count": null, - "id": "2933f7e1", + "id": "0fdfae70", "metadata": {}, "outputs": [], "source": [ - "for meta_model in meta_model_names:\n", - " config_path = os.path.join(base_dir_meta_models, exp_start + meta_model, *config_path)\n", - " if model_is_legacy:\n", - " with open(config_path, 'r') as jfile:\n", - " config_dict = json.load(jfile)\n", - " print(f\"Model used for meta_{meta_model}:\", config_dict['model']['model_name'])\n", - " else:\n", - " cnf: ConfigMetaCAT = deserialise(config_path)\n", - " print(f\"Model used for meta_{meta_model}:\", config_dict.model.model_name)" + "# Only run in case the above output is an empty list\n", + "class_name_mapping = {\n", + " \"Temporality\": [[\"Past\"], [\"Recent\", \"Present\"], [\"Future\"]],\n", + " \"Time\": [[\"Past\"], [\"Recent\", \"Present\"], [\"Future\"]],\n", + " \"Experiencer\": [[\"Family\"], [\"Other\"], [\"Patient\"]],\n", + " \"Subject\": [[\"Family\"], [\"Other\"], [\"Patient\"]],\n", + " \"Presence\": [[\"Hypothetical (N/A)\", \"Hypothetical\"], [\"Not present (False)\", \"False\"], [\"Present (True)\", \"True\"]]\n", + "}\n", + "\n", + "for meta_model in range(len(meta_cats)):\n", + " meta_cats[meta_model].config.general.alternative_class_names = class_name_mapping[meta_cats[meta_model].config.general.category_name]" ] }, { @@ -141,39 +280,22 @@ "metadata": {}, "outputs": [], "source": [ - "# NOTE: we need to provide a BaseTokenizer to add the relevant additional data paths\n", - "# to the relevant Entity/Span and Document implementation\n", - "# we'll use the regex tokenizer here for example since it's easier to initialise\n", - "# but you can use a spacy-based one, you just need to also pass the appropriate config\n", - "from medcat.config import Config\n", - "base_tokenizer = create_tokenizer(\"regex\", Config())\n", - "for meta_model in meta_model_names:\n", - " meta_cat_path = os.path.join(base_dir_meta_models, exp_start + meta_model)\n", - " if model_is_legacy:\n", - " from medcat.utils.legacy.convert_meta_cat import get_meta_cat_from_old\n", - " meta_cat: MetaCATAddon = get_meta_cat_from_old(meta_cat_path, base_tokenizer)\n", - " else:\n", - " # NOTE: the expected workflow when loading the model\n", - " # is one where the config is stored as part of the overall config\n", - " # and thus using it for loading is trivial\n", - " # but here we need to manually load the config from disk\n", - " cnf_path = os.path.join(meta_cat_path, \"config\")\n", - " cnf: ConfigMetaCAT = deserialise(cnf_path)\n", - " # load the meta_model\n", - " meta_cat = MetaCATAddon.load_existing(cnf, base_tokenizer, os.path.join(base_dir_meta_models, exp_start + meta_model))\n", - " mc = meta_cat.mc\n", + "# Train the first meta cat model - 'Temporality' Task.\n", + "meta_cat: MetaCATAddon = meta_cats[0]\n", "\n", - " # changing parameters\n", - " mc.config.train.nepochs = 15\n", + "# to overwrite the existing model, resave the fine-tuned model with the same model pack dir\n", + "meta_cat_task = meta_cat.config.general.category_name\n", + "model_pack_dir = ''\n", + "save_dir_path = os.path.join(model_pack_dir,\"meta_\"+ meta_cat_task)\n", "\n", - " save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. 
\n", - " #Ideally this should replace the meta_models inside the modelpack\n", + "# to save the new model elsewhere, uncomment the below line\n", + "#save_dir_path= \"test_meta_\"+meta_cat_task # Where to save the meta_model and results. \n", "\n", - " # train the meta_model\n", - " results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n", - " \n", - " # Save results\n", - " json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results.json'), 'w'))" + "# train the meta_model\n", + "results = meta_cat.mc.train_from_json(mctrainer_export, save_dir_path=save_dir_path)\n", + "\n", + "# Save results\n", + "json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_cat_task+'_results.json'), 'w'))" ] }, { @@ -181,7 +303,8 @@ "id": "ab23e424", "metadata": {}, "source": [ - "## If you dont have the model packs, and are training from scratch" + "## If you dont have the model packs, and are training from scratch\n", + "⚠️This is very rare, it is recommended to always use the model packs and then fine-tune them" ] }, { @@ -197,36 +320,26 @@ "# config.general['category_name']\n", "\n", "# change model name if training BERT for the first time\n", - "config.model.model_name = 'bert'\n", + "config.model['model_name'] = 'bert'\n", "\n", - "save_dir_path= \"test_meta\" # Where to save the meta_model and results. \n", - "#Ideally this should replace the meta_models inside the modelpack\n", + "tokenizer = TokenizerWrapperBERT.load(\"\", config.model['model_variant'])\n", "\n", - "# NOTE: we need to provide a BaseTokenizer to add the relevant additional data paths\n", - "# to the relevant Entity/Span and Document implementation\n", - "# we'll use the regex tokenizer here for example since it's easier to initialise\n", - "# but you can use a spacy-based one, you just need to also pass:\n", - "# - the model name (e.g 'en_core_web_md')\n", - "# - the names of the disabled components (e.g ['ner', 'parser', 'vectors', 'textcat',\n", - "# 'entity_linker', 'sentencizer', 'entity_ruler', 'merge_noun_chunks', 'merge_entities', 'merge_subtokens'])\n", - "# - whether diacritics should be used\n", - "# - max document length (e.g 1_000_000)\n", - "base_tokenizer = create_tokenizer(\"regex\")\n", + "save_dir_path= \"test_meta_\" + meta_cat_task # Where to save the meta_model and results. 
\n", "\n", "# Initialise and train meta_model\n", - "mc = MetaCATAddon.create_new(config, base_tokenizer)\n", - "results = mc.mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n", + "mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n", + "results = mc.train_from_json(mctrainer_export, save_dir_path=save_dir_path)\n", "\n", "# Save results\n", - "json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results.json'), 'w'))" + "json.dump(results['report'], open(os.path.join(save_dir_path,'meta_' + meta_cat_task+'_results.json'), 'w'))" ] } ], "metadata": { "kernelspec": { - "display_name": "venv_v2_311", + "display_name": "Python [conda env:cattrainer]", "language": "python", - "name": "python3" + "name": "conda-env-cattrainer-py" }, "language_info": { "codemirror_mode": { @@ -238,7 +351,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.12" + "version": "3.11.11" } }, "nbformat": 4, diff --git a/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb b/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb index d7bf05b..a94852d 100644 --- a/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb +++ b/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb @@ -1,13 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "id": "ae1fe3b4", - "metadata": {}, - "source": [ - "### This notebook is an advanced tutorial detailing the config changes for optimising the BERT and LSTM models for Experiencer classification task on custom dataset" - ] - }, { "cell_type": "code", "execution_count": null, @@ -19,14 +11,9 @@ "import os\n", "from datetime import date\n", "from medcat.cat import CAT\n", - "from medcat.components.addons.meta_cat.meta_cat import MetaCATAddon, MetaCAT\n", + "from medcat.components.addons.meta_cat import MetaCAT, MetaCATAddon\n", "from medcat.config.config_meta_cat import ConfigMetaCAT\n", - "from medcat.components.addons.meta_cat.mctokenizers.bpe_tokenizer import TokenizerWrapperBPE\n", - "from medcat.components.addons.meta_cat.mctokenizers.bert_tokenizer import TokenizerWrapperBERT\n", - "from medcat.utils.legacy.identifier import is_legacy_model_pack\n", - "from medcat.storage.serialisers import deserialise\n", - "from medcat.tokenizing.tokenizers import create_tokenizer\n", - "from tokenizers import ByteLevelBPETokenizer" + "from medcat.components.addons.meta_cat.mctokenizers.tokenizers import TokenizerWrapperBERT" ] }, { @@ -38,158 +25,91 @@ "source": [ "# if you want to enable info level logging\n", "import logging\n", - "logging.basicConfig(level=logging.INFO,force=True)\n", - "logger = logging.getLogger(__name__)" + "logging.basicConfig(level=logging.INFO,force=True)" ] }, { "cell_type": "markdown", - "id": "5d0606ec", + "id": "b1c5b9b0", "metadata": {}, "source": [ - "# Set parameters" + "#### 💡 To understand the model loading and other functionalities, please refer to the 'meta_annotation_training.ipynb' notebook" ] }, { "cell_type": "code", - "execution_count": null, - "id": "dd7a2e97", + "execution_count": 3, + "id": "a2c0431f", "metadata": {}, "outputs": [], "source": [ - "# relative path to working_with_cogstack folder\n", - "_rel_path = os.path.join(\"..\", \"..\", \"..\")\n", - "# absolute path to working_with_cogstack folder\n", - "base_path = os.path.abspath(_rel_path)\n", - "# Load mct export\n", - "ann_dir = os.path.join(base_path, 
\"data\", \"medcattrainer_export\")\n", - "\n", - "mctrainer_export_path = ann_dir + \"\" # name of your mct export\n", - "\n", - "# Load model\n", - "model_dir = os.path.join(base_path, \"models\", \"modelpack\")\n", - "modelpack = '' # name of modelpack\n", - "model_pack_path = os.path.join(model_dir, modelpack)\n", - " #output_modelpack = model_dir + f\"{today}_trained_model\"\n", - "\n", - "# will be used to date the trained model\n", - "today = str(date.today())\n", - "today = today.replace(\"-\",\"\")\n", - "\n", - "# Initialise meta_ann models\n", - "if model_pack_path[-4:] == '.zip':\n", - " base_dir_meta_models = model_pack_path[:-4]\n", - "else:\n", - " base_dir_meta_models = model_pack_path\n", - "\n", - "# Iterate through the meta_models contained in the model\n", - "meta_model_names = [] # These Meta_annotation tasks should correspond to the ones labelled in the mcttrainer export\n", - "model_is_legacy = is_legacy_model_pack(base_dir_meta_models)\n", - "if model_is_legacy:\n", - " # NOTE: when loaded, will be auto-converted\n", - " exp_start = \"meta_\"\n", - " config_path = [\"config.json\"]\n", - "else:\n", - " exp_start = \"addon_meta_cat\"\n", - " base_dir_meta_models = os.path.join(base_dir_meta_models, \"saved_components\")\n", - " config_path = [\"meta_cat\", \"config\"]\n", - "for dirpath, dirnames, filenames in os.walk(base_dir_meta_models):\n", - " for dirname in dirnames:\n", - " if dirname.startswith(exp_start):\n", - " meta_model_names.append(dirname[len(exp_start):])" + "model_pack = '' # .zip model pack location\n", + "mctrainer_export = \"\" # name of your mct export" ] }, { "cell_type": "markdown", - "id": "0b763d35", + "id": "808c27c1", "metadata": {}, "source": [ - "Run this before continuing." + "We won't load the models at this stage as they need to be seperately loaded later.
Let's check for meta models in the directory" ] }, { "cell_type": "code", - "execution_count": null, - "id": "08f8d879", + "execution_count": 4, + "id": "675eab49", "metadata": {}, "outputs": [], "source": [ - "# NOTE: we need to provide a BaseTokenizer to add the relevant additional data paths\n", - "# to the relevant Entity/Span and Document implementation\n", - "# we'll use the regex tokenizer here for example since it's easier to initialise\n", - "# but you can use a spacy-based one, you just need to also pass the appropraite config\n", - "from medcat.config import Config\n", - "base_tokenizer = create_tokenizer(\"regex\", Config())" - ] - }, - { - "cell_type": "markdown", - "id": "35aa5605", - "metadata": {}, - "source": [ - "Before you run the next section please double check that the model meta_annotation names matches to those specified in the mct export.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "699be74b", - "metadata": {}, - "source": [ - "# Class weights " + "# Iterate through the meta_models contained in the model\n", + "meta_model_names = []\n", + "for dirpath, dirnames, filenames in os.walk(model_pack):\n", + " for dirname in dirnames:\n", + " if dirname.startswith('meta_'):\n", + " meta_model_names.append(dirname[5:])\n", + "\n", + "print(\"Meta models:\",meta_model_names)" ] }, { "cell_type": "markdown", - "id": "e624d876", + "id": "9e499198", "metadata": {}, "source": [ + "# Class weights \n", + "\n", "Adjusting class weights to give more importance to specific classes. Generally, class weights are used in favour of minority classes(classes with less number of samples) to boost their performance.\n", "

To use class weights, we have 2 options:\n", "
1. calculate class weights based on class distribution\n", - "
2. using specified class weights" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc91f7d6", - "metadata": {}, - "outputs": [], - "source": [ - "mc: MetaCAT\n", - "#option 1\n", - "mc.config.train['class_weights'] = []\n", - "mc.config.train['compute_class_weights'] = True\n", - "#NOTE: this will only be applicable if mc.config.train.class_weights is empty\n", - "#2nd option\n", - "#using specified class weights\n", - "mc.config.train['class_weights'] = [0.4,0.3,0.1]" + "
2. using specified class weights\n", + "\n", + "\n", + "#option 1
\n", + "metacat.config.train['class_weights'] = []
\n", + "metacat.config.train['compute_class_weights'] = True
\n", + "
\n", + "#option 2
\n", + "metacat.config.train['class_weights'] = [0.4,0.3,0.1]
" ] }, { "cell_type": "markdown", - "id": "c217762f", + "id": "fc07f3e9", "metadata": {}, "source": [ - "NOTE: Make sure to correctly map the class weights to their corresponding class index (ID).
To check the index assigned to the classes, use:
`print(mc.config.general.category_value2id)`\n", + "NOTE: Make sure to correctly map the class weights to their corresponding class index.
To check the index assigned to the classes, use:
`print(mc.config.general['category_value2id'])`\n", "
This will print a dictionary where the class names and their corresponding IDs (indices) are displayed.
\n", "The first position in the class weight list corresponds to the class with ID 0 in the dictionary, and so on." ] }, { "cell_type": "markdown", - "id": "c3002ef0", - "metadata": {}, - "source": [ - "# 2 phase learning for training" - ] - }, - { - "cell_type": "markdown", - "id": "a349af2b", + "id": "6a92aa60", "metadata": {}, "source": [ + "# 2 phase learning for training\n", + "\n", "2 phase learning is used to mitigate class imbalance. In 2 phase learning, the models are trained twice:
\n", "Phase 1: trains for minority class(es) by undersampling data so that there is no class imbalance\n", "
Phase 2: trains for all classes\n", @@ -198,109 +118,87 @@ "
Phase 2 is when the model is expected to learn the majority class as it is trained on the entire dataset.\n", "\n", "Paper reference - https://ieeexplore.ieee.org/document/7533053\n", - "
NOTE: Make sure to use class weights in favour of minority classes with 2 phase learning" + "
Make sure to use class weights in favour of minority classes with 2 phase learning" ] }, { "cell_type": "code", - "execution_count": null, - "id": "8ff613ef", + "execution_count": 5, + "id": "5a86b839", "metadata": {}, "outputs": [], "source": [ - "def load_meta_cat_from_file(meta_cat_path: str, config_name: str = 'config.json') -> MetaCATAddon:\n", - " config_path = os.path.join(meta_cat_path, config_name)\n", - " with open(config_path) as f:\n", - " config_dict = json.load(f)\n", - "\n", - " meta_cat_path = os.path.join(base_dir_meta_models, exp_start + meta_model)\n", - " if model_is_legacy:\n", - " from medcat.utils.legacy.convert_meta_cat import get_meta_cat_from_old\n", - " meta_cat: MetaCATAddon = get_meta_cat_from_old(meta_cat_path, base_tokenizer)\n", - " else:\n", - " # NOTE: the expected workflow when loading the model\n", - " # is one where the config is stored as part of the overall config\n", - " # and thus using it for loading is trivial\n", - " # but here we need to manually load the config from disk\n", - " cnf_path = os.path.join(meta_cat_path, \"config\")\n", - " cnf: ConfigMetaCAT = deserialise(cnf_path)\n", - " # load the meta_model\n", - " meta_cat = MetaCATAddon.load_existing(cnf, base_tokenizer, os.path.join(base_dir_meta_models, exp_start + meta_model))\n", - " return meta_cat\n", "#--------------------------------Phase 1--------------------------------\n", - "def run_phase_1(meta_model, class_wt_phase1 = None):\n", + "def run_phase_1(meta_model,class_wt_phase1 = None):\n", " #Loading the pre-defined config for phase 1\n", - " # NOTE: the original (v1) version contained loading a different config\n", - " # for each phase, but I do not know how these files would have been saved there\n", - " # and thus don't know what the indent was\n", - " # config_ph_1_path = os.path.join(base_dir_meta_models,\"meta_\"+meta_model,\"config_ph1.json\")\n", - " meta_cat_path = os.path.join(base_dir_meta_models, exp_start + meta_model)\n", - " meta_cat = load_meta_cat_from_file(meta_cat_path)\n", - " mc = meta_cat.mc\n", + " config_ph_1_path = os.path.join(model_pack,\"meta_\"+meta_model,\"config_ph1.json\")\n", + " with open(config_ph_1_path) as f:\n", + " config_ph1 = json.load(f)\n", + " mc = MetaCAT.load(save_dir_path=os.path.join(model_pack,\"meta_\"+meta_model),config_dict = config_ph1)\n", "\n", " if class_wt_phase1:\n", - " mc.config.train.class_weights = class_wt_phase1\n", + " mc.config.train['class_weights'] = class_wt_phase1\n", "\n", - " mc.config.train.nepochs = 30 #You can change the number of epochs, remember to keep them higher for phase 1\n", + " #You can change the number of epochs, remember to keep them higher for phase 1\n", + " mc.config.train['nepochs'] = 40 \n", "\n", - " save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. 
\n", - " results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n", + " results = mc.train_from_json(mctrainer_export, save_dir_path=save_dir_path)\n", " # Save results\n", " json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results_phase1.json'), 'w'))\n", "\n", "#--------------------------------Phase 2--------------------------------\n", - "def run_phase_2(meta_model, class_wt_phase2 = None): \n", + "def run_phase_2(meta_model,class_wt_phase2 = None): \n", " #Loading the pre-defined config for phase 2\n", - " # NOTE: the original (v1) version contained loading a different config\n", - " # for each phase, but I do not know how these files would have been saved there\n", - " # and thus don't know what the indent was\n", - " # config_ph_2_path = os.path.join(base_dir_meta_models,\"meta_\"+meta_model,\"config_ph2.json\")\n", - " meta_cat_path = os.path.join(base_dir_meta_models, exp_start + meta_model)\n", - " meta_cat = load_meta_cat_from_file(meta_cat_path)\n", - " mc = meta_cat.mc\n", + " config_ph_2_path = os.path.join(model_pack,\"meta_\"+meta_model,\"config_ph2.json\")\n", + " with open(config_ph_2_path) as f:\n", + " config_ph2 = json.load(f)\n", + "\n", + " mc = MetaCAT.load(save_dir_path=os.path.join(model_pack,\"meta_\"+meta_model),config_dict = config_ph2)\n", "\n", " if class_wt_phase2:\n", - " mc.config.train.class_weights = class_wt_phase2\n", + " mc.config.train['class_weights'] = class_wt_phase2\n", "\n", - " mc.config.train.nepochs = 15\n", + " #You can change the number of epochs\n", + " mc.config.train['nepochs'] = 20\n", "\n", - " save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. Ensure to keep this same as Phase 1\n", - " results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n", + " results = mc.train_from_json(mctrainer_export, save_dir_path=save_dir_path)\n", " # Save results\n", " json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results_phase2.json'), 'w'))\n", "\n", "#--------------------------------Driver--------------------------------\n", - "for meta_model in meta_model_names:\n", - " #To use your own class weights instead of the pre-defined ones for the 2 phases, uncomment the below lines\n", - " '''class_wt_phase1 = []\n", - " class_wt_phase2 = []'''\n", + "# Train the first meta cat model\n", + "meta_model = meta_model_names[0]\n", "\n", - " # Train 2 phase learning\n", - " logger.info(\"\\n********************Beginning Phase 1********************\")\n", - " run_phase_1(meta_model,class_wt_phase1)\n", - " logger.info(\"\\n********************Beginning Phase 2********************\")\n", - " run_phase_2(meta_model,class_wt_phase2)" - ] - }, - { - "cell_type": "markdown", - "id": "b3d43a3b", - "metadata": {}, - "source": [ - "# Oversampling data" + "# to overwrite the existing model, resave the fine-tuned model with the same model pack dir\n", + "meta_cat_task = meta_model\n", + "save_dir_path = os.path.join(model_pack,\"meta_\"+ meta_cat_task)\n", + "\n", + "# To use your own class weights instead of the pre-defined ones for the 2 phases, put the weights in the lists below\n", + "class_wt_phase1 = [] # Example [0.4,0.4,0.2]\n", + "class_wt_phase2 = [] # Example [0.4,0.3,0.3]\n", + "\n", + "\n", + "# Train 2 phase learning\n", + "print(\"*** Training meta cat: \",meta_model)\n", + "print(\"Beginning Phase 1...\")\n", + "run_phase_1(meta_model,class_wt_phase1)\n", + "print(\"Beginning Phase 2...\")\n", + 
"run_phase_2(meta_model,class_wt_phase2)" ] }, { "cell_type": "markdown", - "id": "ca9b70b3", + "id": "60f0e878", "metadata": {}, "source": [ + "# Generating synthetic data\n", + "\n", "You can generate synthetic data to help mitigate class imbalance.
Use this code to generate synthetic data using LLM - [link](https://gist.github.com/shubham-s-agarwal/401ef8bf6cbbd66fa0c76a8fbfc1f6c4)
NOTE: the generated data will require manual quality check to ensure that high quality and relevant data is used for training. " ] }, { "cell_type": "markdown", - "id": "5835eb2b", + "id": "431e1002", "metadata": {}, "source": [ "The data generated from the gist code and the format of the data required by MedCAT are different, requiring manual formatting at the moment. We will update this module to include the code to handle the same." @@ -309,24 +207,19 @@ { "cell_type": "code", "execution_count": null, - "id": "8161b602", + "id": "4d07d437", "metadata": {}, "outputs": [], "source": [ "# To run the training with original + synthetic data\n", - "# Follow all the same steps till initializing the metacat model\n", - "\n", - "config = ConfigMetaCAT()\n", - "\n", - "# Initialise and train meta_model\n", - "mc = MetaCATAddon.create_new(config, base_tokenizer)\n", + "# Follow all the same steps till and load the model\n", "\n", "# the format expected is [[['text','of','the','document'], [index of medical entity], \"label\" ],\n", "# ['text','of','the','document'], [index of medical entity], \"label\" ]]\n", "\n", "synthetic_data_export = [[],[],[]]\n", "\n", - "results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path,data_oversampled=synthetic_data_export)\n", + "results = meta_model.train_from_json(mctrainer_export, save_dir_path=save_dir_path,data_oversampled=synthetic_data_export)\n", "\n", "# Save results\n", "json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results.json'), 'w'))" @@ -335,7 +228,7 @@ ], "metadata": { "kernelspec": { - "display_name": "venv_v2_311", + "display_name": "pytorch_medcat_clean", "language": "python", "name": "python3" }, @@ -349,7 +242,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.12" + "version": "3.10.14" } }, "nbformat": 4, From 133e047eebf97c455a33506fd64a8ddc23305ecd Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 16 Jul 2025 15:51:15 +0100 Subject: [PATCH 76/79] CU-8699049kf: Fix a few simple import issues --- .../2_supervised_training/meta_annotation_training.ipynb | 2 +- .../meta_annotation_training_advanced.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb b/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb index d0ed2f0..af5d206 100644 --- a/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb +++ b/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb @@ -13,7 +13,7 @@ "from medcat.cat import CAT\n", "from medcat.components.addons.meta_cat import MetaCAT, MetaCATAddon\n", "from medcat.config.config_meta_cat import ConfigMetaCAT\n", - "from medcat.components.addons.meta_cat.mctokenizers.tokenizers import TokenizerWrapperBERT" + "from medcat.components.addons.meta_cat.mctokenizers.bert_tokenizer import TokenizerWrapperBERT" ] }, { diff --git a/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb b/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb index a94852d..fb85f6c 100644 --- a/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb +++ b/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb @@ -13,7 +13,7 @@ "from medcat.cat import CAT\n", "from medcat.components.addons.meta_cat import MetaCAT, MetaCATAddon\n", "from medcat.config.config_meta_cat import 
ConfigMetaCAT\n", - "from medcat.components.addons.meta_cat.mctokenizers.tokenizers import TokenizerWrapperBERT" + "from medcat.components.addons.meta_cat.mctokenizers.bert_tokenizer import TokenizerWrapperBERT" ] }, { From 298ede03bdc2ef64496b8d2b173c07c09c9d6817 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 17 Jul 2025 15:54:10 +0100 Subject: [PATCH 77/79] CU-8699049kf: Add a few more comments regarding get_entities_multi_texts output --- medcat/3_run_model/run_model.ipynb | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/medcat/3_run_model/run_model.ipynb b/medcat/3_run_model/run_model.ipynb index 18352b0..d4e96d4 100755 --- a/medcat/3_run_model/run_model.ipynb +++ b/medcat/3_run_model/run_model.ipynb @@ -156,13 +156,18 @@ "source": [ "batch_char_size = 50000 # Batch size (BS) in number of characters\n", "for text_id, text in data_iterator(df, doc_id_column, doc_text_column):\n", - " cat.get_entities(text,\n", + " # NOTE: get_entities_multi_text returns an iterator\n", + " # so no work gets done until the iterator use materialised\n", + " output = cat.get_entities_multi_texts(text,\n", " only_cui=False,\n", " # nproc=8, # Number of processors\n", " # out_split_size_chars=20*batch_char_size,\n", " # save_dir_path=ann_folder_path,\n", " # min_free_memory=0.1,\n", " )\n", + " # so if we're doing a small amount of data and/or not saving it on disk\n", + " # we probably want to just convert it to a list\n", + " output = list(output)\n", "\n", "medcat_logger.warning(f'Annotation process complete!')\n" ] @@ -321,7 +326,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "venv_v2", "language": "python", "name": "python3" }, @@ -335,12 +340,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" - }, - "vscode": { - "interpreter": { - "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" - } + "version": "3.10.13" } }, "nbformat": 4, From ff8eed5cf5d9301895f047f18c05bf6e3497aae7 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 17 Jul 2025 16:28:01 +0100 Subject: [PATCH 78/79] CU-8699049kf: Allow saving multiproccessing results. 
Also add comment regarding materialising the output without keeping it all in memory --- medcat/3_run_model/run_model.ipynb | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/medcat/3_run_model/run_model.ipynb b/medcat/3_run_model/run_model.ipynb index d4e96d4..11cb7b0 100755 --- a/medcat/3_run_model/run_model.ipynb +++ b/medcat/3_run_model/run_model.ipynb @@ -156,18 +156,25 @@ "source": [ "batch_char_size = 50000 # Batch size (BS) in number of characters\n", "for text_id, text in data_iterator(df, doc_id_column, doc_text_column):\n", - " # NOTE: get_entities_multi_text returns an iterator\n", - " # so no work gets done until the iterator use materialised\n", + " # NOTE: get_entities_multi_text returns an generator\n", + " # so no work gets done until the generator use materialised\n", " output = cat.get_entities_multi_texts(text,\n", " only_cui=False,\n", " # nproc=8, # Number of processors\n", " # out_split_size_chars=20*batch_char_size,\n", - " # save_dir_path=ann_folder_path,\n", + " save_dir_path=ann_folder_path,\n", " # min_free_memory=0.1,\n", " )\n", " # so if we're doing a small amount of data and/or not saving it on disk\n", " # we probably want to just convert it to a list\n", " output = list(output)\n", + " # However, if we we're saving the data on disk and don't\n", + " # want to duplicate in memory (i.e there's a lot of data\n", + " # and it can't all be held in memory), we may want to\n", + " # just exhaust the generator\n", + " # NOTE: uncomment to use, but commnet the `ouput = list(ouput)`` line\n", + " # for _ in output:\n", + " # pass\n", "\n", "medcat_logger.warning(f'Annotation process complete!')\n" ] From e68d91b00d4c3be7861a6873e91a5232aed616c7 Mon Sep 17 00:00:00 2001 From: mart-r Date: Sat, 19 Jul 2025 08:50:43 +0100 Subject: [PATCH 79/79] CU-8699049kf: Update dependency to latest release --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8824952..a7cc1f5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ spacy>=3.8.0,<4.0 -medcat[meta-cat,spacy,deid,rel-cat]~=2.0.0b3 +medcat[meta-cat,spacy,deid,rel-cat]~=2.0.0b4 plotly~=5.19.0 # eland~=8.18.1 # NOTE: there is no numpy2-compatible eland release as of 2025-05-13 en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl
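For quick reference, the sketch below collects the v2-style calls that the patches in this series migrate the scripts and notebooks to: loading a model pack, inspecting the CDB, setting a CUI filter, consuming the get_entities_multi_texts generator, unsupervised training via the trainer, and saving with target_folder/pack_name. It is a minimal illustration, not part of any commit; the model pack path, output folder, pack name and the sample text are placeholders, and the training/saving lines are left commented because they assume a pandas DataFrame `df` with a `text` column.

from medcat.cat import CAT

# Load a MedCAT v2 model pack (placeholder path).
cat = CAT.load_model_pack("models/modelpack/example_modelpack.zip")

# Basic CDB statistics (replaces the v1 cdb.print_stats()).
print(cat.cdb.get_basic_info())

# Restrict linking to a set of CUIs, as done in run_model.py above.
cui_filter = set(cat.cdb.cui2info.keys())  # or a SNOMED subset of interest
cat.config.components.linking.filters.cuis = cui_filter

# get_entities_multi_texts returns a generator, so materialise it here
# (or simply exhaust it when save_dir_path is used to write results to disk).
texts = ["He was diagnosed with heart failure"]
output = list(cat.get_entities_multi_texts(texts, only_cui=False))
print(output)

# Unsupervised training now lives on the trainer, and saving uses the
# target_folder / pack_name keyword arguments (assumes `df` holds the notes).
# cat.trainer.train_unsupervised(df.text.values, progress_print=100)
# cat.save_model_pack(target_folder="models/modelpack", pack_name="retrained_model")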