diff --git a/.github/workflows/medcat-embedding-linker_ci.yml b/.github/workflows/medcat-embedding-linker_ci.yml new file mode 100644 index 000000000..2d23f3909 --- /dev/null +++ b/.github/workflows/medcat-embedding-linker_ci.yml @@ -0,0 +1,109 @@ +name: medcat-embedding-linker - CI (test | publish) + +on: + push: + branches: [ main ] + tags: + - 'medcat-embedding-linker/v*.*.*' + pull_request: + paths: + - 'medcat-plugins/embedding-linker/**' # plugin sources; must match working-directory below + - '.github/workflows/medcat-embedding-linker**' + +permissions: + id-token: write + +defaults: + run: + working-directory: ./medcat-plugins/embedding-linker + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [ '3.10', '3.11', '3.12' ] + max-parallel: 4 + steps: + - uses: actions/checkout@v6 + - name: Install uv for Python ${{ matrix.python-version }} + uses: astral-sh/setup-uv@v7 + with: + python-version: ${{ matrix.python-version }} + enable-cache: true + - name: Install the project + run: | + uv sync --all-extras --dev + uv run python -m ensurepip + uv run python -m pip install --upgrade pip + - name: Check types + run: | + uv run python -m mypy --follow-imports=normal src/medcat_embedding_linker + - name: Ruff linting + run: | + uv run ruff check src/medcat_embedding_linker --preview + - name: Test + run: | + uv run python -m unittest discover + + publish-to-test-PyPI: + runs-on: ubuntu-latest + needs: build + steps: + - name: Checkout main + uses: actions/checkout@v6 + with: + fetch-depth: 0 # fetch all history + fetch-tags: true # fetch tags explicitly + + - name: Install uv for Python 3.10 + uses: astral-sh/setup-uv@v7 + with: + python-version: '3.10' + enable-cache: true + + - name: Install dependencies + run: | + uv run python -m ensurepip + + - name: Set timestamp-based dev version + run: | + TS=$(date -u +"%Y%m%d%H%M%S") + echo "SETUPTOOLS_SCM_PRETEND_VERSION_FOR_MEDCAT_EMBEDDING_LINKER=0.2.2.dev${TS}" >> $GITHUB_ENV + + - name: Build package + run: | + uv build + + - name: Publish 
distribution to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ + packages-dir: medcat-plugins/embedding-linker/dist + + publish-to-PyPI: + runs-on: ubuntu-latest + if: startsWith(github.ref, 'refs/tags/') + needs: build + steps: + - name: Checkout main + uses: actions/checkout@v6 + + - name: Install uv for Python 3.10 + uses: astral-sh/setup-uv@v7 + with: + python-version: '3.10' + enable-cache: true + + - name: Install dependencies + run: | + uv run python -m ensurepip + + - name: Build client package + run: | + uv build + + - name: Publish production distribution to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages-dir: medcat-plugins/embedding-linker/dist diff --git a/medcat-plugins/embedding-linker/README.md b/medcat-plugins/embedding-linker/README.md new file mode 100644 index 000000000..4a812a645 --- /dev/null +++ b/medcat-plugins/embedding-linker/README.md @@ -0,0 +1,192 @@ +# MedCAT Embedding Linker + +A MedCAT plugin that provides an embedding-based entity linking component using transformer models from HuggingFace. + +## Overview + +This plugin replaces MedCAT's default linking component with a transformer-based approach that uses semantic similarity between entity contexts and concept embeddings to perform entity disambiguation. 
+ +**Key features:** +- Semantic similarity-based linking using transformer embeddings +- Support for any HuggingFace sentence-transformer model +- Efficient batch processing with GPU acceleration +- Configurable similarity thresholds and context windows +- CUI-based filtering (include/exclude lists) + +## Requirements + +- **MedCAT**: 2.5+ ([PyPI](https://pypi.org/project/medcat/) | [GitHub](https://github.com/CogStack/MedCAT)) +- Python 3.10+ +- PyTorch +- Transformers + +## Installation + +```bash +pip install medcat-embedding-linker +``` + +## Quick Start + +```python +from medcat.cat import CAT +from medcat.config import Config +from medcat.components.types import CoreComponentType + +from medcat_embedding_linker.config import EmbeddingLinking + +# Load your MedCAT model +cat = CAT.load_model_pack("path/to/model_pack") + +# Configure the embedding linker +cat.config.components.linking = EmbeddingLinking() +cat.config.components.linking.embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2" + +# Recreate the pipeline to register the new linker +cat._recreate_pipe() + +# Generate embeddings for your concept database +linker = cat.pipe.get_component(CoreComponentType.linking) +# create +linker.create_embeddings() + +# Use as normal +entities = cat.get_entities("Patient presents with chest pain and dyspnea.") +``` + +## How It Works + +### Component Registration + +The embedding linker automatically registers itself as `embedding_linker` when `EmbeddingLinking` config is detected. It implements MedCAT's `AbstractEntityProvidingComponent` interface and is lazily loaded when the pipeline is created. + +### Embedding Generation + +The linker operates on two types of embeddings: + +**1. Concept Embeddings** (pre-computed) +- Each CUI is represented by its longest name's embedding +- Stored in `cdb.addl_info["cui_embeddings"]` +- Used for final disambiguation between candidate CUIs + +**2. 
Name Embeddings** (pre-computed) +- Each concept name in the CDB gets its own embedding +- Stored in `cdb.addl_info["name_embeddings"]` +- Used for initial candidate retrieval + +Both are generated via `linker.create_embeddings()` and cached for inference. + +### Inference Process + +For each detected entity: + +1. **Context Vector Calculation**: Extract a text snippet around the entity (size controlled by `context_window_size`) and embed it +2. **Candidate Retrieval**: Compare context embedding against all name embeddings to find top matches above `short_similarity_threshold` +3. **Disambiguation**: If multiple CUIs are associated with the best-matching name, compare against CUI embeddings to select the final concept +4. **Filtering**: Apply CUI include/exclude filters and check against `long_similarity_threshold` + +## Configuration + +### Key Parameters + +```python +config.components.linking = EmbeddingLinking( + # Model settings + embedding_model_name="sentence-transformers/all-MiniLM-L6-v2", + max_token_length=128, + + # Context settings + context_window_size=10, # tokens on each side of entity + + # Similarity thresholds + short_similarity_threshold=0.3, # for candidate retrieval + long_similarity_threshold=0.5, # for final linking + + # Batch sizes + embedding_batch_size=4096, + linking_batch_size=512, + + # Filtering + filters=Filters( + cuis={"C0018802", "C0011849"}, # include only these + cuis_exclude={"C0000001"} # or exclude these + ), + + # Advanced options + use_ner_link_candidates=True, + always_calculate_similarity=False, + filter_before_disamb=True, + gpu_device="cuda:0" # or None for auto-detect +) +``` + +### Embedding Models + +Any HuggingFace model compatible with sentence transformers will work. 
Popular options: + +- `sentence-transformers/all-MiniLM-L6-v2` (default, fast and lightweight) +- `sentence-transformers/all-mpnet-base-v2` (higher quality) +- `UFNLP/gatortron-medium` (biomedical domain) +- `microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext` + +## Advanced Usage + +### Re-generating Embeddings + +If you modify your CDB or want to try a different model: + +```python +linker = cat.pipe.get_component(CoreComponentType.linking) +linker.create_embeddings( + embedding_model_name="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext", + max_length=256 +) +``` + +### GPU Configuration + +```python +# Use specific GPU +cat.config.components.linking.gpu_device = "cuda:1" + +# Force CPU +cat.config.components.linking.gpu_device = "cpu" +``` + +### Filtering + +```python +# Include only specific CUIs +cat.config.components.linking.filters.cuis = {"C0011849", "C0018802"} + +# Exclude specific CUIs +cat.config.components.linking.filters.cuis_exclude = {"C0000001"} + +# Note: If both are set, only include filters are applied +``` + +## Performance Considerations + +- **First-time embedding generation**: Can take several minutes for large CDBs (millions of concepts) +- **GPU recommended**: 10-50x faster inference with CUDA +- **Batch sizes**: Increase if you have GPU memory available +- **Model selection**: Smaller models (e.g., MiniLM) are faster but may be less accurate than larger domain-specific models + +## Limitations + +- Does not support `prefer_frequent_concepts` or `prefer_primary_name` from the default linker (logs warnings if set) +- Training mode is not applicable (logs warning if enabled) +- Requires pre-computed embeddings before inference + +## Citation + +If you use this plugin, please cite MedCAT: + +```bibtex +@article{medcat2021, + title={Medical Concept Annotation Tool (MedCAT)}, + author={Kraljevic, Zeljko and others}, + journal={arXiv preprint arXiv:2010.01165}, + year={2021} +} +``` diff --git 
a/medcat-plugins/embedding-linker/pyproject.toml b/medcat-plugins/embedding-linker/pyproject.toml new file mode 100644 index 000000000..be803b4a9 --- /dev/null +++ b/medcat-plugins/embedding-linker/pyproject.toml @@ -0,0 +1,117 @@ +[project] +name = "medcat-embedding-linker" + +dynamic = ["version"] + +description = "Embeddings based linker for MedCAT" + +readme = "README.md" + +requires-python = ">=3.10" + +license = {text = "Apache-2.0"} + +keywords = ["ML", "NLP", "NER+L"] + +authors = [ + {name = "A. Sutton"}, + {name = "T. Searle"}, + {name = "M. Ratas"}, +] + +# This should be your name or the names of the organization who currently +# maintains the project, and a valid email address corresponding to the name +# listed. +maintainers = [ + {name = "CogStack", email = "contact@cogstack.org" } +] + +classifiers = [ + # How mature is this project? Common values are + # 3 - Alpha + # 4 - Beta + # 5 - Production/Stable + "Development Status :: 5 - Production/Stable", + + "Intended Audience :: Healthcare Industry", + # "Topic :: Natural Language Processing :: Named Entity Recognition and Linking", + + # Specify the Python versions you support here. In particular, ensure + # that you indicate you support Python 3. These classifiers are *not* + # checked by "pip install". See instead "python_requires" below. + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3 :: Only", + "Operating System :: OS Independent", +] + +# This field lists other packages that your project depends on to run. +# Any package you put here will be installed by pip when your project is +# installed, so they must be valid existing projects. 
+# +# For an analysis of this field vs pip's requirements files see: +# https://packaging.python.org/discussions/install-requires-vs-requirements/ +dependencies = [ + "medcat[spacy]>=2.5", + "transformers>=4.41.0,<5.0", # avoid major bump + "torch>=2.4.0,<3.0", + "tqdm", +] + +# List additional groups of dependencies here (e.g. development +# dependencies). Users will be able to install these using the "extras" +# syntax, for example: +# +# $ pip install medcat-embedding-linker[dev] +# +# Similar to `dependencies` above, these must be valid existing +# projects. +[project.optional-dependencies] # Optional +dev = [ + "ruff~=0.1.7", + "mypy", + "types-tqdm", + "types-setuptools", + "types-PyYAML", +] + +# entry-points to add onto medcat +[project.entry-points."medcat.plugins"] +medcat_embedding_linker = "medcat_embedding_linker" + +[project.urls] +"Homepage" = "https://cogstack.org/" +"Bug Reports" = "https://discourse.cogstack.org/" +"Source" = "https://github.com/CogStack/cogstack-nlp/tree/main/medcat-plugins/embedding-linker" + +[build-system] +# These are the assumed default build requirements from pip: +# https://pip.pypa.io/en/stable/reference/pip/#pep-517-and-518-support +requires = ["setuptools>=43.0.0", "setuptools_scm>=8", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.setuptools] +package-dir = {"" = "src"} + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +"medcat_embedding_linker" = ["py.typed"] + +[tool.setuptools_scm] +# look for .git folder in root of repo +root = "../.." +version_scheme = "post-release" +local_scheme = "no-local-version" +tag_regex = "^medcat-embedding-linker/v(?P<version>\\d+(?:\\.\\d+)*)(?:[ab]\\d+|rc\\d+)?$" +git_describe_command = "git describe --dirty --tags --long --match 'medcat-embedding-linker/v*'" + +[tool.ruff.lint] +# 1. 
Enable some extra checks for ruff +select = ["E", "F"] +# ignore unused local variables +ignore = ["F841"] diff --git a/medcat-plugins/embedding-linker/src/medcat_embedding_linker/__init__.py b/medcat-plugins/embedding-linker/src/medcat_embedding_linker/__init__.py new file mode 100644 index 000000000..1f1d2b174 --- /dev/null +++ b/medcat-plugins/embedding-linker/src/medcat_embedding_linker/__init__.py @@ -0,0 +1,3 @@ +from .registration import do_registration as __register + +__register() diff --git a/medcat-plugins/embedding-linker/src/medcat_embedding_linker/config.py b/medcat-plugins/embedding-linker/src/medcat_embedding_linker/config.py new file mode 100644 index 000000000..d5665a1f3 --- /dev/null +++ b/medcat-plugins/embedding-linker/src/medcat_embedding_linker/config.py @@ -0,0 +1,48 @@ +from typing import Optional, Any + +from medcat.config import Linking + + +class EmbeddingLinking(Linking): + """The config exclusively used for the embedding linker""" + comp_name: str = "embedding_linker" + """Changing component name""" + filter_before_disamb: bool = False + """Filtering CUIs before disambiguation""" + train: bool = False + """The embedding linker never needs to be trained in its + current implementation.""" + long_similarity_threshold: float = 0.0 + """Used in the inference step to choose the best CUI given the + link candidates. Testing shows a threshold of 0.7 increases precision + with minimal impact on recall. Default is 0.0 which assumes + all entities detected by the NER step are true.""" + short_similarity_threshold: float = 0.0 + """Used for generating cui candidates. If a threshold of 0.0 + is selected then only the highest scoring name will provide cuis + to be link candidates. Use a threshold of 0.95 or higher, as this is + essentially string matching and accounts for spelling errors. 
Lower + thresholds will provide too many candidates and slow down the inference.""" + embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2" + """Name of the embedding model. It must be downloadable from + huggingface linked from an appropriate file directory""" + max_token_length: int = 64 + """Max number of tokens to be embedded from a name. + If the max token length is changed then the linker will need to be created + with a new config. + """ + embedding_batch_size: int = 4096 + """How many pieces names can be embedded at once, useful when + embedding name2info names, cui2info names""" + linking_batch_size: int = 512 + """How many entities to be linked at once""" + gpu_device: Optional[Any] = None + """Choose a device for the linking model to be stored. If None + then an appropriate GPU device that is available will be chosen""" + context_window_size: int = 14 + """Choose the window size to get context vectors.""" + use_ner_link_candidates: bool = True + """Link candidates are provided by some NER steps. 
This will flag if + you want to trust them or not.""" + use_similarity_threshold: bool = True + """Do we have a similarity threshold we care about?""" diff --git a/medcat-v2/medcat/components/linking/embedding_linker.py b/medcat-plugins/embedding-linker/src/medcat_embedding_linker/embedding_linker.py similarity index 98% rename from medcat-v2/medcat/components/linking/embedding_linker.py rename to medcat-plugins/embedding-linker/src/medcat_embedding_linker/embedding_linker.py index c0e8b594c..6bb1633c4 100644 --- a/medcat-v2/medcat/components/linking/embedding_linker.py +++ b/medcat-plugins/embedding-linker/src/medcat_embedding_linker/embedding_linker.py @@ -1,5 +1,5 @@ from medcat.cdb import CDB -from medcat.config.config import Config, ComponentConfig, EmbeddingLinking +from medcat.config.config import Config, ComponentConfig from medcat.components.types import CoreComponentType from medcat.components.types import AbstractEntityProvidingComponent from medcat.tokenizing.tokens import MutableEntity, MutableDocument @@ -13,18 +13,12 @@ import math import numpy as np -from medcat.utils.import_utils import ensure_optional_extras_installed -import medcat +from medcat_embedding_linker.config import EmbeddingLinking -# NOTE: the below needs to be before torch/transformers imports -_EXTRA_NAME = "embed-linker" -ensure_optional_extras_installed(medcat.__name__, _EXTRA_NAME) - -# avoid linting issues due to above check -from torch import Tensor # noqa: E402 -from transformers import AutoTokenizer, AutoModel # noqa: E402 -import torch.nn.functional as F # noqa: E402 -import torch # noqa: E402 +from torch import Tensor +from transformers import AutoTokenizer, AutoModel +import torch.nn.functional as F +import torch logger = logging.getLogger(__name__) diff --git a/medcat-plugins/embedding-linker/src/medcat_embedding_linker/registration.py b/medcat-plugins/embedding-linker/src/medcat_embedding_linker/registration.py new file mode 100644 index 000000000..dc4fac3cb --- /dev/null 
+++ b/medcat-plugins/embedding-linker/src/medcat_embedding_linker/registration.py @@ -0,0 +1,14 @@ + +import logging + +from medcat.components.types import CoreComponentType +from medcat.components.types import lazy_register_core_component + + +logger = logging.getLogger(__name__) + + +def do_registration(): + lazy_register_core_component( + CoreComponentType.linking, "embedding_linker", + "medcat_embedding_linker.embedding_linker", "Linker.create_new_component") diff --git a/medcat-plugins/embedding-linker/tests/__init__.py b/medcat-plugins/embedding-linker/tests/__init__.py new file mode 100644 index 000000000..824839547 --- /dev/null +++ b/medcat-plugins/embedding-linker/tests/__init__.py @@ -0,0 +1,25 @@ +# NOTE: mostly copied from medcat tests +import atexit +import os +import shutil + + +RESOURCES_PATH = os.path.join(os.path.dirname(__file__), "resources") +EXAMPLE_MODEL_PACK_ZIP = os.path.join(RESOURCES_PATH, "mct2_model_pack.zip") +UNPACKED_EXAMPLE_MODEL_PACK_PATH = os.path.join( + RESOURCES_PATH, "mct2_model_pack") + + +# unpack model pack at start so we can access stuff like Vocab +print("Unpacking included test model pack") +shutil.unpack_archive( + EXAMPLE_MODEL_PACK_ZIP, UNPACKED_EXAMPLE_MODEL_PACK_PATH) + + +def _del_unpacked_model(): + print("Cleaning up! 
Removing unpacked exmaple model pack:", + UNPACKED_EXAMPLE_MODEL_PACK_PATH) + shutil.rmtree(UNPACKED_EXAMPLE_MODEL_PACK_PATH) + + +atexit.register(_del_unpacked_model) diff --git a/medcat-plugins/embedding-linker/tests/helper.py b/medcat-plugins/embedding-linker/tests/helper.py new file mode 100644 index 000000000..9513f3c43 --- /dev/null +++ b/medcat-plugins/embedding-linker/tests/helper.py @@ -0,0 +1,76 @@ +from typing import runtime_checkable, Type, Callable + +from medcat.components import types +from medcat.config.config import Config, ComponentConfig + + +class FakeCDB: + + def __init__(self, cnf: Config): + self.config = cnf + self.token_counts = {} + self.cui2info = {} + self.name2info = {} + + def weighted_average_function(self, v: int) -> float: + return v * 0.5 + + +class FVocab: + pass + + +class FTokenizer: + pass + + +class ComponentInitTests: + expected_def_components = 1 + default = 'default' + # these need to be specified when overriding + comp_type: types.CoreComponentType + default_cls: Type[types.BaseComponent] + default_creator: Callable[..., types.BaseComponent] + + @classmethod + def setUpClass(cls): + cls.cnf = Config() + cls.fcdb = FakeCDB(cls.cnf) + cls.fvocab = FVocab() + cls.vtokenizer = FTokenizer() + cls.comp_cnf: ComponentConfig = getattr( + cls.cnf.components, cls.comp_type.name) + if isinstance(cls.default_creator, Type): + cls._def_creator_name_opts = (cls.default_creator.__name__,) + else: + # classmethod + cls._def_creator_name_opts = (".".join(( + # etiher class.method_name + cls.default_creator.__self__.__name__, + cls.default_creator.__name__)), + # or just method_name + cls.default_creator.__name__ + ) + + def test_has_default(self): + avail_components = types.get_registered_components(self.comp_type) + self.assertEqual(len(avail_components), self.expected_def_components) + name, cls_name = avail_components[0] + # 1 name / cls name + eq_name = [name == self.default for name, _ in avail_components] + eq_cls = [cls_name in 
self._def_creator_name_opts + for _, cls_name in avail_components] + self.assertEqual(sum(eq_name), 1) + # NOTE: for NER both the default as well as the Dict based NER + # have the came class name, so may be more than 1 + self.assertGreaterEqual(sum(eq_cls), 1) + # needs to have the same class where name is equal + self.assertTrue(eq_cls[eq_name.index(True)]) + + def test_can_create_def_component(self): + component = types.create_core_component( + self.comp_type, + self.default, self.cnf, self.vtokenizer, self.fcdb, self.fvocab, None) + self.assertIsInstance(component, + runtime_checkable(types.BaseComponent)) + self.assertIsInstance(component, self.default_cls) diff --git a/medcat-plugins/embedding-linker/tests/resources/mct2_model_pack.zip b/medcat-plugins/embedding-linker/tests/resources/mct2_model_pack.zip new file mode 100644 index 000000000..b6bc74e49 Binary files /dev/null and b/medcat-plugins/embedding-linker/tests/resources/mct2_model_pack.zip differ diff --git a/medcat-v2/tests/components/linking/test_embedding_linker.py b/medcat-plugins/embedding-linker/tests/test_embedding_linker.py similarity index 90% rename from medcat-v2/tests/components/linking/test_embedding_linker.py rename to medcat-plugins/embedding-linker/tests/test_embedding_linker.py index 658ecc52f..e7923426f 100644 --- a/medcat-v2/tests/components/linking/test_embedding_linker.py +++ b/medcat-plugins/embedding-linker/tests/test_embedding_linker.py @@ -1,4 +1,4 @@ -from medcat.components.linking import embedding_linker +from medcat_embedding_linker import embedding_linker from medcat.components import types from medcat.config import Config from medcat.data.entities import Entity @@ -8,9 +8,9 @@ from medcat.components.types import TrainableComponent from medcat.components.types import _DEFAULT_LINKING as DEF_LINKING import unittest -from ..helper import ComponentInitTests +from .helper import ComponentInitTests -from ... import UNPACKED_EXAMPLE_MODEL_PACK_PATH +from . 
import UNPACKED_EXAMPLE_MODEL_PACK_PATH class FakeDocument: linked_ents = [] @@ -37,7 +37,7 @@ def weighted_average_function(self, nr: int) -> float: class EmbeddingLinkerInitTests(ComponentInitTests, unittest.TestCase): expected_def_components = len(DEF_LINKING) comp_type = types.CoreComponentType.linking - default = 'medcat2_embedding_linker' + default = 'embedding_linker' default_cls = embedding_linker.Linker default_creator = embedding_linker.Linker.create_new_component module = embedding_linker @@ -55,7 +55,7 @@ def setUpClass(cls): def test_has_default(self): avail_components = types.get_registered_components(self.comp_type) registered_names = [name for name, _ in avail_components] - self.assertIn("medcat2_embedding_linker", registered_names) + self.assertIn("embedding_linker", registered_names) class NonTrainableEmbeddingLinkerTests(unittest.TestCase): cnf = Config() @@ -83,6 +83,10 @@ def setUpClass(cls) -> None: linker: embedding_linker.Linker = cls.model.pipe.get_component( types.CoreComponentType.linking) linker.create_embeddings() + cls.linker = linker + + def test_is_correct_linker(self): + self.assertIsInstance(self.linker, embedding_linker.Linker) def assert_has_name(self, out_ents: dict[int, Entity], name: str): self.assertTrue( diff --git a/medcat-v2/medcat/components/types.py b/medcat-v2/medcat/components/types.py index f57bd4ec3..e883d4a20 100644 --- a/medcat-v2/medcat/components/types.py +++ b/medcat-v2/medcat/components/types.py @@ -214,9 +214,6 @@ def train(self, cui: str, "medcat2_two_step_linker": ( "medcat.components.linking.two_step_context_based_linker", "TwoStepLinker.create_new_component"), - "medcat2_embedding_linker": ( - "medcat.components.linking.embedding_linker", - "Linker.create_new_component"), # primary name only "primary_name_only_linker": ( "medcat.components.linking.only_primary_name_linker", diff --git a/medcat-v2/medcat/config/config.py b/medcat-v2/medcat/config/config.py index a8bd8aa08..b3306abb4 100644 --- 
a/medcat-v2/medcat/config/config.py +++ b/medcat-v2/medcat/config/config.py @@ -423,49 +423,6 @@ class Linking(ComponentConfig): extra='allow', ) -class EmbeddingLinking(Linking): - """The config exclusively used for the embedding linker""" - comp_name: str = "medcat2_embedding_linker" - """Changing compoenent name""" - filter_before_disamb: bool = False - """Filtering CUIs before disambiguation""" - train: bool = False - """The embedding linker never needs to be trained in its - current implementation.""" - long_similarity_threshold: float = 0.0 - """Used in the inference step to choose the best CUI given the - link candidates. Testing shows a threshold of 0.7 increases precision - with minimal impact on recall. Default is 0.0 which assumes - all entities detected by the NER step are true.""" - short_similarity_threshold: float = 0.0 - """Used for generating cui candidates. If a threshold of 0.0 - is selected then only the highest scoring name will provide cuis - to be link candidates. Use a threshold of 0.95 or higher, as this is - essentailly string matching and account for spelling errors. Lower - thresholds will provide too many candidates and slow down the inference.""" - embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2" - """Name of the embedding model. It must be downloadable from - huggingface linked from an appropriate file directory""" - max_token_length: int = 64 - """Max number of tokens to be embedded from a name. - If the max token length is changed then the linker will need to be created - with a new config. - """ - embedding_batch_size: int = 4096 - """How many pieces names can be embedded at once, useful when - embedding name2info names, cui2info names""" - linking_batch_size: int = 512 - """How many entities to be linked at once""" - gpu_device: Optional[Any] = None - """Choose a device for the linking model to be stored. 
If None - then an appropriate GPU device that is available will be chosen""" - context_window_size: int = 14 - """Choose the window size to get context vectors.""" - use_ner_link_candidates: bool = True - """Link candidates are provided by some NER steps. This will flag if - you want to trust them or not.""" - use_similarity_threshold: bool = True - """Do we have a similarity threshold we care about?""" class Preprocessing(SerialisableBaseModel): """The preprocessing part of the config""" diff --git a/medcat-v2/medcat/plugins/data/plugin_catalog.json b/medcat-v2/medcat/plugins/data/plugin_catalog.json index 031f9172c..eb1d1e46b 100644 --- a/medcat-v2/medcat/plugins/data/plugin_catalog.json +++ b/medcat-v2/medcat/plugins/data/plugin_catalog.json @@ -19,6 +19,23 @@ } ], "requires_auth": true + }, + "medcat-embedding-linker": { + "name": "medcat-embedding-linker", + "display_name": "MedCAT-embedding-linker", + "description": "Embedding based linker for MedCAT", + "source_spec": { + "source": "medcat-embedding-linker", + "source_type": "pypi" + }, + "homepage": "https://github.com/CogStack/cogstack-nlp/tree/main/medcat-plugins/embedding-linker", + "compatibility": [ + { + "medcat_version": ">=2.5.0,<3.0.0", + "plugin_version": "0.1.0" + } + ], + "requires_auth": false } } } diff --git a/medcat-v2/pyproject.toml b/medcat-v2/pyproject.toml index a46e0076c..26179a9b6 100644 --- a/medcat-v2/pyproject.toml +++ b/medcat-v2/pyproject.toml @@ -142,6 +142,7 @@ include = ["medcat*"] [tool.setuptools.package-data] "medcat.plugins.data" = ["plugin_catalog.json"] +"medcat" = ["py.typed"] [tool.setuptools_scm] # look for .git folder in root of repo