shcherbak-ai
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 4 additions & 0 deletions
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
Lines changed: 2 additions & 2 deletions
diff --git a/NOTICE b/NOTICE
Lines changed: 1 addition & 1 deletion
diff --git a/README.md b/README.md
Lines changed: 31 additions & 28 deletions
diff --git a/contextgem/__init__.py b/contextgem/__init__.py
Lines changed: 3 additions & 1 deletion
diff --git a/contextgem/internal/__init__.py b/contextgem/internal/__init__.py
Lines changed: 4 additions & 0 deletions
diff --git a/contextgem/internal/base/llms.py b/contextgem/internal/base/llms.py
Lines changed: 16 additions & 4 deletions
diff --git a/contextgem/internal/items.py b/contextgem/internal/items.py
Lines changed: 29 additions & 0 deletions
diff --git a/contextgem/internal/llm_output_structs/__init__.py b/contextgem/internal/llm_output_structs/__init__.py
Lines changed: 2 additions & 0 deletions
@@ -41,7 +41,7 @@ repos:
       - id: export-requirements
         name: Export requirements files
         entry: python
-        args: ["-c", "import subprocess; subprocess.run(['poetry', 'export', '-f', 'requirements.txt', '--output', 'dev/requirements/requirements.main.txt', '--without-hashes']); subprocess.run(['poetry', 'export', '-f', 'requirements.txt', '--output', 'dev/requirements/requirements.dev.txt', '--with', 'dev', '--without-hashes'])"]
+        args: ["-c", "import subprocess; subprocess.run(['poetry', 'export', '-f', 'requirements.txt', '--output', 'dev/requirements/requirements.main.txt']); subprocess.run(['poetry', 'export', '-f', 'requirements.txt', '--output', 'dev/requirements/requirements.dev.txt', '--with', 'dev'])"]
         language: python
         pass_filenames: false
         always_run: true
 
@@ -5,6 +5,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 - **Refactor**: Code reorganization that doesn't change functionality but improves structure or maintainability
 
+## [0.6.0](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.6.0) - 2025-06-03
+### Added
+- LabelConcept - a classification concept type that categorizes content using predefined labels.
+
 ## [0.5.0](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.5.0) - 2025-05-29
 ### Fixed
 - Params handling for reasoning (CoT-capable) models other than OpenAI o-series. Enabled automatic retry of LLM calls, dropping unsupported params if any were set for the model. Improved handling and validation of LLM call params.
 
@@ -104,7 +104,7 @@ To sign the agreement:
    pytest
    ```
 
-   Please note that we use pytest-vcr to record and replay LLM API interactions. Your changes may require re-recording VCR cassettes for the tests. See [VCR Cassette Management](#vcr-cassette-management) section below for details.
+   Please note that we use [pytest-recording](https://github.com/kiwicom/pytest-recording) to record and replay LLM API interactions. Your changes may require re-recording VCR cassettes for the tests. See the [VCR Cassette Management](#vcr-cassette-management) section below for details.
 
 4. **Commit your changes** using Conventional Commits format:
 
@@ -171,7 +171,7 @@ By submitting issues or feature requests to this project, you acknowledge that t
 
 ### VCR Cassette Management
 
-We use pytest-vcr to record and replay HTTP interactions with LLM APIs. This allows tests to run without making actual API calls after the initial recording.
+We use [pytest-recording](https://github.com/kiwicom/pytest-recording) to record and replay HTTP interactions with LLM APIs. This allows tests to run without making actual API calls after the initial recording.
 
 #### When to Re-record Cassettes
 
 
@@ -42,7 +42,7 @@ Development Dependencies:
 - pre-commit: Pre-commit hooks
 - pytest: Testing framework
 - pytest-cov: Coverage plugin for pytest
-- pytest-vcr: Recording HTTP interactions for tests
+- pytest-recording: Recording HTTP interactions for tests
 - python-dotenv: Environment variable management
 - sphinx: Documentation generator
 - sphinx-autodoc-typehints: Type annotation support for Sphinx
 
@@ -17,20 +17,23 @@
 [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-blue?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
 [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg)](CODE_OF_CONDUCT.md)
 [![DeepWiki](https://img.shields.io/static/v1?label=DeepWiki&message=Chat%20with%20Code&labelColor=%23283593&color=%237E57C2&style=flat-square)](https://deepwiki.com/shcherbak-ai/contextgem)
+[![GitHub latest commit](https://img.shields.io/github/last-commit/shcherbak-ai/contextgem?label=latest%20commit)](https://github.com/shcherbak-ai/contextgem/commits/main)
 
 <img src="https://contextgem.dev/_static/tab_solid.png" alt="ContextGem: 2nd Product of the week" width="250">
 <br/><br/>
 
 ContextGem is a free, open-source LLM framework that makes it radically easier to extract structured data and insights from documents — with minimal code.
 
+---
+
 
 ## 💎 Why ContextGem?
 
 Most popular LLM frameworks for extracting structured data from documents require extensive boilerplate code to extract even basic information. This significantly increases development time and complexity.
 
 ContextGem addresses this challenge by providing a flexible, intuitive framework that extracts structured data and insights from documents with minimal effort. The most complex and time-consuming parts are handled with **powerful abstractions**, eliminating boilerplate code and reducing development overhead.
 
-Read more on the project [motivation](https://contextgem.dev/motivation.html) in the documentation.
+📖 Read more on the project [motivation](https://contextgem.dev/motivation.html) in the documentation.
 
 
 ## ⭐ Key features
@@ -158,8 +161,9 @@ Read more on the project [motivation](https://contextgem.dev/motivation.html) in
 
 \* See [descriptions](https://contextgem.dev/motivation.html#the-contextgem-solution) of ContextGem abstractions and [comparisons](https://contextgem.dev/vs_other_frameworks.html) of specific implementation examples using ContextGem and other popular open-source LLM frameworks.
 
+## 💡 What you can build
 
-## 💡 With **minimal code**, you can:
+With **minimal code**, you can:
 
 - **Extract structured data** from documents (text, images)
 - **Identify and analyze key aspects** (topics, themes, categories) within documents ([learn more](https://contextgem.dev/aspects/aspects.html))
@@ -253,17 +257,17 @@ for item in anomalies_concept.extracted_items:
 
 ---
 
-See more examples in the documentation:
+### 📚 More Examples
 
-### Basic usage examples
+**Basic usage:**
 - [Aspect Extraction from Document](https://contextgem.dev/quickstart.html#aspect-extraction-from-document)
 - [Extracting Aspect with Sub-Aspects](https://contextgem.dev/quickstart.html#extracting-aspect-with-sub-aspects)
 - [Concept Extraction from Aspect](https://contextgem.dev/quickstart.html#concept-extraction-from-aspect)
 - [Concept Extraction from Document (text)](https://contextgem.dev/quickstart.html#concept-extraction-from-document-text)
 - [Concept Extraction from Document (vision)](https://contextgem.dev/quickstart.html#concept-extraction-from-document-vision)
 - [LLM chat interface](https://contextgem.dev/quickstart.html#lightweight-llm-chat-interface)
 
-### Advanced usage examples
+**Advanced usage:**
 - [Extracting Aspects Containing Concepts](https://contextgem.dev/advanced_usage.html#extracting-aspects-with-concepts)
 - [Extracting Aspects and Concepts from a Document](https://contextgem.dev/advanced_usage.html#extracting-aspects-and-concepts-from-a-document)
 - [Using a Multi-LLM Pipeline to Extract Data from Several Documents](https://contextgem.dev/advanced_usage.html#using-a-multi-llm-pipeline-to-extract-data-from-several-documents)
@@ -302,15 +306,13 @@ docx_text = converter.convert_to_text_format(
 
 ```
 
-Learn more about [DOCX converter features](https://contextgem.dev/converters/docx.html) in the documentation.
-
+📖 Learn more about [DOCX converter features](https://contextgem.dev/converters/docx.html) in the documentation.
 
 ## 🎯 Focused document analysis
 
 ContextGem leverages LLMs' long context windows to deliver superior extraction accuracy from individual documents. Unlike RAG approaches that often [struggle with complex concepts and nuanced insights](https://www.linkedin.com/pulse/raging-contracts-pitfalls-rag-contract-review-shcherbak-ai-ptg3f), ContextGem capitalizes on continuously expanding context capacity, evolving LLM capabilities, and decreasing costs. This focused approach enables direct information extraction from complete documents, eliminating retrieval inconsistencies while optimizing for in-depth single-document analysis. While this delivers higher accuracy for individual documents, ContextGem does not currently support cross-document querying or corpus-wide retrieval - for these use cases, modern RAG systems (e.g., LlamaIndex, Haystack) remain more appropriate.
 
-Read more on [how ContextGem works](https://contextgem.dev/how_it_works.html) in the documentation.
-
+📖 Read more on [how ContextGem works](https://contextgem.dev/how_it_works.html) in the documentation.
 
 ## 🤖 Supported LLMs
 
@@ -320,8 +322,7 @@ ContextGem supports both cloud-based and local LLMs through [LiteLLM](https://gi
 - **Model Architectures**: Works with both reasoning/CoT-capable (e.g. o4-mini) and non-reasoning models (e.g. gpt-4.1)
 - **Simple API**: Unified interface for all LLMs with easy provider switching
 
-Learn more about [supported LLM providers and models](https://contextgem.dev/llms/supported_llms.html), how to [configure LLMs](https://contextgem.dev/llms/llm_config.html), and [LLM extraction methods](https://contextgem.dev/llms/llm_extraction_methods.html) in the documentation.
-
+📖 Learn more about [supported LLM providers and models](https://contextgem.dev/llms/supported_llms.html), how to [configure LLMs](https://contextgem.dev/llms/llm_config.html), and [LLM extraction methods](https://contextgem.dev/llms/llm_extraction_methods.html) in the documentation.
 
 ## ⚡ Optimizations
 
@@ -342,36 +343,35 @@ ContextGem allows you to save and load Document objects, pipelines, and LLM conf
 - Transfer extraction results between systems
 - Persist pipeline and LLM configurations for later reuse
 
-Learn more about [serialization options](https://contextgem.dev/serialization.html) in the documentation.
-
+📖 Learn more about [serialization options](https://contextgem.dev/serialization.html) in the documentation.
 
 ## 📚 Documentation
 
-Full documentation is available at [contextgem.dev](https://contextgem.dev).
-
-A raw text version of the full documentation is available at [`docs/docs-raw-for-llm.txt`](https://github.com/shcherbak-ai/contextgem/blob/main/docs/docs-raw-for-llm.txt). This file is automatically generated and contains all documentation in a format optimized for LLM ingestion (e.g. for Q&A).
+📖 **Full documentation:** [contextgem.dev](https://contextgem.dev)
 
-You can also explore the repository through [DeepWiki](https://deepwiki.com/shcherbak-ai/contextgem), an AI-powered conversational interface that provides visual architecture maps and natural language Q&A for the codebase.
+📄 **Raw documentation for LLMs:** Available at [`docs/docs-raw-for-llm.txt`](https://github.com/shcherbak-ai/contextgem/blob/main/docs/docs-raw-for-llm.txt) - automatically generated, optimized for LLM ingestion.
 
-For a history of changes, improvements, and bug fixes, see the [CHANGELOG](https://github.com/shcherbak-ai/contextgem/blob/main/CHANGELOG.md).
+🤖 **AI-powered code exploration:** [DeepWiki](https://deepwiki.com/shcherbak-ai/contextgem) provides visual architecture maps and natural language Q&A for the codebase.
 
+📈 **Change history:** See the [CHANGELOG](https://github.com/shcherbak-ai/contextgem/blob/main/CHANGELOG.md) for version history, improvements, and bug fixes.
 
 ## 💬 Community
 
-If you have a feature request or a bug report, feel free to [open an issue](https://github.com/shcherbak-ai/contextgem/issues/new) on GitHub. If you'd like to discuss a topic or get general advice on using ContextGem for your project, start a thread in [GitHub Discussions](https://github.com/shcherbak-ai/contextgem/discussions/new/).
+🐛 **Found a bug or have a feature request?** [Open an issue](https://github.com/shcherbak-ai/contextgem/issues/new) on GitHub.
 
+💭 **Need help or want to discuss?** Start a thread in [GitHub Discussions](https://github.com/shcherbak-ai/contextgem/discussions/new/).
 
 ## 🤝 Contributing
 
-We welcome contributions from the community - whether it's fixing a typo or developing a completely new feature! To get started, please check out our [Contributor Guidelines](https://github.com/shcherbak-ai/contextgem/blob/main/CONTRIBUTING.md).
+We welcome contributions from the community - whether it's fixing a typo or developing a completely new feature! 
 
+📋 **Get started:** Check out our [Contributor Guidelines](https://github.com/shcherbak-ai/contextgem/blob/main/CONTRIBUTING.md).
 
 ## 🔐 Security
 
 This project is automatically scanned for security vulnerabilities using [CodeQL](https://codeql.github.com/). We also use [Snyk](https://snyk.io) as needed for supplementary dependency checks.
 
-See [SECURITY](https://github.com/shcherbak-ai/contextgem/blob/main/SECURITY.md) file for details.
-
+🛡️ **Security policy:** See the [SECURITY](https://github.com/shcherbak-ai/contextgem/blob/main/SECURITY.md) file for details.
 
 ## 💖 Acknowledgements
 
@@ -388,17 +388,20 @@ ContextGem relies on these excellent open-source packages:
 
 ## 🌱 Support the project
 
-ContextGem is just getting started, and your support means the world to us! If you find ContextGem useful, the best way to help is by sharing it with others and giving the project a ⭐. Your feedback and contributions are what make this project grow!
+ContextGem is just getting started, and your support means the world to us! 
 
+⭐ **Star the project** if you find ContextGem useful  
+📢 **Share it** with others who might benefit  
+🔧 **Contribute** with feedback, issues, or code improvements
 
-## 📄 License & Contact
+Your engagement is what makes this project grow!
 
-This project is licensed under the Apache 2.0 License - see the [LICENSE](https://github.com/shcherbak-ai/contextgem/blob/main/LICENSE) and [NOTICE](https://github.com/shcherbak-ai/contextgem/blob/main/NOTICE) files for details.
+## 📄 License & Contact
 
-Copyright © 2025 [Shcherbak AI AS](https://shcherbak.ai), an AI engineering company building tools for AI/ML/NLP developers.
+**License:** Apache 2.0 License - see the [LICENSE](https://github.com/shcherbak-ai/contextgem/blob/main/LICENSE) and [NOTICE](https://github.com/shcherbak-ai/contextgem/blob/main/NOTICE) files for details.
 
-Shcherbak AI is now part of Microsoft for Startups.
+**Copyright:** © 2025 [Shcherbak AI AS](https://shcherbak.ai), an AI engineering company building tools for AI/ML/NLP developers.
 
-[Connect with us on LinkedIn](https://www.linkedin.com/in/sergii-shcherbak-10068866/) for questions or collaboration ideas.
+**Connect:** [LinkedIn](https://www.linkedin.com/in/sergii-shcherbak-10068866/) for questions or collaboration ideas.
 
 Built with ❤️ in Oslo, Norway.
@@ -20,7 +20,7 @@
 ContextGem - Effortless LLM extraction from documents
 """
 
-__version__ = "0.5.0"
+__version__ = "0.6.0"
 __author__ = "Shcherbak AI AS"
 
 from contextgem.public import (
@@ -36,6 +36,7 @@
     JsonObjectClassStruct,
     JsonObjectConcept,
     JsonObjectExample,
+    LabelConcept,
     LLMPricing,
     NumericalConcept,
     Paragraph,
@@ -58,6 +59,7 @@
     "RatingConcept",
     "JsonObjectConcept",
     "DateConcept",
+    "LabelConcept",
     # Documents
     "Document",
     # Pipelines
 
@@ -52,11 +52,13 @@
     _IntegerItem,
     _IntegerOrFloatItem,
     _JsonObjectItem,
+    _LabelItem,
     _StringItem,
 )
 from contextgem.internal.llm_output_structs import (
     _get_aspect_extraction_output_struct,
     _get_concept_extraction_output_struct,
+    _LabelConceptItemValueModel,
 )
 from contextgem.internal.loggers import logger
 from contextgem.internal.typings import (
@@ -119,6 +121,7 @@
     # LLM output structs
     "_get_aspect_extraction_output_struct",
     "_get_concept_extraction_output_struct",
+    "_LabelConceptItemValueModel",
     # Typings
     "NonEmptyStr",
     "LLMRoleAny",
@@ -162,6 +165,7 @@
     "_BooleanItem",
     "_JsonObjectItem",
     "_DateItem",
+    "_LabelItem",
     # Logging
     "logger",
     # Utils
 
@@ -1465,9 +1465,15 @@ def merge_usage_data(existing: _LLMUsage | None, new: _LLMUsage) -> _LLMUsage:
                     if add_justifications or add_references:
                         for i in concept_dict["extracted_items"]:
                             # Process the item value with a custom function on the concept
-                            i["value"] = relevant_concept._process_item_value(
-                                i["value"]
-                            )
+                            try:
+                                i["value"] = relevant_concept._process_item_value(
+                                    i["value"]
+                                )
+                            except ValueError as e:
+                                logger.error(
+                                    f"Error processing extracted item value: {e}"
+                                )
+                                return None, all_usage_data
                             concept_extracted_item_kwargs = {"value": i["value"]}
                             if add_justifications:
                                 concept_extracted_item_kwargs["justification"] = i[
@@ -1569,7 +1575,13 @@ def merge_usage_data(existing: _LLMUsage | None, new: _LLMUsage) -> _LLMUsage:
                     else:
                         for i in concept_dict["extracted_items"]:
                             # Process the item value with a custom function on the concept
-                            i = relevant_concept._process_item_value(i)
+                            try:
+                                i = relevant_concept._process_item_value(i)
+                            except ValueError as e:
+                                logger.error(
+                                    f"Error processing extracted item value: {e}"
+                                )
+                                return None, all_usage_data
                             sources_mapper[relevant_concept.unique_id][
                                 "extracted_items"
                             ].append(relevant_concept._item_class(value=i))
 
@@ -208,3 +208,32 @@ def from_dict(cls, obj_dict: dict[str, Any]) -> Self:
 
         # Use the parent class's from_dict method
         return super().from_dict(obj_dict_copy)
+
+
+class _LabelItem(_ExtractedItem):
+    """
+    Represents an extracted item that holds a list of label values.
+
+    :ivar value: A list of label strings. Always returns a list for API consistency,
+        containing one or more labels depending on the classification type.
+    :type value: list[NonEmptyStr]
+    """
+
+    value: list[NonEmptyStr] = Field(..., min_length=1, frozen=True)  # required; at least one label; field is frozen
+
+    @field_validator("value")  # validator registered for the "value" field
+    @classmethod
+    def _validate_value(cls, value: list[NonEmptyStr]) -> list[NonEmptyStr]:
+        """
+        Validates the input list of labels. Ensures there are no duplicates in the list.
+
+        :param value: List of label strings to validate.
+        :type value: list[NonEmptyStr]
+        :return: The same list provided as input, if it passes validation.
+        :rtype: list[NonEmptyStr]
+        :raises ValueError: If the list contains duplicate labels.
+        """
+        if len(value) != len(set(value)):  # a set collapses repeats, so unequal lengths mean duplicates
+            raise ValueError("_LabelItem value cannot contain duplicate labels.")
+
+        return value  # returned unchanged (order preserved)
@@ -21,6 +21,7 @@
 )
 from contextgem.internal.llm_output_structs.concept_structs import (
     _get_concept_extraction_output_struct,
+    _LabelConceptItemValueModel,
 )
 from contextgem.internal.llm_output_structs.utils import _create_root_model
 
@@ -31,4 +32,5 @@
     "_get_aspect_extraction_output_struct",
     # Concept structs
     "_get_concept_extraction_output_struct",
+    "_LabelConceptItemValueModel",
 ]
Original file line number	Diff line number	Diff line change
`@@ -21,6 +21,7 @@`
`21`	`21`	`)`
`22`	`22`	`from contextgem.internal.llm_output_structs.concept_structs import (`
`23`	`23`	`_get_concept_extraction_output_struct,`
	`24`	`+ _LabelConceptItemValueModel,`
`24`	`25`	`)`
`25`	`26`	`from contextgem.internal.llm_output_structs.utils import _create_root_model`
`26`	`27`
`@@ -31,4 +32,5 @@`
`31`	`32`	`"_get_aspect_extraction_output_struct",`
`32`	`33`	`# Concept structs`
`33`	`34`	`"_get_concept_extraction_output_struct",`
	`35`	`+ "_LabelConceptItemValueModel",`
`34`	`36`	`]`