diff --git a/generators/__init__.py b/generators/__init__.py index 43db4b7b..b3cb7845 100644 --- a/generators/__init__.py +++ b/generators/__init__.py @@ -69,6 +69,8 @@ nltk_ngram_generator ) +from .phonetic_transcription import phonetic_transcriptor + router = APIRouter() for module in [ @@ -106,7 +108,8 @@ newline_splitter, tiktoken_token_counter, noun_splitter, - nltk_ngram_generator, + nltk_ngram_generator, + phonetic_transcriptor ]: module_name = module.__name__.split(".")[-1] model_name = ( diff --git a/generators/phonetic_transcription/phonetic_transcriptor/README.md b/generators/phonetic_transcription/phonetic_transcriptor/README.md new file mode 100644 index 00000000..4589e368 --- /dev/null +++ b/generators/phonetic_transcription/phonetic_transcriptor/README.md @@ -0,0 +1,12 @@ + +This epitran-based phonetic transcriptor generates phonetic transcriptions of words in a given text. Additionally, it utilizes the CEDICT dictionary database, which provides English definitions for Chinese characters, and the jieba tokenizer for the Chinese language. + +Dependencies: +- Epitran: Library for transliterating orthographic text to IPA (International Phonetic Alphabet). Link: https://github.com/dmort27/epitran +- CEDICT: Comprehensive English-Chinese dictionary database. Link: https://www.mdbg.net/chinese/dictionary?page=cedict +- Jieba: Chinese text segmentation tool. Link: https://github.com/fxsjy/jieba + +Usage: +To generate phonetic transcriptions, provide the text and a language code in the format of ISO 639-3 language codes and ISO 15924 script codes (e.g., "eng-Latn"). Supported languages and code formats can be found in the documentation for the epitran library: (https://github.com/dmort27/epitran?tab=readme-ov-file#language-support). 
import re

import epitran
import jieba
from pydantic import BaseModel

INPUT_EXAMPLE = {
    "text": "Bright violets grow along the stream.",
    "language_code": "eng-Latn",
}

# Language codes that are segmented with jieba instead of a script regex.
_CHINESE_CODES = ("cmn-Hans", "cmn-Hant")

# CJK Unified Ideographs, Extension A and Extension B.
# Extension B lies outside the BMP and therefore needs the 8-digit \U escape:
# "\u20000" would be read as U+2000 followed by the literal character "0".
_HAN_PATTERN = r"[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df]+"

# CEDICT file handed to epitran for Chinese.
# NOTE(review): relative path, so it is resolved against the working
# directory of the process - confirm the file is shipped there.
_CEDICT_PATH = "cedict_1_0_ts_utf-8_mdbg.txt"

# Script part of the language code -> regex matching one token in that script.
_SCRIPT_TOKEN_PATTERNS = {
    "Arab": r"[\u0600-\u06FF]+",
    "Beng": r"[\u0980-\u09FF]+",
    "Cyrl": r"[\u0400-\u04FF]+",
    "Deva": r"[\u0900-\u0963\u0966-\u097F]+",
    "Ethi": r"[\u1200-\u137F]+",
    "Guru": r"[\u0A00-\u0A7F]+",
    "Khmr": r"[\u1780-\u17FF]+",
    "Laoo": r"[\u0E80-\u0EFF]+",
    "Latn": r"\b\w+\b",
    "Mlym": r"[\u0D00-\u0D7F]+",
    "Mymr": r"[\u1000-\u109F]+",
    "Orya": r"[\u0B00-\u0B7F]+",
    "Sinh": r"[\u0D80-\u0DFF]+",
    "Syrc": r"[\u0700-\u074F]+",
    "Taml": r"[\u0B80-\u0BFF]+",
    "Telu": r"[\u0C00-\u0C7F]+",
    "Thai": r"[\u0E00-\u0E7F]+",
}


class PhoneticTranscriptorModel(BaseModel):
    # Text to transcribe.
    text: str
    # Epitran language code such as "eng-Latn" (language, then script).
    language_code: str

    class Config:
        schema_extra = {"example": INPUT_EXAMPLE}


def _tokenize(text, language_code):
    """Split ``text`` into tokens according to the script of ``language_code``.

    Raises ValueError when the code has no script part or the script has no
    tokenization rule.
    """
    if language_code in _CHINESE_CODES:
        # Keep only Han characters, then let jieba find the word boundaries.
        han_only = "".join(re.findall(_HAN_PATTERN, text))
        return list(jieba.cut(han_only))

    code_parts = language_code.split("-")
    if len(code_parts) < 2:
        raise ValueError(
            f"Invalid language code {language_code!r}: "
            "expected '<language>-<script>', e.g. 'eng-Latn'."
        )
    script = code_parts[1]
    pattern = _SCRIPT_TOKEN_PATTERNS.get(script)
    if pattern is None:
        raise ValueError(
            f"Unsupported script {script!r} in language code {language_code!r}; "
            f"supported scripts: {', '.join(sorted(_SCRIPT_TOKEN_PATTERNS))}."
        )
    return re.findall(pattern, text)


def phonetic_transcriptor(req: PhoneticTranscriptorModel):
    """Generate a phonetic transcription for each word of a given text.

    The text is tokenized according to the script named in
    ``req.language_code`` (jieba for ``cmn-Hans``/``cmn-Hant``, a Unicode
    range regex otherwise) and every token is transliterated with epitran.

    Args:
        req: Request carrying ``text`` and ``language_code``.

    Returns:
        A dict with ``"tokens"`` (the extracted tokens) and
        ``"phonetic_transcriptions"`` (one transcription per token, in the
        same order).

    Raises:
        ValueError: If the language code has no script part or names a
            script without a tokenization rule.
    """
    language_code = req.language_code
    # Tokenize first so invalid codes fail before an Epitran object is built.
    tokens = _tokenize(req.text, language_code)

    if language_code in _CHINESE_CODES:
        epi = epitran.Epitran(language_code, cedict_file=_CEDICT_PATH)
    else:
        epi = epitran.Epitran(language_code)

    return {
        "tokens": tokens,
        "phonetic_transcriptions": [epi.transliterate(token) for token in tokens],
    }
/dev/null +++ b/generators/phonetic_transcription/phonetic_transcriptor/code_snippet_backup.md @@ -0,0 +1,90 @@ +```python +import epitran +import jieba +import re + +# replace this list with a list containing your data +text = ["A gentle breeze sways the golden wheat."] + +# add the texts to a dict called records. Add further information as key-value pairs if needed +record = { + "text": text, + "language_code": "eng-Latn" +} + +def phonetic_transcriptor(record): + + text = record["text"][0] + language_code = record["language_code"] + # Tokenize based on language group + language_group = language_code.split('-')[1] + # Handle the case of Chinese lanugage separatly + if language_code in ('cmn-Hans', 'cmn-Hant'): + text_without_punctuation = ''.join(re.findall(r'[\u4e00-\u9fff\u3400-\u4dbf\u20000-\u2a6df]+', text)) + # Use tokenizer for Chinese + tokens = [*jieba.cut(text_without_punctuation)] + # Provide path to CEDICT - dictionary database that provides English definitions for Chinese characters + cedict_path = "cedict_1_0_ts_utf-8_mdbg.txt" + epi = epitran.Epitran(language_code, cedict_file=cedict_path) + result = [epi.transliterate(token) for token in tokens] + return {"tokens": tokens, + "phonetic_transcriptions": result} + + if language_group == 'Arab': + # Tokenize Arabic script using regex + tokens = re.findall(r'[\u0600-\u06FF]+', text) + elif language_group == 'Beng': + # Tokenize Bengali script using regex + tokens = re.findall(r'[\u0980-\u09FF]+', text) + elif language_group == 'Cyrl': + # Tokenize Cyrillic script using regex + tokens = re.findall(r'[\u0400-\u04FF]+', text) + elif language_group == 'Deva': + # Tokenize Devanagari script using regex + tokens = re.findall(r'[\u0900-\u0963\u0966-\u097F]+', text) + elif language_group == 'Ethi': + # Tokenize Ethiopic script using regex + tokens = re.findall(r'[\u1200-\u137F]+', text) + elif language_group == 'Guru': + # Tokenize Gurmukhi script using regex + tokens = re.findall(r'[\u0A00-\u0A7F]+', text) + 
import re
from typing import Dict, List

# Language codes that are segmented with jieba instead of a script regex.
_CHINESE_CODES = ("cmn-Hans", "cmn-Hant")

# CJK Unified Ideographs, Extension A and Extension B.
# Extension B lies outside the BMP and therefore needs the 8-digit \U escape:
# "\u20000" would be read as U+2000 followed by the literal character "0".
_HAN_PATTERN = r"[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df]+"

# CEDICT file handed to epitran for Chinese.
# NOTE(review): relative path, so it is resolved against the current working
# directory - confirm the file is available there.
_CEDICT_PATH = "cedict_1_0_ts_utf-8_mdbg.txt"

# Script part of the language code -> regex matching one token in that script.
_SCRIPT_TOKEN_PATTERNS = {
    "Arab": r"[\u0600-\u06FF]+",
    "Beng": r"[\u0980-\u09FF]+",
    "Cyrl": r"[\u0400-\u04FF]+",
    "Deva": r"[\u0900-\u0963\u0966-\u097F]+",
    "Ethi": r"[\u1200-\u137F]+",
    "Guru": r"[\u0A00-\u0A7F]+",
    "Khmr": r"[\u1780-\u17FF]+",
    "Laoo": r"[\u0E80-\u0EFF]+",
    "Latn": r"\b\w+\b",
    "Mlym": r"[\u0D00-\u0D7F]+",
    "Mymr": r"[\u1000-\u109F]+",
    "Orya": r"[\u0B00-\u0B7F]+",
    "Sinh": r"[\u0D80-\u0DFF]+",
    "Syrc": r"[\u0700-\u074F]+",
    "Taml": r"[\u0B80-\u0BFF]+",
    "Telu": r"[\u0C00-\u0C7F]+",
    "Thai": r"[\u0E00-\u0E7F]+",
}


def phonetic_transcriptor(text: str, language_code: str) -> Dict[str, List[str]]:
    """
    Generate phonetic transcription of words in the given text.

    @param text: Text for which to generate phonetic transcription.
    @param language_code: Language code formatted as a combination of an
        ISO 639-3 language code and an ISO 15924 script code, e.g. "eng-Latn".
        Supported languages and code formats are those of the epitran library
        this function relies on
        (https://github.com/dmort27/epitran?tab=readme-ov-file#language-support).

    @return: A dictionary with two keys:
        - "tokens": a list of words (or tokens) extracted from the input text.
        - "phonetic_transcriptions": the phonetic transcription of each token,
          in the same order.

    @raise ValueError: If the language code has no script part or names a
        script for which no tokenization rule exists.

    Example (output shown is illustrative):
        phonetic_transcriptor("hello world", "eng-Latn")
        {'tokens': ['hello', 'world'], 'phonetic_transcriptions': ['həˈloʊ', 'wɜrld']}
    """
    is_chinese = language_code in _CHINESE_CODES
    script = None
    if not is_chinese:
        code_parts = language_code.split("-")
        if len(code_parts) < 2:
            raise ValueError(
                f"Invalid language code {language_code!r}: "
                "expected '<language>-<script>', e.g. 'eng-Latn'."
            )
        script = code_parts[1]
        if script not in _SCRIPT_TOKEN_PATTERNS:
            raise ValueError(
                f"Unsupported script {script!r} in language code {language_code!r}; "
                f"supported scripts: {', '.join(sorted(_SCRIPT_TOKEN_PATTERNS))}."
            )

    # Third-party packages are imported only after the input has been
    # validated, so a bad language code fails fast with a clear message.
    import epitran

    if is_chinese:
        # jieba is needed for Chinese word segmentation only.
        import jieba

        # Keep only Han characters, then let jieba find the word boundaries.
        han_only = "".join(re.findall(_HAN_PATTERN, text))
        tokens = list(jieba.cut(han_only))
        epi = epitran.Epitran(language_code, cedict_file=_CEDICT_PATH)
    else:
        tokens = re.findall(_SCRIPT_TOKEN_PATTERNS[script], text)
        epi = epitran.Epitran(language_code)

    return {
        "tokens": tokens,
        "phonetic_transcriptions": [epi.transliterate(token) for token in tokens],
    }

# ↑ necessary bricks function
import json
import re

import epitran
import jieba

ATTRIBUTE: str = "text"
LANGUAGE_CODE: str = "eng-Latn"

# Language codes that are segmented with jieba instead of a script regex.
_CHINESE_CODES = ("cmn-Hans", "cmn-Hant")

# CJK Unified Ideographs, Extension A and Extension B.
# Extension B lies outside the BMP and therefore needs the 8-digit \U escape:
# "\u20000" would be read as U+2000 followed by the literal character "0".
_HAN_PATTERN = r"[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df]+"

# CEDICT file handed to epitran for Chinese.
# NOTE(review): relative path, so it is resolved against the current working
# directory - confirm the file is available there.
_CEDICT_PATH = "cedict_1_0_ts_utf-8_mdbg.txt"

# Script part of the language code -> regex matching one token in that script.
_SCRIPT_TOKEN_PATTERNS = {
    "Arab": r"[\u0600-\u06FF]+",
    "Beng": r"[\u0980-\u09FF]+",
    "Cyrl": r"[\u0400-\u04FF]+",
    "Deva": r"[\u0900-\u0963\u0966-\u097F]+",
    "Ethi": r"[\u1200-\u137F]+",
    "Guru": r"[\u0A00-\u0A7F]+",
    "Khmr": r"[\u1780-\u17FF]+",
    "Laoo": r"[\u0E80-\u0EFF]+",
    "Latn": r"\b\w+\b",
    "Mlym": r"[\u0D00-\u0D7F]+",
    "Mymr": r"[\u1000-\u109F]+",
    "Orya": r"[\u0B00-\u0B7F]+",
    "Sinh": r"[\u0D80-\u0DFF]+",
    "Syrc": r"[\u0700-\u074F]+",
    "Taml": r"[\u0B80-\u0BFF]+",
    "Telu": r"[\u0C00-\u0C7F]+",
    "Thai": r"[\u0E00-\u0E7F]+",
}


def phonetic_transcriptor(record):
    """Generate a phonetic transcription for each word of one record attribute.

    Reads the attribute named by ``ATTRIBUTE`` from ``record``, tokenizes it
    according to the script of ``LANGUAGE_CODE`` and transliterates every
    token with epitran.

    Args:
        record: Mapping from attribute name to attribute value.

    Returns:
        A JSON string (non-ASCII characters kept as-is) encoding an object
        with the keys ``"tokens"`` and ``"phonetic_transcriptions"``.

    Raises:
        ValueError: If ``LANGUAGE_CODE`` has no script part or names a script
            for which no tokenization rule exists.
    """
    # Look the attribute up through the ATTRIBUTE constant; the literal key
    # "ATTRIBUTE" would ignore the configured attribute name.
    # NOTE(review): the value is expected to expose its string as `.text`
    # (presumably a spaCy Doc) - confirm against the caller.
    text = record[ATTRIBUTE].text

    if LANGUAGE_CODE in _CHINESE_CODES:
        # Keep only Han characters, then let jieba find the word boundaries.
        han_only = "".join(re.findall(_HAN_PATTERN, text))
        tokens = list(jieba.cut(han_only))
        epi = epitran.Epitran(LANGUAGE_CODE, cedict_file=_CEDICT_PATH)
    else:
        code_parts = LANGUAGE_CODE.split("-")
        if len(code_parts) < 2:
            raise ValueError(
                f"Invalid language code {LANGUAGE_CODE!r}: "
                "expected '<language>-<script>', e.g. 'eng-Latn'."
            )
        script = code_parts[1]
        pattern = _SCRIPT_TOKEN_PATTERNS.get(script)
        if pattern is None:
            raise ValueError(
                f"Unsupported script {script!r} in language code {LANGUAGE_CODE!r}; "
                f"supported scripts: {', '.join(sorted(_SCRIPT_TOKEN_PATTERNS))}."
            )
        tokens = re.findall(pattern, text)
        epi = epitran.Epitran(LANGUAGE_CODE)

    result = [epi.transliterate(token) for token in tokens]
    return json.dumps(
        {"tokens": tokens, "phonetic_transcriptions": result},
        ensure_ascii=False,
    )