From 483b83ce30119247e7ed401bdd572d9fdeda483a Mon Sep 17 00:00:00 2001 From: springlaughing Date: Thu, 8 Feb 2024 03:47:25 +0100 Subject: [PATCH 1/4] New directory in generators added, containing files for new phonetic_transcription brick --- generators/phonetic_transcription/README.md | 1 + generators/phonetic_transcription/__init__.py | 92 ++++++++++++++++++ .../code_snippet_backup.md | 90 ++++++++++++++++++ .../code_snippet_common.md | 94 +++++++++++++++++++ .../code_snippet_refinery.md | 79 ++++++++++++++++ generators/phonetic_transcription/config.py | 33 +++++++ 6 files changed, 389 insertions(+) create mode 100644 generators/phonetic_transcription/README.md create mode 100644 generators/phonetic_transcription/__init__.py create mode 100644 generators/phonetic_transcription/code_snippet_backup.md create mode 100644 generators/phonetic_transcription/code_snippet_common.md create mode 100644 generators/phonetic_transcription/code_snippet_refinery.md create mode 100644 generators/phonetic_transcription/config.py diff --git a/generators/phonetic_transcription/README.md b/generators/phonetic_transcription/README.md new file mode 100644 index 00000000..92bb4639 --- /dev/null +++ b/generators/phonetic_transcription/README.md @@ -0,0 +1 @@ +Reduces all tokens in a text to their base form with the use of a vocabulary and morphological analysis of the tokens. 
Uses a spaCy model, see official documentation here: https://spacy.io/api/lemmatizer \ No newline at end of file diff --git a/generators/phonetic_transcription/__init__.py b/generators/phonetic_transcription/__init__.py new file mode 100644 index 00000000..7dfa9e06 --- /dev/null +++ b/generators/phonetic_transcription/__init__.py @@ -0,0 +1,92 @@ +from pydantic import BaseModel +import epitran +import jieba +import re + +INPUT_EXAMPLE = {"text": "Bright violets grow along the stream.", + "language_code": "eng-Latn"} + + +class PhoneticTranscription(BaseModel): + text: str + language_code: str + + class Config: + schema_extra = {"example": INPUT_EXAMPLE} + + +def phonetic_transcriptor(req: PhoneticTranscription): + """Generates phonetic transcription of each word from a given text. """ + text = req.text + language_code = req.language_code + # Tokenize based on language group + language_group = req.language_code.split('-')[1] + # Handle the case of Chinese language separately + if language_code in ('cmn-Hans', 'cmn-Hant'): + text_without_punctuation = ''.join(re.findall(r'[\u4e00-\u9fff\u3400-\u4dbf\u20000-\u2a6df]+', text)) + # Use tokenizer for Chinese + tokens = [*jieba.cut(text_without_punctuation)] + # Provide path to CEDICT - dictionary database that provides English definitions for Chinese characters + cedict_path = "cedict_1_0_ts_utf-8_mdbg.txt" + epi = epitran.Epitran(language_code, cedict_file=cedict_path) + result = [epi.transliterate(token) for token in tokens] + return {"tokens": tokens, + "phonetic_transcriptions": result} + + if language_group == 'Arab': + # Tokenize Arabic script using regex + tokens = re.findall(r'[\u0600-\u06FF]+', text) + elif language_group == 'Beng': + # Tokenize Bengali script using regex + tokens = re.findall(r'[\u0980-\u09FF]+', text) + elif language_group == 'Cyrl': + # Tokenize Cyrillic script using regex + tokens = re.findall(r'[\u0400-\u04FF]+', text) + elif language_group == 'Deva': + # Tokenize Devanagari script using 
regex + tokens = re.findall(r'[\u0900-\u0963\u0966-\u097F]+', text) + elif language_group == 'Ethi': + # Tokenize Ethiopic script using regex + tokens = re.findall(r'[\u1200-\u137F]+', text) + elif language_group == 'Guru': + # Tokenize Gurmukhi script using regex + tokens = re.findall(r'[\u0A00-\u0A7F]+', text) + elif language_group == 'Khmr': + # Tokenize Khmer script using regex + tokens = re.findall(r'[\u1780-\u17FF]+', text) + elif language_group == 'Laoo': + # Tokenize Lao script using regex + tokens = re.findall(r'[\u0E80-\u0EFF]+', text) + elif language_group == 'Latn': + # Tokenize Latin script using + tokens = re.findall(r'\b\w+\b', text) + elif language_group == 'Mlym': + # Tokenize Malayalam script using + tokens = re.findall(r'[\u0D00-\u0D7F]+', text) + elif language_group == 'Mymr': + # Tokenize Burmese script using regex + tokens = re.findall(r'[\u1000-\u109F]+', text) + elif language_group == 'Orya': + # Tokenize Oriya script using regex + tokens = re.findall(r'[\u0B00-\u0B7F]+', text) + elif language_group == 'Sinh': + # Tokenize Sinhala script using regex + tokens = re.findall(r'[\u0D80-\u0DFF]+', text) + elif language_group == 'Syrc': + # Tokenize Syriac script using regex + tokens = re.findall(r'[\u0700-\u074F]+', text) + elif language_group == 'Taml': + # Tokenize Tamil script using regex + tokens = re.findall(r'[\u0B80-\u0BFF]+', text) + elif language_group == 'Telu': + # Tokenize Telugu script using regex + tokens = re.findall(r'[\u0C00-\u0C7F]+', text) + elif language_group == 'Thai': + # Tokenize Thai script using regex + tokens = re.findall(r'[\u0E00-\u0E7F]+', text) + + epi = epitran.Epitran(language_code) + result = [epi.transliterate(token) for token in tokens] + return {"tokens": tokens, + "phonetic_transcriptions": result} + diff --git a/generators/phonetic_transcription/code_snippet_backup.md b/generators/phonetic_transcription/code_snippet_backup.md new file mode 100644 index 00000000..6ac95ebe --- /dev/null +++ 
b/generators/phonetic_transcription/code_snippet_backup.md @@ -0,0 +1,90 @@ +```python +import epitran +import jieba +import re + +# replace this list with a list containing your data +text = ["A gentle breeze sways the golden wheat."] + +# add the texts to a dict called records. Add further information as key-value pairs if needed +record = { + "text": text, + "language_code": "eng-Latn" +} + +def phonetic_transcriptor(record): + + text = record["text"][0] + language_code = record["language_code"] + # Tokenize based on language group + language_group = language_code.split('-')[1] + # Handle the case of Chinese lanugage separatly + if language_code in ('cmn-Hans', 'cmn-Hant'): + text_without_punctuation = ''.join(re.findall(r'[\u4e00-\u9fff\u3400-\u4dbf\u20000-\u2a6df]+', text)) + # Use tokenizer for Chinese + tokens = [*jieba.cut(text_without_punctuation)] + # Provide path to CEDICT - dictionary database that provides English definitions for Chinese characters + cedict_path = "cedict_1_0_ts_utf-8_mdbg.txt" + epi = epitran.Epitran(language_code, cedict_file=cedict_path) + result = [epi.transliterate(token) for token in tokens] + return {"tokens": tokens, + "phonetic_transcriptions": result} + + if language_group == 'Arab': + # Tokenize Arabic script using regex + tokens = re.findall(r'[\u0600-\u06FF]+', text) + elif language_group == 'Beng': + # Tokenize Bengali script using regex + tokens = re.findall(r'[\u0980-\u09FF]+', text) + elif language_group == 'Cyrl': + # Tokenize Cyrillic script using regex + tokens = re.findall(r'[\u0400-\u04FF]+', text) + elif language_group == 'Deva': + # Tokenize Devanagari script using regex + tokens = re.findall(r'[\u0900-\u0963\u0966-\u097F]+', text) + elif language_group == 'Ethi': + # Tokenize Ethiopic script using regex + tokens = re.findall(r'[\u1200-\u137F]+', text) + elif language_group == 'Guru': + # Tokenize Gurmukhi script using regex + tokens = re.findall(r'[\u0A00-\u0A7F]+', text) + elif language_group == 'Khmr': + # 
Tokenize Khmer script using regex + tokens = re.findall(r'[\u1780-\u17FF]+', text) + elif language_group == 'Laoo': + # Tokenize Lao script using regex + tokens = re.findall(r'[\u0E80-\u0EFF]+', text) + elif language_group == 'Latn': + # Tokenize Latin script using + tokens = re.findall(r'\b\w+\b', text) + elif language_group == 'Mlym': + # Tokenize Malayalam script using + tokens = re.findall(r'[\u0D00-\u0D7F]+', text) + elif language_group == 'Mymr': + # Tokenize Burmese script using regex + tokens = re.findall(r'[\u1000-\u109F]+', text) + elif language_group == 'Orya': + # Tokenize Oriya script using regex + tokens = re.findall(r'[\u0B00-\u0B7F]+', text) + elif language_group == 'Sinh': + # Tokenize Sinhala script using regex + tokens = re.findall(r'[\u0D80-\u0DFF]+', text) + elif language_group == 'Syrc': + # Tokenize Syriac script using regex + tokens = re.findall(r'[\u0700-\u074F]+', text) + elif language_group == 'Taml': + # Tokenize Tamil script using regex + tokens = re.findall(r'[\u0B80-\u0BFF]+', text) + elif language_group == 'Telu': + # Tokenize Telugu script using regex + tokens = re.findall(r'[\u0C00-\u0C7F]+', text) + elif language_group == 'Thai': + # Tokenize Thai script using regex + tokens = re.findall(r'[\u0E00-\u0E7F]+', text) + + epi = epitran.Epitran(language_code) + result = [epi.transliterate(token) for token in tokens] + return {"tokens": tokens, + "phonetic_transcriptions": result} + +``` diff --git a/generators/phonetic_transcription/code_snippet_common.md b/generators/phonetic_transcription/code_snippet_common.md new file mode 100644 index 00000000..022cd7f4 --- /dev/null +++ b/generators/phonetic_transcription/code_snippet_common.md @@ -0,0 +1,94 @@ +```python +import epitran +import jieba +import re +from typing import Dict, List + +def phonetic_transcriptor(text: str, language_code: str) -> Dict[str, List[str]]: + # Tokenize based on language group + language_group = language_code.split('-')[1] + # Handle the case of Chinese 
lanugage separatly + if language_code in ('cmn-Hans', 'cmn-Hant'): + text_without_punctuation = ''.join(re.findall(r'[\u4e00-\u9fff\u3400-\u4dbf\u20000-\u2a6df]+', text)) + # Use tokenizer for Chinese + tokens = [*jieba.cut(text_without_punctuation)] + # Provide path to CEDICT - dictionary database that provides English definitions for Chinese characters + cedict_path = "cedict_1_0_ts_utf-8_mdbg.txt" + epi = epitran.Epitran(language_code, cedict_file=cedict_path) + result = [epi.transliterate(token) for token in tokens] + return {"tokens": tokens, + "phonetic_transcriptions": result} + + if language_group == 'Arab': + # Tokenize Arabic script using regex + tokens = re.findall(r'[\u0600-\u06FF]+', text) + elif language_group == 'Beng': + # Tokenize Bengali script using regex + tokens = re.findall(r'[\u0980-\u09FF]+', text) + elif language_group == 'Cyrl': + # Tokenize Cyrillic script using regex + tokens = re.findall(r'[\u0400-\u04FF]+', text) + elif language_group == 'Deva': + # Tokenize Devanagari script using regex + tokens = re.findall(r'[\u0900-\u0963\u0966-\u097F]+', text) + elif language_group == 'Ethi': + # Tokenize Ethiopic script using regex + tokens = re.findall(r'[\u1200-\u137F]+', text) + elif language_group == 'Guru': + # Tokenize Gurmukhi script using regex + tokens = re.findall(r'[\u0A00-\u0A7F]+', text) + elif language_group == 'Khmr': + # Tokenize Khmer script using regex + tokens = re.findall(r'[\u1780-\u17FF]+', text) + elif language_group == 'Laoo': + # Tokenize Lao script using regex + tokens = re.findall(r'[\u0E80-\u0EFF]+', text) + elif language_group == 'Latn': + # Tokenize Latin script using + tokens = re.findall(r'\b\w+\b', text) + elif language_group == 'Mlym': + # Tokenize Malayalam script using + tokens = re.findall(r'[\u0D00-\u0D7F]+', text) + elif language_group == 'Mymr': + # Tokenize Burmese script using regex + tokens = re.findall(r'[\u1000-\u109F]+', text) + elif language_group == 'Orya': + # Tokenize Oriya script using regex + 
tokens = re.findall(r'[\u0B00-\u0B7F]+', text) + elif language_group == 'Sinh': + # Tokenize Sinhala script using regex + tokens = re.findall(r'[\u0D80-\u0DFF]+', text) + elif language_group == 'Syrc': + # Tokenize Syriac script using regex + tokens = re.findall(r'[\u0700-\u074F]+', text) + elif language_group == 'Taml': + # Tokenize Tamil script using regex + tokens = re.findall(r'[\u0B80-\u0BFF]+', text) + elif language_group == 'Telu': + # Tokenize Telugu script using regex + tokens = re.findall(r'[\u0C00-\u0C7F]+', text) + elif language_group == 'Thai': + # Tokenize Thai script using regex + tokens = re.findall(r'[\u0E00-\u0E7F]+', text) + + epi = epitran.Epitran(language_code) + result = [epi.transliterate(token) for token in tokens] + return {"tokens": tokens, + "phonetic_transcriptions": result} + +# ↑ necessary bricks function +# ----------------------------------------------------------------------------------------- +# ↓ example implementation + +def example_integration(): + texts = [ + "Clouds drift across the clear blue sky.", + "Wolken ziehen über den klaren blauen Himmel.", + "云彩在晴朗的蓝天中飘过。", + "बादल साफ़ नीले आकाश में तैरते हैं।"] + language_codes = ["eng-Latn", "deu-Latn-np", "cmn-Hans", "hin-Deva"] + for text, language_code in zip(texts, language_codes): + print(f"Phonetic transcription for \"{text}\" is: {phonetic_transcriptor(text, language_code)}") + +example_integration() +``` diff --git a/generators/phonetic_transcription/code_snippet_refinery.md b/generators/phonetic_transcription/code_snippet_refinery.md new file mode 100644 index 00000000..2c29922d --- /dev/null +++ b/generators/phonetic_transcription/code_snippet_refinery.md @@ -0,0 +1,79 @@ +```python +ATTRIBUTE: str = "text" +LANGUAGE_CODE: str = "eng-Latn" + +def phonetic_transcriptor(record): + + text = record[ATTRIBUTE].text + # Tokenize based on language group + language_group = LANGUAGE_CODE.split('-')[1] + # Handle the case of Chinese language separately + if LANGUAGE_CODE in 
('cmn-Hans', 'cmn-Hant'): + text_without_punctuation = ''.join(re.findall(r'[\u4e00-\u9fff\u3400-\u4dbf\u20000-\u2a6df]+', text)) + # Use tokenizer for Chinese + tokens = [*jieba.cut(text_without_punctuation)] + # Provide path to CEDICT - dictionary database that provides English definitions for Chinese characters + cedict_path = "cedict_1_0_ts_utf-8_mdbg.txt" + epi = epitran.Epitran(LANGUAGE_CODE, cedict_file=cedict_path) + result = [epi.transliterate(token) for token in tokens] + return {"tokens": tokens, + "phonetic_transcriptions": result} + + if language_group == 'Arab': + # Tokenize Arabic script using regex + tokens = re.findall(r'[\u0600-\u06FF]+', text) + elif language_group == 'Beng': + # Tokenize Bengali script using regex + tokens = re.findall(r'[\u0980-\u09FF]+', text) + elif language_group == 'Cyrl': + # Tokenize Cyrillic script using regex + tokens = re.findall(r'[\u0400-\u04FF]+', text) + elif language_group == 'Deva': + # Tokenize Devanagari script using regex + tokens = re.findall(r'[\u0900-\u0963\u0966-\u097F]+', text) + elif language_group == 'Ethi': + # Tokenize Ethiopic script using regex + tokens = re.findall(r'[\u1200-\u137F]+', text) + elif language_group == 'Guru': + # Tokenize Gurmukhi script using regex + tokens = re.findall(r'[\u0A00-\u0A7F]+', text) + elif language_group == 'Khmr': + # Tokenize Khmer script using regex + tokens = re.findall(r'[\u1780-\u17FF]+', text) + elif language_group == 'Laoo': + # Tokenize Lao script using regex + tokens = re.findall(r'[\u0E80-\u0EFF]+', text) + elif language_group == 'Latn': + # Tokenize Latin script using + tokens = re.findall(r'\b\w+\b', text) + elif language_group == 'Mlym': + # Tokenize Malayalam script using + tokens = re.findall(r'[\u0D00-\u0D7F]+', text) + elif language_group == 'Mymr': + # Tokenize Burmese script using regex + tokens = re.findall(r'[\u1000-\u109F]+', text) + elif language_group == 'Orya': + # Tokenize Oriya script using regex + tokens = re.findall(r'[\u0B00-\u0B7F]+', 
text) + elif language_group == 'Sinh': + # Tokenize Sinhala script using regex + tokens = re.findall(r'[\u0D80-\u0DFF]+', text) + elif language_group == 'Syrc': + # Tokenize Syriac script using regex + tokens = re.findall(r'[\u0700-\u074F]+', text) + elif language_group == 'Taml': + # Tokenize Tamil script using regex + tokens = re.findall(r'[\u0B80-\u0BFF]+', text) + elif language_group == 'Telu': + # Tokenize Telugu script using regex + tokens = re.findall(r'[\u0C00-\u0C7F]+', text) + elif language_group == 'Thai': + # Tokenize Thai script using regex + tokens = re.findall(r'[\u0E00-\u0E7F]+', text) + + epi = epitran.Epitran(LANGUAGE_CODE) + result = [epi.transliterate(token) for token in tokens] + return {"tokens": tokens, + "phonetic_transcriptions": result} + +``` diff --git a/generators/phonetic_transcription/config.py b/generators/phonetic_transcription/config.py new file mode 100644 index 00000000..ab611aa7 --- /dev/null +++ b/generators/phonetic_transcription/config.py @@ -0,0 +1,33 @@ +from util.configs import build_generator_function_config +from util.enums import State, RefineryDataType, BricksVariableType, SelectionType +from . 
import spacy_lemmatizer, INPUT_EXAMPLE + + +def get_config(): + return build_generator_function_config( + function=spacy_lemmatizer, + input_example=INPUT_EXAMPLE, + issue_id=228, + tabler_icon="Transform", + min_refinery_version="1.7.0", + state=State.PUBLIC.value, + type="python_function", + available_for=["refinery", "common"], + part_of_group=[ + "lemmatizer", + ], # first entry should be parent directory + # bricks integrator information + integrator_inputs={ + "name": "spacy_lemmatizer", + "refineryDataType": RefineryDataType.TEXT.value, + "variables": { + "ATTRIBUTE": { + "selectionType": SelectionType.CHOICE.value, + "addInfo": [ + BricksVariableType.ATTRIBUTE.value, + BricksVariableType.GENERIC_STRING.value, + ], + } + }, + }, + ) From b5ac38507f8731e068dc37716e772fe2c30707c5 Mon Sep 17 00:00:00 2001 From: springlaughing Date: Thu, 8 Feb 2024 20:50:57 +0100 Subject: [PATCH 2/4] Added phonetic_transcriptor, init, backup, common, refinery, config, readme files. This will be bricks issue 278. 
--- generators/phonetic_transcription/README.md | 1 - .../epitran_phonetic_transcriptor/README.md | 12 ++++++++++++ .../__init__.py | 0 .../code_snippet_backup.md | 0 .../code_snippet_common.md | 17 +++++++++++++++++ .../code_snippet_refinery.md | 13 +++++++++---- .../config.py | 18 ++++++++++++------ 7 files changed, 50 insertions(+), 11 deletions(-) delete mode 100644 generators/phonetic_transcription/README.md create mode 100644 generators/phonetic_transcription/epitran_phonetic_transcriptor/README.md rename generators/phonetic_transcription/{ => epitran_phonetic_transcriptor}/__init__.py (100%) rename generators/phonetic_transcription/{ => epitran_phonetic_transcriptor}/code_snippet_backup.md (100%) rename generators/phonetic_transcription/{ => epitran_phonetic_transcriptor}/code_snippet_common.md (80%) rename generators/phonetic_transcription/{ => epitran_phonetic_transcriptor}/code_snippet_refinery.md (92%) rename generators/phonetic_transcription/{ => epitran_phonetic_transcriptor}/config.py (66%) diff --git a/generators/phonetic_transcription/README.md b/generators/phonetic_transcription/README.md deleted file mode 100644 index 92bb4639..00000000 --- a/generators/phonetic_transcription/README.md +++ /dev/null @@ -1 +0,0 @@ -Reduces all tokens in a text to their base form with the use of a vocabulary and morphological analysis of the tokens. Uses a spaCy model, see official documentation here: https://spacy.io/api/lemmatizer \ No newline at end of file diff --git a/generators/phonetic_transcription/epitran_phonetic_transcriptor/README.md b/generators/phonetic_transcription/epitran_phonetic_transcriptor/README.md new file mode 100644 index 00000000..4589e368 --- /dev/null +++ b/generators/phonetic_transcription/epitran_phonetic_transcriptor/README.md @@ -0,0 +1,12 @@ + +This epitran-based phonetic transcriptor generates phonetic transcriptions of words in a given text. 
Additionally, it utilizes the CEDICT dictionary database, which provides English definitions for Chinese characters, and the jieba tokenizer for the Chinese language. + +Dependencies: +- Epitran: Library for transliterating orthographic text to IPA (International Phonetic Alphabet). Link: https://github.com/dmort27/epitran +- CEDICT: Comprehensive English-Chinese dictionary database. Link: https://www.mdbg.net/chinese/dictionary?page=cedict +- Jieba: Chinese text segmentation tool. Link: https://github.com/fxsjy/jieba + +Usage: +To generate phonetic transcriptions, provide the text and a language code in the format of ISO 639-3 language codes and ISO 15924 script codes (e.g., "eng-Latn"). Supported languages and code formats can be found in the documentation for the epitran library: (https://github.com/dmort27/epitran?tab=readme-ov-file#language-support). + + diff --git a/generators/phonetic_transcription/__init__.py b/generators/phonetic_transcription/epitran_phonetic_transcriptor/__init__.py similarity index 100% rename from generators/phonetic_transcription/__init__.py rename to generators/phonetic_transcription/epitran_phonetic_transcriptor/__init__.py diff --git a/generators/phonetic_transcription/code_snippet_backup.md b/generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_backup.md similarity index 100% rename from generators/phonetic_transcription/code_snippet_backup.md rename to generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_backup.md diff --git a/generators/phonetic_transcription/code_snippet_common.md b/generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_common.md similarity index 80% rename from generators/phonetic_transcription/code_snippet_common.md rename to generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_common.md index 022cd7f4..58b27b9f 100644 --- a/generators/phonetic_transcription/code_snippet_common.md +++ 
b/generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_common.md @@ -5,6 +5,23 @@ import re from typing import Dict, List def phonetic_transcriptor(text: str, language_code: str) -> Dict[str, List[str]]: + """ + Generate phonetic transcription of words in the given text. + + @param text: Text for which to generate phonetic transcription. + @param language_code: language codes formatted as a combination of ISO 639-3 language codes and ISO 15924 script codes, e.g. "eng-Latn". Supported languages and code formats are as in epitran library this function relies on (https://github.com/dmort27/epitran?tab=readme-ov-file#language-support). + + @return: A dictionary with two keys: + - "tokens": a list of words (or tokens) extracted from the input text. + - "phonetic_transcriptions": a corresponding list of phonetic transcriptions for each token. + + Example: + >>> phonetic_transcriptor("hello world", "eng-Latn") + {'tokens': ['hello', 'world'], 'phonetic_transcriptions': ['həˈloʊ', 'wɜrld']} + + Note: Unsupported language codes or invalid inputs may result in errors or empty transcriptions. + Performance may vary based on text length and complexity. 
+ """ # Tokenize based on language group language_group = language_code.split('-')[1] # Handle the case of Chinese lanugage separatly diff --git a/generators/phonetic_transcription/code_snippet_refinery.md b/generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_refinery.md similarity index 92% rename from generators/phonetic_transcription/code_snippet_refinery.md rename to generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_refinery.md index 2c29922d..a3433652 100644 --- a/generators/phonetic_transcription/code_snippet_refinery.md +++ b/generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_refinery.md @@ -1,4 +1,9 @@ ```python +import epitran +import jieba +import re +import json + ATTRIBUTE: str = "text" LANGUAGE_CODE: str = "eng-Latn" @@ -16,8 +21,8 @@ def phonetic_transcriptor(record): cedict_path = "cedict_1_0_ts_utf-8_mdbg.txt" epi = epitran.Epitran(LANGUAGE_CODE, cedict_file=cedict_path) result = [epi.transliterate(token) for token in tokens] - return {"tokens": tokens, - "phonetic_transcriptions": result} + return json.dumps({"tokens": tokens, + "phonetic_transcriptions": result}, ensure_ascii=False) if language_group == 'Arab': # Tokenize Arabic script using regex @@ -73,7 +78,7 @@ def phonetic_transcriptor(record): epi = epitran.Epitran(LANGUAGE_CODE) result = [epi.transliterate(token) for token in tokens] - return {"tokens": tokens, - "phonetic_transcriptions": result} + return json.dumps({"tokens": tokens, + "phonetic_transcriptions": result}, ensure_ascii=False) ``` diff --git a/generators/phonetic_transcription/config.py b/generators/phonetic_transcription/epitran_phonetic_transcriptor/config.py similarity index 66% rename from generators/phonetic_transcription/config.py rename to generators/phonetic_transcription/epitran_phonetic_transcriptor/config.py index ab611aa7..30dfffce 100644 --- a/generators/phonetic_transcription/config.py +++ 
b/generators/phonetic_transcription/epitran_phonetic_transcriptor/config.py @@ -1,24 +1,24 @@ from util.configs import build_generator_function_config from util.enums import State, RefineryDataType, BricksVariableType, SelectionType -from . import spacy_lemmatizer, INPUT_EXAMPLE +from . import phonetic_transcriptor, INPUT_EXAMPLE def get_config(): return build_generator_function_config( - function=spacy_lemmatizer, + function=phonetic_transcriptor, input_example=INPUT_EXAMPLE, - issue_id=228, - tabler_icon="Transform", + issue_id=278, + tabler_icon="AlphabetGreek", min_refinery_version="1.7.0", state=State.PUBLIC.value, type="python_function", available_for=["refinery", "common"], part_of_group=[ - "lemmatizer", + "phonetic_transcription", ], # first entry should be parent directory # bricks integrator information integrator_inputs={ - "name": "spacy_lemmatizer", + "name": "phonetic_transcriptor", "refineryDataType": RefineryDataType.TEXT.value, "variables": { "ATTRIBUTE": { @@ -27,6 +27,12 @@ def get_config(): BricksVariableType.ATTRIBUTE.value, BricksVariableType.GENERIC_STRING.value, ], + }, + "LANGUAGE_CODE": { + "selectionType": SelectionType.STRING.value, + "addInfo": [ + BricksVariableType.GENERIC_STRING.value + ] } }, }, From 3ef8879d2b495b0a22b4966e86b694547b13c3b5 Mon Sep 17 00:00:00 2001 From: springlaughing Date: Thu, 8 Feb 2024 21:18:56 +0100 Subject: [PATCH 3/4] Tested with FastAPI. 
--- generators/__init__.py | 5 ++++- .../README.md | 0 .../__init__.py | 4 ++-- .../code_snippet_backup.md | 0 .../code_snippet_common.md | 0 .../code_snippet_refinery.md | 0 .../config.py | 0 7 files changed, 6 insertions(+), 3 deletions(-) rename generators/phonetic_transcription/{epitran_phonetic_transcriptor => phonetic_transcriptor}/README.md (100%) rename generators/phonetic_transcription/{epitran_phonetic_transcriptor => phonetic_transcriptor}/__init__.py (97%) rename generators/phonetic_transcription/{epitran_phonetic_transcriptor => phonetic_transcriptor}/code_snippet_backup.md (100%) rename generators/phonetic_transcription/{epitran_phonetic_transcriptor => phonetic_transcriptor}/code_snippet_common.md (100%) rename generators/phonetic_transcription/{epitran_phonetic_transcriptor => phonetic_transcriptor}/code_snippet_refinery.md (100%) rename generators/phonetic_transcription/{epitran_phonetic_transcriptor => phonetic_transcriptor}/config.py (100%) diff --git a/generators/__init__.py b/generators/__init__.py index 43db4b7b..b3cb7845 100644 --- a/generators/__init__.py +++ b/generators/__init__.py @@ -69,6 +69,8 @@ nltk_ngram_generator ) +from .phonetic_transcription import phonetic_transcriptor + router = APIRouter() for module in [ @@ -106,7 +108,8 @@ newline_splitter, tiktoken_token_counter, noun_splitter, - nltk_ngram_generator, + nltk_ngram_generator, + phonetic_transcriptor ]: module_name = module.__name__.split(".")[-1] model_name = ( diff --git a/generators/phonetic_transcription/epitran_phonetic_transcriptor/README.md b/generators/phonetic_transcription/phonetic_transcriptor/README.md similarity index 100% rename from generators/phonetic_transcription/epitran_phonetic_transcriptor/README.md rename to generators/phonetic_transcription/phonetic_transcriptor/README.md diff --git a/generators/phonetic_transcription/epitran_phonetic_transcriptor/__init__.py b/generators/phonetic_transcription/phonetic_transcriptor/__init__.py similarity index 97% 
rename from generators/phonetic_transcription/epitran_phonetic_transcriptor/__init__.py rename to generators/phonetic_transcription/phonetic_transcriptor/__init__.py index 7dfa9e06..b678440f 100644 --- a/generators/phonetic_transcription/epitran_phonetic_transcriptor/__init__.py +++ b/generators/phonetic_transcription/phonetic_transcriptor/__init__.py @@ -7,7 +7,7 @@ "language_code": "eng-Latn"} -class PhoneticTranscription(BaseModel): +class PhoneticTranscriptorModel(BaseModel): text: str language_code: str @@ -15,7 +15,7 @@ class Config: schema_extra = {"example": INPUT_EXAMPLE} -def phonetic_transcriptor(req: PhoneticTranscription): +def phonetic_transcriptor(req: PhoneticTranscriptorModel): """Generates phonetic transcription of each word from a given text. """ text = req.text language_code = req.language_code diff --git a/generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_backup.md b/generators/phonetic_transcription/phonetic_transcriptor/code_snippet_backup.md similarity index 100% rename from generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_backup.md rename to generators/phonetic_transcription/phonetic_transcriptor/code_snippet_backup.md diff --git a/generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_common.md b/generators/phonetic_transcription/phonetic_transcriptor/code_snippet_common.md similarity index 100% rename from generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_common.md rename to generators/phonetic_transcription/phonetic_transcriptor/code_snippet_common.md diff --git a/generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_refinery.md b/generators/phonetic_transcription/phonetic_transcriptor/code_snippet_refinery.md similarity index 100% rename from generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_refinery.md rename to 
generators/phonetic_transcription/phonetic_transcriptor/code_snippet_refinery.md diff --git a/generators/phonetic_transcription/epitran_phonetic_transcriptor/config.py b/generators/phonetic_transcription/phonetic_transcriptor/config.py similarity index 100% rename from generators/phonetic_transcription/epitran_phonetic_transcriptor/config.py rename to generators/phonetic_transcription/phonetic_transcriptor/config.py From bf2c4a3382c7f631ad119233b401f04ef851b2f2 Mon Sep 17 00:00:00 2001 From: springlaughing Date: Thu, 8 Feb 2024 21:37:00 +0100 Subject: [PATCH 4/4] Added requirements. --- requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9f557191..1b147465 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,4 +28,6 @@ textacy==0.12.0 scikit-optimize==0.9.0 holidays==0.21.13 sumy==0.11.0 -tiktoken==0.4.0 \ No newline at end of file +tiktoken==0.4.0 +epitran==1.24 +jieba==0.42.1 \ No newline at end of file