From 483b83ce30119247e7ed401bdd572d9fdeda483a Mon Sep 17 00:00:00 2001 From: springlaughing Date: Thu, 8 Feb 2024 03:47:25 +0100 Subject: [PATCH 1/4] New directory in generators added, containing files for new phonetic_transcription brick --- generators/phonetic_transcription/README.md | 1 + generators/phonetic_transcription/__init__.py | 92 ++++++++++++++++++ .../code_snippet_backup.md | 90 ++++++++++++++++++ .../code_snippet_common.md | 94 +++++++++++++++++++ .../code_snippet_refinery.md | 79 ++++++++++++++++ generators/phonetic_transcription/config.py | 33 +++++++ 6 files changed, 389 insertions(+) create mode 100644 generators/phonetic_transcription/README.md create mode 100644 generators/phonetic_transcription/__init__.py create mode 100644 generators/phonetic_transcription/code_snippet_backup.md create mode 100644 generators/phonetic_transcription/code_snippet_common.md create mode 100644 generators/phonetic_transcription/code_snippet_refinery.md create mode 100644 generators/phonetic_transcription/config.py diff --git a/generators/phonetic_transcription/README.md b/generators/phonetic_transcription/README.md new file mode 100644 index 00000000..92bb4639 --- /dev/null +++ b/generators/phonetic_transcription/README.md @@ -0,0 +1 @@ +Reduces all tokens in a text to their base form with the use of a vocabulary and morphological analysis of the tokens. 
Uses a spaCy model, see official documentation here: https://spacy.io/api/lemmatizer \ No newline at end of file diff --git a/generators/phonetic_transcription/__init__.py b/generators/phonetic_transcription/__init__.py new file mode 100644 index 00000000..7dfa9e06 --- /dev/null +++ b/generators/phonetic_transcription/__init__.py @@ -0,0 +1,92 @@ +from pydantic import BaseModel +import epitran +import jieba +import re + +INPUT_EXAMPLE = {"text": "Bright violets grow along the stream.", + "language_code": "eng-Latn"} + + +class PhoneticTranscription(BaseModel): + text: str + language_code: str + + class Config: + schema_extra = {"example": INPUT_EXAMPLE} + + +def phonetic_transcriptor(req: PhoneticTranscription): + """Generates phonetic transcription of each word from a given text. """ + text = req.text + language_code = req.language_code + # Tokenize based on language group + language_group = req.language_code.split('-')[1] + # Handle the case of Chinese language separately + if language_code in ('cmn-Hans', 'cmn-Hant'): + text_without_punctuation = ''.join(re.findall(r'[\u4e00-\u9fff\u3400-\u4dbf\u20000-\u2a6df]+', text)) + # Use tokenizer for Chinese + tokens = [*jieba.cut(text_without_punctuation)] + # Provide path to CEDICT - dictionary database that provides English definitions for Chinese characters + cedict_path = "cedict_1_0_ts_utf-8_mdbg.txt" + epi = epitran.Epitran(language_code, cedict_file=cedict_path) + result = [epi.transliterate(token) for token in tokens] + return {"tokens": tokens, + "phonetic_transcriptions": result} + + if language_group == 'Arab': + # Tokenize Arabic script using regex + tokens = re.findall(r'[\u0600-\u06FF]+', text) + elif language_group == 'Beng': + # Tokenize Bengali script using regex + tokens = re.findall(r'[\u0980-\u09FF]+', text) + elif language_group == 'Cyrl': + # Tokenize Cyrillic script using regex + tokens = re.findall(r'[\u0400-\u04FF]+', text) + elif language_group == 'Deva': + # Tokenize Devanagari script using 
regex + tokens = re.findall(r'[\u0900-\u0963\u0966-\u097F]+', text) + elif language_group == 'Ethi': + # Tokenize Ethiopic script using regex + tokens = re.findall(r'[\u1200-\u137F]+', text) + elif language_group == 'Guru': + # Tokenize Gurmukhi script using regex + tokens = re.findall(r'[\u0A00-\u0A7F]+', text) + elif language_group == 'Khmr': + # Tokenize Khmer script using regex + tokens = re.findall(r'[\u1780-\u17FF]+', text) + elif language_group == 'Laoo': + # Tokenize Lao script using regex + tokens = re.findall(r'[\u0E80-\u0EFF]+', text) + elif language_group == 'Latn': + # Tokenize Latin script using + tokens = re.findall(r'\b\w+\b', text) + elif language_group == 'Mlym': + # Tokenize Malayalam script using + tokens = re.findall(r'[\u0D00-\u0D7F]+', text) + elif language_group == 'Mymr': + # Tokenize Burmese script using regex + tokens = re.findall(r'[\u1000-\u109F]+', text) + elif language_group == 'Orya': + # Tokenize Oriya script using regex + tokens = re.findall(r'[\u0B00-\u0B7F]+', text) + elif language_group == 'Sinh': + # Tokenize Sinhala script using regex + tokens = re.findall(r'[\u0D80-\u0DFF]+', text) + elif language_group == 'Syrc': + # Tokenize Syriac script using regex + tokens = re.findall(r'[\u0700-\u074F]+', text) + elif language_group == 'Taml': + # Tokenize Tamil script using regex + tokens = re.findall(r'[\u0B80-\u0BFF]+', text) + elif language_group == 'Telu': + # Tokenize Telugu script using regex + tokens = re.findall(r'[\u0C00-\u0C7F]+', text) + elif language_group == 'Thai': + # Tokenize Thai script using regex + tokens = re.findall(r'[\u0E00-\u0E7F]+', text) + + epi = epitran.Epitran(language_code) + result = [epi.transliterate(token) for token in tokens] + return {"tokens": tokens, + "phonetic_transcriptions": result} + diff --git a/generators/phonetic_transcription/code_snippet_backup.md b/generators/phonetic_transcription/code_snippet_backup.md new file mode 100644 index 00000000..6ac95ebe --- /dev/null +++ 
b/generators/phonetic_transcription/code_snippet_backup.md @@ -0,0 +1,90 @@ +```python +import epitran +import jieba +import re + +# replace this list with a list containing your data +text = ["A gentle breeze sways the golden wheat."] + +# add the texts to a dict called records. Add further information as key-value pairs if needed +record = { + "text": text, + "language_code": "eng-Latn" +} + +def phonetic_transcriptor(record): + + text = record["text"][0] + language_code = record["language_code"] + # Tokenize based on language group + language_group = language_code.split('-')[1] + # Handle the case of Chinese lanugage separatly + if language_code in ('cmn-Hans', 'cmn-Hant'): + text_without_punctuation = ''.join(re.findall(r'[\u4e00-\u9fff\u3400-\u4dbf\u20000-\u2a6df]+', text)) + # Use tokenizer for Chinese + tokens = [*jieba.cut(text_without_punctuation)] + # Provide path to CEDICT - dictionary database that provides English definitions for Chinese characters + cedict_path = "cedict_1_0_ts_utf-8_mdbg.txt" + epi = epitran.Epitran(language_code, cedict_file=cedict_path) + result = [epi.transliterate(token) for token in tokens] + return {"tokens": tokens, + "phonetic_transcriptions": result} + + if language_group == 'Arab': + # Tokenize Arabic script using regex + tokens = re.findall(r'[\u0600-\u06FF]+', text) + elif language_group == 'Beng': + # Tokenize Bengali script using regex + tokens = re.findall(r'[\u0980-\u09FF]+', text) + elif language_group == 'Cyrl': + # Tokenize Cyrillic script using regex + tokens = re.findall(r'[\u0400-\u04FF]+', text) + elif language_group == 'Deva': + # Tokenize Devanagari script using regex + tokens = re.findall(r'[\u0900-\u0963\u0966-\u097F]+', text) + elif language_group == 'Ethi': + # Tokenize Ethiopic script using regex + tokens = re.findall(r'[\u1200-\u137F]+', text) + elif language_group == 'Guru': + # Tokenize Gurmukhi script using regex + tokens = re.findall(r'[\u0A00-\u0A7F]+', text) + elif language_group == 'Khmr': + # 
Tokenize Khmer script using regex + tokens = re.findall(r'[\u1780-\u17FF]+', text) + elif language_group == 'Laoo': + # Tokenize Lao script using regex + tokens = re.findall(r'[\u0E80-\u0EFF]+', text) + elif language_group == 'Latn': + # Tokenize Latin script using + tokens = re.findall(r'\b\w+\b', text) + elif language_group == 'Mlym': + # Tokenize Malayalam script using + tokens = re.findall(r'[\u0D00-\u0D7F]+', text) + elif language_group == 'Mymr': + # Tokenize Burmese script using regex + tokens = re.findall(r'[\u1000-\u109F]+', text) + elif language_group == 'Orya': + # Tokenize Oriya script using regex + tokens = re.findall(r'[\u0B00-\u0B7F]+', text) + elif language_group == 'Sinh': + # Tokenize Sinhala script using regex + tokens = re.findall(r'[\u0D80-\u0DFF]+', text) + elif language_group == 'Syrc': + # Tokenize Syriac script using regex + tokens = re.findall(r'[\u0700-\u074F]+', text) + elif language_group == 'Taml': + # Tokenize Tamil script using regex + tokens = re.findall(r'[\u0B80-\u0BFF]+', text) + elif language_group == 'Telu': + # Tokenize Telugu script using regex + tokens = re.findall(r'[\u0C00-\u0C7F]+', text) + elif language_group == 'Thai': + # Tokenize Thai script using regex + tokens = re.findall(r'[\u0E00-\u0E7F]+', text) + + epi = epitran.Epitran(language_code) + result = [epi.transliterate(token) for token in tokens] + return {"tokens": tokens, + "phonetic_transcriptions": result} + +``` diff --git a/generators/phonetic_transcription/code_snippet_common.md b/generators/phonetic_transcription/code_snippet_common.md new file mode 100644 index 00000000..022cd7f4 --- /dev/null +++ b/generators/phonetic_transcription/code_snippet_common.md @@ -0,0 +1,94 @@ +```python +import epitran +import jieba +import re +from typing import Dict, List + +def phonetic_transcriptor(text: str, language_code: str) -> Dict[str, List[str]]: + # Tokenize based on language group + language_group = language_code.split('-')[1] + # Handle the case of Chinese 
lanugage separatly + if language_code in ('cmn-Hans', 'cmn-Hant'): + text_without_punctuation = ''.join(re.findall(r'[\u4e00-\u9fff\u3400-\u4dbf\u20000-\u2a6df]+', text)) + # Use tokenizer for Chinese + tokens = [*jieba.cut(text_without_punctuation)] + # Provide path to CEDICT - dictionary database that provides English definitions for Chinese characters + cedict_path = "cedict_1_0_ts_utf-8_mdbg.txt" + epi = epitran.Epitran(language_code, cedict_file=cedict_path) + result = [epi.transliterate(token) for token in tokens] + return {"tokens": tokens, + "phonetic_transcriptions": result} + + if language_group == 'Arab': + # Tokenize Arabic script using regex + tokens = re.findall(r'[\u0600-\u06FF]+', text) + elif language_group == 'Beng': + # Tokenize Bengali script using regex + tokens = re.findall(r'[\u0980-\u09FF]+', text) + elif language_group == 'Cyrl': + # Tokenize Cyrillic script using regex + tokens = re.findall(r'[\u0400-\u04FF]+', text) + elif language_group == 'Deva': + # Tokenize Devanagari script using regex + tokens = re.findall(r'[\u0900-\u0963\u0966-\u097F]+', text) + elif language_group == 'Ethi': + # Tokenize Ethiopic script using regex + tokens = re.findall(r'[\u1200-\u137F]+', text) + elif language_group == 'Guru': + # Tokenize Gurmukhi script using regex + tokens = re.findall(r'[\u0A00-\u0A7F]+', text) + elif language_group == 'Khmr': + # Tokenize Khmer script using regex + tokens = re.findall(r'[\u1780-\u17FF]+', text) + elif language_group == 'Laoo': + # Tokenize Lao script using regex + tokens = re.findall(r'[\u0E80-\u0EFF]+', text) + elif language_group == 'Latn': + # Tokenize Latin script using + tokens = re.findall(r'\b\w+\b', text) + elif language_group == 'Mlym': + # Tokenize Malayalam script using + tokens = re.findall(r'[\u0D00-\u0D7F]+', text) + elif language_group == 'Mymr': + # Tokenize Burmese script using regex + tokens = re.findall(r'[\u1000-\u109F]+', text) + elif language_group == 'Orya': + # Tokenize Oriya script using regex + 
tokens = re.findall(r'[\u0B00-\u0B7F]+', text) + elif language_group == 'Sinh': + # Tokenize Sinhala script using regex + tokens = re.findall(r'[\u0D80-\u0DFF]+', text) + elif language_group == 'Syrc': + # Tokenize Syriac script using regex + tokens = re.findall(r'[\u0700-\u074F]+', text) + elif language_group == 'Taml': + # Tokenize Tamil script using regex + tokens = re.findall(r'[\u0B80-\u0BFF]+', text) + elif language_group == 'Telu': + # Tokenize Telugu script using regex + tokens = re.findall(r'[\u0C00-\u0C7F]+', text) + elif language_group == 'Thai': + # Tokenize Thai script using regex + tokens = re.findall(r'[\u0E00-\u0E7F]+', text) + + epi = epitran.Epitran(language_code) + result = [epi.transliterate(token) for token in tokens] + return {"tokens": tokens, + "phonetic_transcriptions": result} + +# ↑ necessary bricks function +# ----------------------------------------------------------------------------------------- +# ↓ example implementation + +def example_integration(): + texts = [ + "Clouds drift across the clear blue sky.", + "Wolken ziehen über den klaren blauen Himmel.", + "云彩在晴朗的蓝天中飘过。", + "बादल साफ़ नीले आकाश में तैरते हैं।"] + language_codes = ["eng-Latn", "deu-Latn-np", "cmn-Hans", "hin-Deva"] + for text, language_code in zip(texts, language_codes): + print(f"Phonetic transcription for \"{text}\" is: {phonetic_transcriptor(text, language_code)}") + +example_integration() +``` diff --git a/generators/phonetic_transcription/code_snippet_refinery.md b/generators/phonetic_transcription/code_snippet_refinery.md new file mode 100644 index 00000000..2c29922d --- /dev/null +++ b/generators/phonetic_transcription/code_snippet_refinery.md @@ -0,0 +1,79 @@ +```python +ATTRIBUTE: str = "text" +LANGUAGE_CODE: str = "eng-Latn" + +def phonetic_transcriptor(record): + + text = record[ATTRIBUTE].text + # Tokenize based on language group + language_group = LANGUAGE_CODE.split('-')[1] + # Handle the case of Chinese language separately + if LANGUAGE_CODE in 
('cmn-Hans', 'cmn-Hant'): + text_without_punctuation = ''.join(re.findall(r'[\u4e00-\u9fff\u3400-\u4dbf\u20000-\u2a6df]+', text)) + # Use tokenizer for Chinese + tokens = [*jieba.cut(text_without_punctuation)] + # Provide path to CEDICT - dictionary database that provides English definitions for Chinese characters + cedict_path = "cedict_1_0_ts_utf-8_mdbg.txt" + epi = epitran.Epitran(LANGUAGE_CODE, cedict_file=cedict_path) + result = [epi.transliterate(token) for token in tokens] + return {"tokens": tokens, + "phonetic_transcriptions": result} + + if language_group == 'Arab': + # Tokenize Arabic script using regex + tokens = re.findall(r'[\u0600-\u06FF]+', text) + elif language_group == 'Beng': + # Tokenize Bengali script using regex + tokens = re.findall(r'[\u0980-\u09FF]+', text) + elif language_group == 'Cyrl': + # Tokenize Cyrillic script using regex + tokens = re.findall(r'[\u0400-\u04FF]+', text) + elif language_group == 'Deva': + # Tokenize Devanagari script using regex + tokens = re.findall(r'[\u0900-\u0963\u0966-\u097F]+', text) + elif language_group == 'Ethi': + # Tokenize Ethiopic script using regex + tokens = re.findall(r'[\u1200-\u137F]+', text) + elif language_group == 'Guru': + # Tokenize Gurmukhi script using regex + tokens = re.findall(r'[\u0A00-\u0A7F]+', text) + elif language_group == 'Khmr': + # Tokenize Khmer script using regex + tokens = re.findall(r'[\u1780-\u17FF]+', text) + elif language_group == 'Laoo': + # Tokenize Lao script using regex + tokens = re.findall(r'[\u0E80-\u0EFF]+', text) + elif language_group == 'Latn': + # Tokenize Latin script using + tokens = re.findall(r'\b\w+\b', text) + elif language_group == 'Mlym': + # Tokenize Malayalam script using + tokens = re.findall(r'[\u0D00-\u0D7F]+', text) + elif language_group == 'Mymr': + # Tokenize Burmese script using regex + tokens = re.findall(r'[\u1000-\u109F]+', text) + elif language_group == 'Orya': + # Tokenize Oriya script using regex + tokens = re.findall(r'[\u0B00-\u0B7F]+', 
text) + elif language_group == 'Sinh': + # Tokenize Sinhala script using regex + tokens = re.findall(r'[\u0D80-\u0DFF]+', text) + elif language_group == 'Syrc': + # Tokenize Syriac script using regex + tokens = re.findall(r'[\u0700-\u074F]+', text) + elif language_group == 'Taml': + # Tokenize Tamil script using regex + tokens = re.findall(r'[\u0B80-\u0BFF]+', text) + elif language_group == 'Telu': + # Tokenize Telugu script using regex + tokens = re.findall(r'[\u0C00-\u0C7F]+', text) + elif language_group == 'Thai': + # Tokenize Thai script using regex + tokens = re.findall(r'[\u0E00-\u0E7F]+', text) + + epi = epitran.Epitran(LANGUAGE_CODE) + result = [epi.transliterate(token) for token in tokens] + return {"tokens": tokens, + "phonetic_transcriptions": result} + +``` diff --git a/generators/phonetic_transcription/config.py b/generators/phonetic_transcription/config.py new file mode 100644 index 00000000..ab611aa7 --- /dev/null +++ b/generators/phonetic_transcription/config.py @@ -0,0 +1,33 @@ +from util.configs import build_generator_function_config +from util.enums import State, RefineryDataType, BricksVariableType, SelectionType +from . 
import spacy_lemmatizer, INPUT_EXAMPLE + + +def get_config(): + return build_generator_function_config( + function=spacy_lemmatizer, + input_example=INPUT_EXAMPLE, + issue_id=228, + tabler_icon="Transform", + min_refinery_version="1.7.0", + state=State.PUBLIC.value, + type="python_function", + available_for=["refinery", "common"], + part_of_group=[ + "lemmatizer", + ], # first entry should be parent directory + # bricks integrator information + integrator_inputs={ + "name": "spacy_lemmatizer", + "refineryDataType": RefineryDataType.TEXT.value, + "variables": { + "ATTRIBUTE": { + "selectionType": SelectionType.CHOICE.value, + "addInfo": [ + BricksVariableType.ATTRIBUTE.value, + BricksVariableType.GENERIC_STRING.value, + ], + } + }, + }, + ) From b5ac38507f8731e068dc37716e772fe2c30707c5 Mon Sep 17 00:00:00 2001 From: springlaughing Date: Thu, 8 Feb 2024 20:50:57 +0100 Subject: [PATCH 2/4] Added phonetic_transcriptor, init, backup, common, refinery, config, readme files. This will be bricks issue 278. 
--- generators/phonetic_transcription/README.md | 1 - .../epitran_phonetic_transcriptor/README.md | 12 ++++++++++++ .../__init__.py | 0 .../code_snippet_backup.md | 0 .../code_snippet_common.md | 17 +++++++++++++++++ .../code_snippet_refinery.md | 13 +++++++++---- .../config.py | 18 ++++++++++++------ 7 files changed, 50 insertions(+), 11 deletions(-) delete mode 100644 generators/phonetic_transcription/README.md create mode 100644 generators/phonetic_transcription/epitran_phonetic_transcriptor/README.md rename generators/phonetic_transcription/{ => epitran_phonetic_transcriptor}/__init__.py (100%) rename generators/phonetic_transcription/{ => epitran_phonetic_transcriptor}/code_snippet_backup.md (100%) rename generators/phonetic_transcription/{ => epitran_phonetic_transcriptor}/code_snippet_common.md (80%) rename generators/phonetic_transcription/{ => epitran_phonetic_transcriptor}/code_snippet_refinery.md (92%) rename generators/phonetic_transcription/{ => epitran_phonetic_transcriptor}/config.py (66%) diff --git a/generators/phonetic_transcription/README.md b/generators/phonetic_transcription/README.md deleted file mode 100644 index 92bb4639..00000000 --- a/generators/phonetic_transcription/README.md +++ /dev/null @@ -1 +0,0 @@ -Reduces all tokens in a text to their base form with the use of a vocabulary and morphological analysis of the tokens. Uses a spaCy model, see official documentation here: https://spacy.io/api/lemmatizer \ No newline at end of file diff --git a/generators/phonetic_transcription/epitran_phonetic_transcriptor/README.md b/generators/phonetic_transcription/epitran_phonetic_transcriptor/README.md new file mode 100644 index 00000000..4589e368 --- /dev/null +++ b/generators/phonetic_transcription/epitran_phonetic_transcriptor/README.md @@ -0,0 +1,12 @@ + +This epitran-based phonetic transcriptor generates phonetic transcriptions of words in a given text. 
Additionally, it utilizes the CEDICT dictionary database, which provides English definitions for Chinese characters, and the jieba tokenizer for the Chinese language. + +Dependencies: +- Epitran: Library for transliterating orthographic text to IPA (International Phonetic Alphabet). Link: https://github.com/dmort27/epitran +- CEDICT: Comprehensive English-Chinese dictionary database. Link: https://www.mdbg.net/chinese/dictionary?page=cedict +- Jieba: Chinese text segmentation tool. Link: https://github.com/fxsjy/jieba + +Usage: +To generate phonetic transcriptions, provide the text and a language code in the format of ISO 639-3 language codes and ISO 15924 script codes (e.g., "eng-Latn"). Supported languages and code formats can be found in the documentation for the epitran library: (https://github.com/dmort27/epitran?tab=readme-ov-file#language-support). + + diff --git a/generators/phonetic_transcription/__init__.py b/generators/phonetic_transcription/epitran_phonetic_transcriptor/__init__.py similarity index 100% rename from generators/phonetic_transcription/__init__.py rename to generators/phonetic_transcription/epitran_phonetic_transcriptor/__init__.py diff --git a/generators/phonetic_transcription/code_snippet_backup.md b/generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_backup.md similarity index 100% rename from generators/phonetic_transcription/code_snippet_backup.md rename to generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_backup.md diff --git a/generators/phonetic_transcription/code_snippet_common.md b/generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_common.md similarity index 80% rename from generators/phonetic_transcription/code_snippet_common.md rename to generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_common.md index 022cd7f4..58b27b9f 100644 --- a/generators/phonetic_transcription/code_snippet_common.md +++ 
b/generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_common.md @@ -5,6 +5,23 @@ import re from typing import Dict, List def phonetic_transcriptor(text: str, language_code: str) -> Dict[str, List[str]]: + """ + Generate phonetic transcription of words in the given text. + + @param text: Text for which to generate phonetic transcription. + @param language_code: language codes formatted as a combination of ISO 639-3 language codes and ISO 15924 script codes, e.g. "eng-Latn". Supported languages and code formats are as in epitran library this function relies on (https://github.com/dmort27/epitran?tab=readme-ov-file#language-support). + + @return: A dictionary with two keys: + - "tokens": a list of words (or tokens) extracted from the input text. + - "phonetic_transcriptions": a corresponding list of phonetic transcriptions for each token. + + Example: + >>> phonetic_transcriptor("hello world", "eng-Latn") + {'tokens': ['hello', 'world'], 'phonetic_transcriptions': ['həˈloʊ', 'wɜrld']} + + Note: Unsupported language codes or invalid inputs may result in errors or empty transcriptions. + Performance may vary based on text length and complexity. 
+ """ # Tokenize based on language group language_group = language_code.split('-')[1] # Handle the case of Chinese lanugage separatly diff --git a/generators/phonetic_transcription/code_snippet_refinery.md b/generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_refinery.md similarity index 92% rename from generators/phonetic_transcription/code_snippet_refinery.md rename to generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_refinery.md index 2c29922d..a3433652 100644 --- a/generators/phonetic_transcription/code_snippet_refinery.md +++ b/generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_refinery.md @@ -1,4 +1,9 @@ ```python +import epitran +import jieba +import re +import json + ATTRIBUTE: str = "text" LANGUAGE_CODE: str = "eng-Latn" @@ -16,8 +21,8 @@ def phonetic_transcriptor(record): cedict_path = "cedict_1_0_ts_utf-8_mdbg.txt" epi = epitran.Epitran(LANGUAGE_CODE, cedict_file=cedict_path) result = [epi.transliterate(token) for token in tokens] - return {"tokens": tokens, - "phonetic_transcriptions": result} + return json.dumps({"tokens": tokens, + "phonetic_transcriptions": result}, ensure_ascii=False) if language_group == 'Arab': # Tokenize Arabic script using regex @@ -73,7 +78,7 @@ def phonetic_transcriptor(record): epi = epitran.Epitran(LANGUAGE_CODE) result = [epi.transliterate(token) for token in tokens] - return {"tokens": tokens, - "phonetic_transcriptions": result} + return json.dumps({"tokens": tokens, + "phonetic_transcriptions": result}, ensure_ascii=False) ``` diff --git a/generators/phonetic_transcription/config.py b/generators/phonetic_transcription/epitran_phonetic_transcriptor/config.py similarity index 66% rename from generators/phonetic_transcription/config.py rename to generators/phonetic_transcription/epitran_phonetic_transcriptor/config.py index ab611aa7..30dfffce 100644 --- a/generators/phonetic_transcription/config.py +++ 
b/generators/phonetic_transcription/epitran_phonetic_transcriptor/config.py @@ -1,24 +1,24 @@ from util.configs import build_generator_function_config from util.enums import State, RefineryDataType, BricksVariableType, SelectionType -from . import spacy_lemmatizer, INPUT_EXAMPLE +from . import phonetic_transcriptor, INPUT_EXAMPLE def get_config(): return build_generator_function_config( - function=spacy_lemmatizer, + function=phonetic_transcriptor, input_example=INPUT_EXAMPLE, - issue_id=228, - tabler_icon="Transform", + issue_id=278, + tabler_icon="AlphabetGreek", min_refinery_version="1.7.0", state=State.PUBLIC.value, type="python_function", available_for=["refinery", "common"], part_of_group=[ - "lemmatizer", + "phonetic_transcription", ], # first entry should be parent directory # bricks integrator information integrator_inputs={ - "name": "spacy_lemmatizer", + "name": "phonetic_transcriptor", "refineryDataType": RefineryDataType.TEXT.value, "variables": { "ATTRIBUTE": { @@ -27,6 +27,12 @@ def get_config(): BricksVariableType.ATTRIBUTE.value, BricksVariableType.GENERIC_STRING.value, ], + }, + "LANGUAGE_CODE": { + "selectionType": SelectionType.STRING.value, + "addInfo": [ + BricksVariableType.GENERIC_STRING.value + ] } }, }, From 3ef8879d2b495b0a22b4966e86b694547b13c3b5 Mon Sep 17 00:00:00 2001 From: springlaughing Date: Thu, 8 Feb 2024 21:18:56 +0100 Subject: [PATCH 3/4] Tested with FastAPI. 
--- generators/__init__.py | 5 ++++- .../README.md | 0 .../__init__.py | 4 ++-- .../code_snippet_backup.md | 0 .../code_snippet_common.md | 0 .../code_snippet_refinery.md | 0 .../config.py | 0 7 files changed, 6 insertions(+), 3 deletions(-) rename generators/phonetic_transcription/{epitran_phonetic_transcriptor => phonetic_transcriptor}/README.md (100%) rename generators/phonetic_transcription/{epitran_phonetic_transcriptor => phonetic_transcriptor}/__init__.py (97%) rename generators/phonetic_transcription/{epitran_phonetic_transcriptor => phonetic_transcriptor}/code_snippet_backup.md (100%) rename generators/phonetic_transcription/{epitran_phonetic_transcriptor => phonetic_transcriptor}/code_snippet_common.md (100%) rename generators/phonetic_transcription/{epitran_phonetic_transcriptor => phonetic_transcriptor}/code_snippet_refinery.md (100%) rename generators/phonetic_transcription/{epitran_phonetic_transcriptor => phonetic_transcriptor}/config.py (100%) diff --git a/generators/__init__.py b/generators/__init__.py index 43db4b7b..b3cb7845 100644 --- a/generators/__init__.py +++ b/generators/__init__.py @@ -69,6 +69,8 @@ nltk_ngram_generator ) +from .phonetic_transcription import phonetic_transcriptor + router = APIRouter() for module in [ @@ -106,7 +108,8 @@ newline_splitter, tiktoken_token_counter, noun_splitter, - nltk_ngram_generator, + nltk_ngram_generator, + phonetic_transcriptor ]: module_name = module.__name__.split(".")[-1] model_name = ( diff --git a/generators/phonetic_transcription/epitran_phonetic_transcriptor/README.md b/generators/phonetic_transcription/phonetic_transcriptor/README.md similarity index 100% rename from generators/phonetic_transcription/epitran_phonetic_transcriptor/README.md rename to generators/phonetic_transcription/phonetic_transcriptor/README.md diff --git a/generators/phonetic_transcription/epitran_phonetic_transcriptor/__init__.py b/generators/phonetic_transcription/phonetic_transcriptor/__init__.py similarity index 97% 
rename from generators/phonetic_transcription/epitran_phonetic_transcriptor/__init__.py rename to generators/phonetic_transcription/phonetic_transcriptor/__init__.py index 7dfa9e06..b678440f 100644 --- a/generators/phonetic_transcription/epitran_phonetic_transcriptor/__init__.py +++ b/generators/phonetic_transcription/phonetic_transcriptor/__init__.py @@ -7,7 +7,7 @@ "language_code": "eng-Latn"} -class PhoneticTranscription(BaseModel): +class PhoneticTranscriptorModel(BaseModel): text: str language_code: str @@ -15,7 +15,7 @@ class Config: schema_extra = {"example": INPUT_EXAMPLE} -def phonetic_transcriptor(req: PhoneticTranscription): +def phonetic_transcriptor(req: PhoneticTranscriptorModel): """Generates phonetic transcription of each word from a given text. """ text = req.text language_code = req.language_code diff --git a/generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_backup.md b/generators/phonetic_transcription/phonetic_transcriptor/code_snippet_backup.md similarity index 100% rename from generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_backup.md rename to generators/phonetic_transcription/phonetic_transcriptor/code_snippet_backup.md diff --git a/generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_common.md b/generators/phonetic_transcription/phonetic_transcriptor/code_snippet_common.md similarity index 100% rename from generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_common.md rename to generators/phonetic_transcription/phonetic_transcriptor/code_snippet_common.md diff --git a/generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_refinery.md b/generators/phonetic_transcription/phonetic_transcriptor/code_snippet_refinery.md similarity index 100% rename from generators/phonetic_transcription/epitran_phonetic_transcriptor/code_snippet_refinery.md rename to 
generators/phonetic_transcription/phonetic_transcriptor/code_snippet_refinery.md diff --git a/generators/phonetic_transcription/epitran_phonetic_transcriptor/config.py b/generators/phonetic_transcription/phonetic_transcriptor/config.py similarity index 100% rename from generators/phonetic_transcription/epitran_phonetic_transcriptor/config.py rename to generators/phonetic_transcription/phonetic_transcriptor/config.py From bf2c4a3382c7f631ad119233b401f04ef851b2f2 Mon Sep 17 00:00:00 2001 From: springlaughing Date: Thu, 8 Feb 2024 21:37:00 +0100 Subject: [PATCH 4/4] Added requirements. --- requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9f557191..1b147465 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,4 +28,6 @@ textacy==0.12.0 scikit-optimize==0.9.0 holidays==0.21.13 sumy==0.11.0 -tiktoken==0.4.0 \ No newline at end of file +tiktoken==0.4.0 +epitran==1.24 +jieba==0.42.1 \ No newline at end of file