
Phonetic transcription #395

Open · wants to merge 4 commits into main
5 changes: 4 additions & 1 deletion generators/__init__.py
@@ -69,6 +69,8 @@
nltk_ngram_generator
)

from .phonetic_transcription import phonetic_transcriptor

router = APIRouter()

for module in [
@@ -106,7 +108,8 @@
newline_splitter,
tiktoken_token_counter,
noun_splitter,
    nltk_ngram_generator,
phonetic_transcriptor
]:
module_name = module.__name__.split(".")[-1]
model_name = (
12 changes: 12 additions & 0 deletions generators/phonetic_transcription/phonetic_transcriptor/README.md
@@ -0,0 +1,12 @@

This epitran-based phonetic transcriptor generates phonetic transcriptions of the words in a given text. For Chinese, it additionally uses the CEDICT dictionary database, which provides English definitions for Chinese characters, together with the jieba tokenizer.

Dependencies:
- Epitran: Library for transliterating orthographic text to IPA (International Phonetic Alphabet). Link: https://github.com/dmort27/epitran
- CEDICT: Comprehensive English-Chinese dictionary database. Link: https://www.mdbg.net/chinese/dictionary?page=cedict
- Jieba: Chinese text segmentation tool. Link: https://github.com/fxsjy/jieba

Usage:
To generate phonetic transcriptions, provide the text and a language code that combines an ISO 639-3 language code with an ISO 15924 script code (e.g., "eng-Latn"). Supported languages and code formats are listed in the epitran documentation: https://github.com/dmort27/epitran?tab=readme-ov-file#language-support.
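For orientation, here is a minimal sketch of the underlying epitran call, assuming epitran is installed (English support additionally requires epitran's flite/lex_lookup backend, as described in the epitran README):

```python
import epitran

# Transliterate a single English word to IPA (a sketch, not the service code).
epi = epitran.Epitran("eng-Latn")
print(epi.transliterate("stream"))  # prints an IPA string for "stream"
```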


@@ -0,0 +1,92 @@
from pydantic import BaseModel
import epitran
import jieba
import re

INPUT_EXAMPLE = {"text": "Bright violets grow along the stream.",
"language_code": "eng-Latn"}


class PhoneticTranscriptorModel(BaseModel):
text: str
language_code: str

class Config:
schema_extra = {"example": INPUT_EXAMPLE}


def phonetic_transcriptor(req: PhoneticTranscriptorModel):
"""Generates phonetic transcription of each word from a given text. """
text = req.text
language_code = req.language_code
    # Extract the ISO 15924 script code (e.g. "Latn") to pick a tokenizer
    language_group = language_code.split('-')[1]
    # Handle Chinese separately: it needs jieba for tokenization and CEDICT for lookups
if language_code in ('cmn-Hans', 'cmn-Hant'):
        # Keep only CJK characters (\U escapes are required for code points above U+FFFF)
        text_without_punctuation = ''.join(re.findall(r'[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002A6DF]+', text))
        # Use the jieba tokenizer for Chinese
        tokens = list(jieba.cut(text_without_punctuation))
# Provide path to CEDICT - dictionary database that provides English definitions for Chinese characters
cedict_path = "cedict_1_0_ts_utf-8_mdbg.txt"
epi = epitran.Epitran(language_code, cedict_file=cedict_path)
result = [epi.transliterate(token) for token in tokens]
return {"tokens": tokens,
"phonetic_transcriptions": result}

if language_group == 'Arab':
# Tokenize Arabic script using regex
tokens = re.findall(r'[\u0600-\u06FF]+', text)
elif language_group == 'Beng':
# Tokenize Bengali script using regex
tokens = re.findall(r'[\u0980-\u09FF]+', text)
elif language_group == 'Cyrl':
# Tokenize Cyrillic script using regex
tokens = re.findall(r'[\u0400-\u04FF]+', text)
elif language_group == 'Deva':
# Tokenize Devanagari script using regex
tokens = re.findall(r'[\u0900-\u0963\u0966-\u097F]+', text)
elif language_group == 'Ethi':
# Tokenize Ethiopic script using regex
tokens = re.findall(r'[\u1200-\u137F]+', text)
elif language_group == 'Guru':
# Tokenize Gurmukhi script using regex
tokens = re.findall(r'[\u0A00-\u0A7F]+', text)
elif language_group == 'Khmr':
# Tokenize Khmer script using regex
tokens = re.findall(r'[\u1780-\u17FF]+', text)
elif language_group == 'Laoo':
# Tokenize Lao script using regex
tokens = re.findall(r'[\u0E80-\u0EFF]+', text)
elif language_group == 'Latn':
        # Tokenize Latin script using regex
tokens = re.findall(r'\b\w+\b', text)
elif language_group == 'Mlym':
        # Tokenize Malayalam script using regex
tokens = re.findall(r'[\u0D00-\u0D7F]+', text)
elif language_group == 'Mymr':
# Tokenize Burmese script using regex
tokens = re.findall(r'[\u1000-\u109F]+', text)
elif language_group == 'Orya':
# Tokenize Oriya script using regex
tokens = re.findall(r'[\u0B00-\u0B7F]+', text)
elif language_group == 'Sinh':
# Tokenize Sinhala script using regex
tokens = re.findall(r'[\u0D80-\u0DFF]+', text)
elif language_group == 'Syrc':
# Tokenize Syriac script using regex
tokens = re.findall(r'[\u0700-\u074F]+', text)
elif language_group == 'Taml':
# Tokenize Tamil script using regex
tokens = re.findall(r'[\u0B80-\u0BFF]+', text)
elif language_group == 'Telu':
# Tokenize Telugu script using regex
tokens = re.findall(r'[\u0C00-\u0C7F]+', text)
    elif language_group == 'Thai':
        # Tokenize Thai script using regex
        tokens = re.findall(r'[\u0E00-\u0E7F]+', text)
    else:
        # Fall back to generic word tokenization so `tokens` is always defined
        tokens = re.findall(r'\b\w+\b', text)

epi = epitran.Epitran(language_code)
result = [epi.transliterate(token) for token in tokens]
return {"tokens": tokens,
"phonetic_transcriptions": result}

@@ -0,0 +1,90 @@
```python
import epitran
import jieba
import re

# replace this list with a list containing your data
text = ["A gentle breeze sways the golden wheat."]

# add the text to a dict called record. Add further information as key-value pairs if needed
record = {
"text": text,
"language_code": "eng-Latn"
}

def phonetic_transcriptor(record):

text = record["text"][0]
language_code = record["language_code"]
    # Extract the ISO 15924 script code (e.g. "Latn") to pick a tokenizer
    language_group = language_code.split('-')[1]
    # Handle Chinese separately: it needs jieba for tokenization and CEDICT for lookups
if language_code in ('cmn-Hans', 'cmn-Hant'):
        # Keep only CJK characters (\U escapes are required for code points above U+FFFF)
        text_without_punctuation = ''.join(re.findall(r'[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002A6DF]+', text))
        # Use the jieba tokenizer for Chinese
        tokens = list(jieba.cut(text_without_punctuation))
# Provide path to CEDICT - dictionary database that provides English definitions for Chinese characters
cedict_path = "cedict_1_0_ts_utf-8_mdbg.txt"
epi = epitran.Epitran(language_code, cedict_file=cedict_path)
result = [epi.transliterate(token) for token in tokens]
return {"tokens": tokens,
"phonetic_transcriptions": result}

if language_group == 'Arab':
# Tokenize Arabic script using regex
tokens = re.findall(r'[\u0600-\u06FF]+', text)
elif language_group == 'Beng':
# Tokenize Bengali script using regex
tokens = re.findall(r'[\u0980-\u09FF]+', text)
elif language_group == 'Cyrl':
# Tokenize Cyrillic script using regex
tokens = re.findall(r'[\u0400-\u04FF]+', text)
elif language_group == 'Deva':
# Tokenize Devanagari script using regex
tokens = re.findall(r'[\u0900-\u0963\u0966-\u097F]+', text)
elif language_group == 'Ethi':
# Tokenize Ethiopic script using regex
tokens = re.findall(r'[\u1200-\u137F]+', text)
elif language_group == 'Guru':
# Tokenize Gurmukhi script using regex
tokens = re.findall(r'[\u0A00-\u0A7F]+', text)
elif language_group == 'Khmr':
# Tokenize Khmer script using regex
tokens = re.findall(r'[\u1780-\u17FF]+', text)
elif language_group == 'Laoo':
# Tokenize Lao script using regex
tokens = re.findall(r'[\u0E80-\u0EFF]+', text)
elif language_group == 'Latn':
        # Tokenize Latin script using regex
tokens = re.findall(r'\b\w+\b', text)
elif language_group == 'Mlym':
        # Tokenize Malayalam script using regex
tokens = re.findall(r'[\u0D00-\u0D7F]+', text)
elif language_group == 'Mymr':
# Tokenize Burmese script using regex
tokens = re.findall(r'[\u1000-\u109F]+', text)
elif language_group == 'Orya':
# Tokenize Oriya script using regex
tokens = re.findall(r'[\u0B00-\u0B7F]+', text)
elif language_group == 'Sinh':
# Tokenize Sinhala script using regex
tokens = re.findall(r'[\u0D80-\u0DFF]+', text)
elif language_group == 'Syrc':
# Tokenize Syriac script using regex
tokens = re.findall(r'[\u0700-\u074F]+', text)
elif language_group == 'Taml':
# Tokenize Tamil script using regex
tokens = re.findall(r'[\u0B80-\u0BFF]+', text)
elif language_group == 'Telu':
# Tokenize Telugu script using regex
tokens = re.findall(r'[\u0C00-\u0C7F]+', text)
    elif language_group == 'Thai':
        # Tokenize Thai script using regex
        tokens = re.findall(r'[\u0E00-\u0E7F]+', text)
    else:
        # Fall back to generic word tokenization so `tokens` is always defined
        tokens = re.findall(r'\b\w+\b', text)

epi = epitran.Epitran(language_code)
result = [epi.transliterate(token) for token in tokens]
return {"tokens": tokens,
"phonetic_transcriptions": result}

```
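The same snippet handles Chinese once the CEDICT file is available; a sketch of the corresponding record (the file name matches the path hard-coded above):

```python
# Hypothetical Chinese record for the snippet above. Requires the CEDICT file
# cedict_1_0_ts_utf-8_mdbg.txt in the working directory.
record_cmn = {
    "text": ["云彩在晴朗的蓝天中飘过。"],
    "language_code": "cmn-Hans",
}
```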
@@ -0,0 +1,111 @@
```python
import epitran
import jieba
import re
from typing import Dict, List

def phonetic_transcriptor(text: str, language_code: str) -> Dict[str, List[str]]:
"""
Generate phonetic transcription of words in the given text.

@param text: Text for which to generate phonetic transcription.
    @param language_code: Language code formatted as a combination of an ISO 639-3 language code and an ISO 15924 script code, e.g. "eng-Latn". Supported languages and code formats are those of the epitran library this function relies on (https://github.com/dmort27/epitran?tab=readme-ov-file#language-support).

@return: A dictionary with two keys:
- "tokens": a list of words (or tokens) extracted from the input text.
- "phonetic_transcriptions": a corresponding list of phonetic transcriptions for each token.

Example:
>>> phonetic_transcriptor("hello world", "eng-Latn")
{'tokens': ['hello', 'world'], 'phonetic_transcriptions': ['həˈloʊ', 'wɜrld']}

Note: Unsupported language codes or invalid inputs may result in errors or empty transcriptions.
Performance may vary based on text length and complexity.
"""
    # Extract the ISO 15924 script code (e.g. "Latn") to pick a tokenizer
    language_group = language_code.split('-')[1]
    # Handle Chinese separately: it needs jieba for tokenization and CEDICT for lookups
if language_code in ('cmn-Hans', 'cmn-Hant'):
        # Keep only CJK characters (\U escapes are required for code points above U+FFFF)
        text_without_punctuation = ''.join(re.findall(r'[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002A6DF]+', text))
        # Use the jieba tokenizer for Chinese
        tokens = list(jieba.cut(text_without_punctuation))
# Provide path to CEDICT - dictionary database that provides English definitions for Chinese characters
cedict_path = "cedict_1_0_ts_utf-8_mdbg.txt"
epi = epitran.Epitran(language_code, cedict_file=cedict_path)
result = [epi.transliterate(token) for token in tokens]
return {"tokens": tokens,
"phonetic_transcriptions": result}

if language_group == 'Arab':
# Tokenize Arabic script using regex
tokens = re.findall(r'[\u0600-\u06FF]+', text)
elif language_group == 'Beng':
# Tokenize Bengali script using regex
tokens = re.findall(r'[\u0980-\u09FF]+', text)
elif language_group == 'Cyrl':
# Tokenize Cyrillic script using regex
tokens = re.findall(r'[\u0400-\u04FF]+', text)
elif language_group == 'Deva':
# Tokenize Devanagari script using regex
tokens = re.findall(r'[\u0900-\u0963\u0966-\u097F]+', text)
elif language_group == 'Ethi':
# Tokenize Ethiopic script using regex
tokens = re.findall(r'[\u1200-\u137F]+', text)
elif language_group == 'Guru':
# Tokenize Gurmukhi script using regex
tokens = re.findall(r'[\u0A00-\u0A7F]+', text)
elif language_group == 'Khmr':
# Tokenize Khmer script using regex
tokens = re.findall(r'[\u1780-\u17FF]+', text)
elif language_group == 'Laoo':
# Tokenize Lao script using regex
tokens = re.findall(r'[\u0E80-\u0EFF]+', text)
elif language_group == 'Latn':
        # Tokenize Latin script using regex
tokens = re.findall(r'\b\w+\b', text)
elif language_group == 'Mlym':
        # Tokenize Malayalam script using regex
tokens = re.findall(r'[\u0D00-\u0D7F]+', text)
elif language_group == 'Mymr':
# Tokenize Burmese script using regex
tokens = re.findall(r'[\u1000-\u109F]+', text)
elif language_group == 'Orya':
# Tokenize Oriya script using regex
tokens = re.findall(r'[\u0B00-\u0B7F]+', text)
elif language_group == 'Sinh':
# Tokenize Sinhala script using regex
tokens = re.findall(r'[\u0D80-\u0DFF]+', text)
elif language_group == 'Syrc':
# Tokenize Syriac script using regex
tokens = re.findall(r'[\u0700-\u074F]+', text)
elif language_group == 'Taml':
# Tokenize Tamil script using regex
tokens = re.findall(r'[\u0B80-\u0BFF]+', text)
elif language_group == 'Telu':
# Tokenize Telugu script using regex
tokens = re.findall(r'[\u0C00-\u0C7F]+', text)
    elif language_group == 'Thai':
        # Tokenize Thai script using regex
        tokens = re.findall(r'[\u0E00-\u0E7F]+', text)
    else:
        # Fall back to generic word tokenization so `tokens` is always defined
        tokens = re.findall(r'\b\w+\b', text)

epi = epitran.Epitran(language_code)
result = [epi.transliterate(token) for token in tokens]
return {"tokens": tokens,
"phonetic_transcriptions": result}

# ↑ necessary bricks function
# -----------------------------------------------------------------------------------------
# ↓ example implementation

def example_integration():
texts = [
"Clouds drift across the clear blue sky.",
"Wolken ziehen über den klaren blauen Himmel.",
"云彩在晴朗的蓝天中飘过。",
"बादल साफ़ नीले आकाश में तैरते हैं।"]
language_codes = ["eng-Latn", "deu-Latn-np", "cmn-Hans", "hin-Deva"]
    for text, language_code in zip(texts, language_codes):
print(f"Phonetic transcription for \"{text}\" is: {phonetic_transcriptor(text, language_code)}")

example_integration()
```