Commit 52a974e2 authored by Nikhilesh Bhatnagar

Initial implementation.

wavs
ort_models
Fastspeech2_HS
FROM nvcr.io/nvidia/tritonserver:23.10-py3
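# Serve the model repository mounted at /models with a 1 MiB local response cache.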
CMD ["tritonserver", "--model-repository=/models", "--cache-config=local,size=1048576"]
# IIT Madras TTS Triton repo for dhruva
* Make sure `nvidia-docker` is installed.
* Run `bash make_triton_repo.sh`. It assembles the Triton model repository at `triton_model_repo`.
* You may need to change the `tritonserver` container tag to match your driver version.
* Check the notebooks for environment details. We haven't provided an exported conda yml file because the environment pulls and patches the latest espnet git checkout; for that, see the scripts referenced in the notebooks.
* Build the image: `docker build -t dhruva/iitmtts-model-server:1 .`
* Run the container: `nvidia-docker run --gpus=all --rm --shm-size 5g --network=host --name dhruva-iitmtts-triton-server -v ./triton_model_repo:/models dhruva/iitmtts-model-server:1`
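
Once the server is up, a quick way to smoke-test it is a short Triton HTTP client script. This is a minimal sketch: it assumes `tritonclient[http]` is installed and the server is listening on the default HTTP port 8000.

```python
import numpy as np
import tritonclient.http as http_client

client = http_client.InferenceServerClient(url="localhost:8000")

def make_input(name, value):
    # All three inputs are [batch, 1] BYTES tensors.
    tensor = http_client.InferInput(name, [1, 1], "BYTES")
    tensor.set_data_from_numpy(np.array([[value.encode("utf-8")]], dtype=object))
    return tensor

response = client.infer("tts", [
    make_input("INPUT_TEXT", "नमस्ते"),
    make_input("INPUT_SPEAKER_ID", "male"),
    make_input("INPUT_LANGUAGE_ID", "hi"),
])

# The output holds raw int16 PCM at 22050 Hz, one waveform per batch row.
audio = np.frombuffer(response.as_numpy("OUTPUT_GENERATED_AUDIO")[0][0], dtype=np.int16)
print(audio.shape)
```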
### TODO:
* Add support for batched inference.
* Then split the pipeline into a Triton ensemble.
FROM nvcr.io/nvidia/tritonserver:23.10-py3
ARG UID=1000
ARG GID=1000
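# Create a non-root build user matching the host UID/GID so files written to
# the mounted volumes keep host ownership.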
RUN groupadd --system --force --gid ${GID} builder && id -u ${UID} &>/dev/null || useradd --system --gid ${GID} --uid ${UID} builder
RUN pip install -U certifi
RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y cmake build-essential pkg-config libgoogle-perftools-dev unzip rapidjson-dev ca-certificates locales && locale-gen en_US.UTF-8 && update-locale LANG=en_US.UTF-8
USER ${UID}
WORKDIR /home/builder
#!/bin/bash
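# End-to-end repo builder: fetch the IITM Fastspeech2_HS models, convert them
# to ONNX in a builder container, then assemble the Triton model repository.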
git clone --recursive https://github.com/smtiitm/Fastspeech2_HS.git && cd Fastspeech2_HS && git lfs fetch --all && git lfs pull && git apply ../patches/fastspeech2.patch && cd ..
docker build --build-arg="UID=$(id -u)" --build-arg="GID=$(id -g)" -t dhruva/iitm-tts-envbuilder . -f envbuilder.dockerfile
mkdir onnx_models
nvidia-docker run --gpus=all -it --rm --name iitm-tts-dhruva-builder -v ./Fastspeech2_HS:/Fastspeech2_HS -v ./onnx_models:/onnx_models -v ./notebooks:/notebooks -v ./patches:/patches -v ./scripts:/scripts dhruva/iitm-tts-envbuilder bash /scripts/perform_onnx_conversion.sh
mkdir triton_model_repo
nvidia-docker run --gpus=all -it --rm --name iitm-tts-dhruva-builder -v ./patches:/patches -v ./triton_models/tts:/model -v ./triton_model_repo:/model_repo dhruva/iitm-tts-envbuilder bash /model/envbuilder.sh
cp Fastspeech2_HS/text_preprocess_for_inference.py triton_model_repo/tts/1
cp -r triton_models/tts/config.pbtxt triton_models/tts/1 triton_model_repo/tts
cp -r onnx_models/* Fastspeech2_HS/phone_dict Fastspeech2_HS/multilingualcharmap.json triton_model_repo/tts/1
diff --git a/espnet_onnx/tts/abs_tts_model.py b/espnet_onnx/tts/abs_tts_model.py
index 591947a..9de64a5 100644
--- a/espnet_onnx/tts/abs_tts_model.py
+++ b/espnet_onnx/tts/abs_tts_model.py
@@ -86,20 +86,20 @@ class AbsTTSModel(AbsModel):
self._build_normalizer()
self._build_vocoder(providers, use_quantized)
- def _check_ort_version(self, providers: List[str]):
+ def _check_ort_version(self, providers: List):
# check cpu
if (
onnxruntime.get_device() == "CPU"
and "CPUExecutionProvider" not in providers
- ):
- raise RuntimeError(
- "If you want to use GPU, then follow `How to use GPU on espnet_onnx` chapter in readme to install onnxruntime-gpu."
- )
+ ): pass
+# raise RuntimeError(
+# "If you want to use GPU, then follow `How to use GPU on espnet_onnx` chapter in readme to install onnxruntime-gpu."
+# )
# check GPU
- if onnxruntime.get_device() == "GPU" and providers == ["CPUExecutionProvider"]:
- warnings.warn(
- "Inference will be executed on the CPU. Please provide gpu providers. Read `How to use GPU on espnet_onnx` in readme in detail."
- )
+ if onnxruntime.get_device() == "GPU" and providers == ["CPUExecutionProvider"]: pass
+# warnings.warn(
+# "Inference will be executed on the CPU. Please provide gpu providers. Read `How to use GPU on espnet_onnx` in readme in detail."
+# )
- logging.info(f'Providers [{" ,".join(providers)}] detected.')
+# logging.info(f'Providers [{" ,".join(providers)}] detected.')
diff --git a/espnet_onnx/tts/tts_model.py b/espnet_onnx/tts/tts_model.py
index 78023f5..de4ebba 100644
--- a/espnet_onnx/tts/tts_model.py
+++ b/espnet_onnx/tts/tts_model.py
@@ -14,7 +14,7 @@ class Text2Speech(AbsTTSModel):
self,
tag_name: str = None,
model_dir: Union[Path, str] = None,
- providers: List[str] = ["CPUExecutionProvider"],
+ providers: List = ["CPUExecutionProvider"],
use_quantized: bool = False,
):
assert check_argument_types()
diff --git a/espnet_onnx/utils/abs_model.py b/espnet_onnx/utils/abs_model.py
index 1270468..4aa63c6 100644
--- a/espnet_onnx/utils/abs_model.py
+++ b/espnet_onnx/utils/abs_model.py
@@ -46,23 +46,23 @@ class AbsModel(ABC):
def _build_model(self, providers, use_quantized):
raise NotImplementedError
- def _check_ort_version(self, providers: List[str]):
+ def _check_ort_version(self, providers: List):
# check cpu
if (
onnxruntime.get_device() == "CPU"
and "CPUExecutionProvider" not in providers
- ):
- raise RuntimeError(
- "If you want to use GPU, then follow `How to use GPU on espnet_onnx` chapter in readme to install onnxruntime-gpu."
- )
+ ): pass
+# raise RuntimeError(
+# "If you want to use GPU, then follow `How to use GPU on espnet_onnx` chapter in readme to install onnxruntime-gpu."
+# )
# check GPU
- if onnxruntime.get_device() == "GPU" and providers == ["CPUExecutionProvider"]:
- warnings.warn(
- "Inference will be executed on the CPU. Please provide gpu providers. Read `How to use GPU on espnet_onnx` in readme in detail."
- )
+ if onnxruntime.get_device() == "GPU" and providers == ["CPUExecutionProvider"]: pass
+# warnings.warn(
+# "Inference will be executed on the CPU. Please provide gpu providers. Read `How to use GPU on espnet_onnx` in readme in detail."
+# )
- logging.info(f'Providers [{" ,".join(providers)}] detected.')
+# logging.info(f'Providers [{" ,".join(providers)}] detected.')
class AbsExportModel(ABC):
diff --git a/setup.py b/setup.py
index 483b062..ee37d37 100644
--- a/setup.py
+++ b/setup.py
@@ -4,9 +4,9 @@ requirements = {
"install": [
"setuptools>=38.5.1",
"librosa>=0.8.0",
- "onnxruntime",
+ "onnxruntime-gpu",
"sentencepiece>=0.1.91,!=0.1.92",
- "typeguard==2.13.0",
+ "typeguard==2.13.3",
"PyYAML>=5.1.2",
"g2p-en",
"jamo==0.4.1", # For kss
diff --git a/text_preprocess_for_inference.py b/text_preprocess_for_inference.py
index ccca511..2191ebb 100644
--- a/text_preprocess_for_inference.py
+++ b/text_preprocess_for_inference.py
@@ -3,6 +3,8 @@ TTS Preprocessing
Developed by Arun Kumar A(CS20S013) - November 2022
Code Changes by Utkarsh - 2023
'''
+import locale
+locale.setlocale(locale.LC_ALL, 'C.UTF-8')
import os
import re
import json
@@ -40,14 +42,14 @@ def add_to_dictionary(dict_to_add, dict_file):
df_temp = pd.read_csv(temp_dict_file, delimiter=" ", header=None, dtype=str)
if len(df_temp) > len(df_orig):
os.rename(temp_dict_file, dict_file)
- print(f"{len(dict_to_add)} new words appended to Dictionary: {dict_file}")
+ # print(f"{len(dict_to_add)} new words appended to Dictionary: {dict_file}")
except:
print(traceback.format_exc())
else:
# create a new dictionary
with open(dict_file, "a") as f:
f.write(append_string)
- print(f"New Dictionary: {dict_file} created with {len(dict_to_add)} words")
+ # print(f"New Dictionary: {dict_file} created with {len(dict_to_add)} words")
class TextCleaner:
@@ -104,7 +106,7 @@ class Phonifier:
except Exception as e:
print(traceback.format_exc())
- print("Phone dictionary loaded for the following languages:", list(self.phone_dictionary.keys()))
+ # print("Phone dictionary loaded for the following languages:", list(self.phone_dictionary.keys()))
self.g2p = G2p()
print('Loading G2P model... Done!')
@@ -315,7 +317,7 @@ class Phonifier:
#print('INSIDE IF CONDITION OF ADDING WORDS')
else:
non_dict_words = words
- print(f"word not in dict: {non_dict_words}")
+ # print(f"word not in dict: {non_dict_words}")
if len(non_dict_words) > 0:
# unified parser has to be run for the non dictionary words
@@ -335,7 +337,7 @@ class Phonifier:
phn_out_dict[non_dict_words[i]] = self.en_g2p(non_dict_words[i])
# Create a string representation of the dictionary
data_str = "\n".join([f"{key}\t{value}" for key, value in phn_out_dict.items()])
- print(f"data_str: {data_str}")
+ # print(f"data_str: {data_str}")
with open(out_dict_file, "w") as f:
f.write(data_str)
else:
@@ -358,7 +360,7 @@ class Phonifier:
for original_word, formatted_word in zip(non_dict_words, replaced_output_list):
line = f"{original_word}\t{formatted_word}\n"
file.write(line)
- print(line, end='')
+ # print(line, end='')
try:
@@ -415,8 +417,8 @@ class Phonifier:
non_dict_words = words
if len(non_dict_words) > 0:
- print(len(non_dict_words))
- print(non_dict_words)
+ # print(len(non_dict_words))
+ # print(non_dict_words)
# unified parser has to be run for the non dictionary words
os.makedirs("tmp", exist_ok=True)
timestamp = str(time.time())
@@ -434,7 +436,7 @@ class Phonifier:
phn_out_dict[non_dict_words[i]] = self.en_g2p(non_dict_words[i])
# Create a string representation of the dictionary
data_str = "\n".join([f"{key}\t{value}" for key, value in phn_out_dict.items()])
- print(f"data_str: {data_str}")
+ # print(f"data_str: {data_str}")
with open(out_dict_file, "w") as f:
f.write(data_str)
else:
@@ -454,12 +456,12 @@ class Phonifier:
for original_word, formatted_word in zip(non_dict_words, replaced_output_list):
line = f"{original_word}\t{formatted_word}\n"
file.write(line)
- print(line, end='')
+ # print(line, end='')
try:
df = pd.read_csv(out_dict_file, delimiter="\t", header=None, dtype=str)
new_dict = df.dropna().set_index(0).to_dict('dict')[1]
- print(new_dict)
+ # print(new_dict)
if language not in self.phone_dictionary:
self.phone_dictionary[language] = new_dict
else:
@@ -656,7 +658,7 @@ class TextNormalizer:
text = re.sub(str(digit), ' '+num_to_word(digit, self.keydict[language])+' ', text)
return self.__post_cleaning(text)
else:
- print(f"No num-to-char for the given language {language}.")
+ # print(f"No num-to-char for the given language {language}.")
return self.__post_cleaning(text)
def num2text_list(self, text, language):
@@ -671,7 +673,7 @@ class TextNormalizer:
output_text.append(line)
return self.__post_cleaning_list(output_text)
else:
- print(f"No num-to-char for the given language {language}.")
+ # print(f"No num-to-char for the given language {language}.")
return self.__post_cleaning_list(text)
def normalize(self, text, language):
@@ -758,9 +760,9 @@ class TTSDurAlignPreprocessor:
def preprocess(self, text, language, gender):
# text = text.strip()
- print(text)
+ # print(text)
text = self.text_cleaner.clean(text)
- print("cleaned text", text)
+ # print("cleaned text", text)
# text = self.text_normalizer.insert_space(text)
text = self.text_normalizer.num2text(text, language)
# print(text)
@@ -769,9 +771,9 @@ class TTSDurAlignPreprocessor:
phrasified_text = TextPhrasifier.phrasify(text)
#print("phrased",phrasified_text)
phonified_text = self.phonifier.phonify(phrasified_text, language, gender)
- print("phonetext",phonified_text)
+ # print("phonetext",phonified_text)
phonified_text = self.post_processor.textProcesor(phonified_text)
- print(phonified_text)
+ # print(phonified_text)
return phonified_text, phrasified_text
class TTSDurAlignPreprocessor_VTT:
@@ -854,9 +856,9 @@ class TTSPreprocessor:
text = self.text_normalizer.normalize(text, language)
phrasified_text = TextPhrasifier.phrasify(text)
phonified_text = self.phonifier.phonify(phrasified_text, language, gender)
- print(phonified_text)
+ # print(phonified_text)
phonified_text = self.post_processor.textProcesorForEnglish(phonified_text)
- print(phonified_text)
+ # print(phonified_text)
return phonified_text, phrasified_text
class TTSPreprocessor_VTT:
#!/bin/bash
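# Runs inside the builder container: set up a conda env with a patched
# espnet_onnx, fetch NLTK data, and execute the ONNX export notebook.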
wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh
sh Miniforge3-Linux-x86_64.sh -b -p ${HOME}/conda
rm Miniforge3-Linux-x86_64.sh
source "${HOME}/conda/etc/profile.d/conda.sh"
source "${HOME}/conda/etc/profile.d/mamba.sh"
conda create -y --name text2phone python=3.10 --no-default-packages
conda activate text2phone
git clone --recursive https://github.com/espnet/espnet_onnx.git
cd espnet_onnx && git apply /patches/espnet_onnx.patch && python setup.py bdist_wheel && cd ..
pip install -U nbconvert ipykernel onnx torch==2.0.1 torchaudio indic-num2words espnet_model_zoo espnet_onnx/dist/espnet_onnx-0.2.0-py3-none-any.whl espnet
mkdir -p /home/builder/nltk_data/corpora && wget --directory-prefix=/home/builder/nltk_data/corpora "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/cmudict.zip" && unzip /home/builder/nltk_data/corpora/cmudict.zip -d /home/builder/nltk_data/corpora
mkdir -p /home/builder/nltk_data/taggers && wget --directory-prefix=/home/builder/nltk_data/taggers https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger.zip && unzip /home/builder/nltk_data/taggers/averaged_perceptron_tagger.zip -d /home/builder/nltk_data/taggers
rm -rf espnet_onnx
mkdir -p ~/.cache/espnet_onnx && jupyter nbconvert --inplace --to notebook --execute /notebooks/create_onnx.ipynb
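# Triton Python backend for the IIT Madras Fastspeech2 TTS models: cleans and
# phonifies input text, runs the per-language acoustic model, then vocodes the
# generated features into 16-bit PCM audio at 22050 Hz.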
import os

# NLTK data (cmudict + POS tagger) is bundled next to this file; point NLTK
# at the model directory before importing the preprocessors.
os.environ["NLTK_DATA"] = "."
os.chdir(os.path.dirname(__file__))

import json
from itertools import product
from random import choice

import numpy
import onnxruntime
import triton_python_backend_utils as pb_utils
from espnet_onnx import Text2Speech

from text_preprocess_for_inference import (
    TTSDurAlignPreprocessor,
    CharTextPreprocessor,
    TTSPreprocessor,
)

MAX_WAV_VALUE = 32768.0  # int16 full-scale factor for the PCM output
SAMPLING_RATE = 22050
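# Language code -> (language name, language family); the name selects the
# acoustic model and preprocessor, the family selects the vocoder.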
LANGMAP = {
"as": ("assamese", "aryan"),
"bn": ("bengali", "aryan"),
"brx": ("bodo", "aryan"),
"en": ("english", "aryan"),
"gu": ("gujarati", "aryan"),
"hi": ("hindi", "aryan"),
"kn": ("kannada", "dravidian"),
"ml": ("malayalam", "dravidian"),
"mni": ("manipuri", "aryan"),
"mr": ("marathi", "aryan"),
"or": ("odia", "aryan"),
"pa": ("punjabi", "aryan"),
"rj": ("rajasthani", "aryan"),
"ta": ("tamil", "dravidian"),
"te": ("telugu", "dravidian"),
"ur": ("urdu", "aryan"),
}
class TritonPythonModel:
def initialize(self, args):
self.device_id = int(json.loads(args["model_instance_device_id"]))
self.target_dtype = pb_utils.triton_string_to_numpy(
pb_utils.get_output_config_by_name(
json.loads(args["model_config"]), "OUTPUT_GENERATED_AUDIO"
)["data_type"]
)
self.tts_preprocessor = TTSPreprocessor()
self.char_text_preprocessor = CharTextPreprocessor()
self.tts_dur_align_preprocessor = TTSDurAlignPreprocessor()
self.preprocessors = {}
for lang, _ in LANGMAP.values():
if lang == "urdu" or lang == "punjabi":
self.preprocessors[lang] = self.char_text_preprocessor
elif lang == "english":
self.preprocessors[lang] = self.tts_preprocessor
else:
self.preprocessors[lang] = self.tts_dur_align_preprocessor
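        # Only male voices are currently exported; load whichever acoustic
        # models are present in the repository.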
self.models = {}
        for (language, _), gender in product(LANGMAP.values(), ("male",)):
            try:
                self.models[(language, gender)] = self.load_fastspeech2_model(
                    language,
                    gender,
                    "cuda",
                )
            except Exception:
                # Not every (language, gender) pair ships a model; skip the
                # combinations that fail to load.
                pass
self.vocoders = {
(gender, family): self.load_vocoder(gender, family, "cuda")
for gender, family in product(("male",), ("aryan", "dravidian"))
}
def load_vocoder(self, gender, family, device):
return onnxruntime.InferenceSession(
f"vocoders/{gender}-{family}-vocoder.onnx",
providers=[
"CPUExecutionProvider"
if device == "cpu"
else ("CUDAExecutionProvider", {"device_id": self.device_id})
],
)
def load_fastspeech2_model(self, language, gender, device):
model = Text2Speech(
providers=[
"CPUExecutionProvider"
if device == "cpu"
else ("CUDAExecutionProvider", {"device_id": self.device_id})
],
model_dir=f"text2phone/{language}-{gender}-ort",
use_quantized=True,
)
return model
def determine_gender(self, name):
if name.lower() in ("m", "male"):
return "male"
elif name.lower() in ("f", "fem", "female"):
return "female"
else:
return choice(["male", "female"])
def synthesize_audio(self, text, lang_id, speaker_id):
(language, family), gender = LANGMAP[
lang_id[0].decode("utf-8")
], self.determine_gender(speaker_id[0].decode("utf-8"))
        preprocessor = self.preprocessors[language]
        preprocessed_text = " ".join(
            preprocessor.preprocess(text[0].decode("utf-8"), language, gender)[0]
        )
model, vocoder = (
self.models[(language, gender)],
self.vocoders[(gender, family)],
)
        # Tokenize the preprocessed text, run the FastSpeech2 acoustic model,
        # and reshape the generated features for the vocoder.
        x = (
            numpy.expand_dims(
                model.postprocess(
                    model.tts_model(
                        model.preprocess.token_id_converter.tokens2ids(
                            model.preprocess.tokenizer.text2tokens(preprocessed_text)
                        )
                    )["feat_gen"]
                ).T,
                axis=0,
            )
            * 2.3262  # empirically chosen gain applied to the generated features
        )
y_g_hat = vocoder.run(None, {"input": x})[0]
audio = y_g_hat.squeeze() * MAX_WAV_VALUE
return audio.astype("int16")
def execute(self, requests):
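        # For each request, read the three [batch, 1] string tensors, synthesize
        # one int16 waveform per row, and return the raw PCM bytes as a tensor.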
return [
pb_utils.InferenceResponse(
output_tensors=[
pb_utils.Tensor(
"OUTPUT_GENERATED_AUDIO",
numpy.array(
[[processed_sent] for processed_sent in processed_sents],
dtype=self.target_dtype,
),
)
]
)
for processed_sents in (
(
self.synthesize_audio(
input_text, input_language_id, input_speaker_id
).tobytes()
for input_text, input_speaker_id, input_language_id in zip(
input_texts.as_numpy(),
input_speaker_ids.as_numpy(),
input_language_ids.as_numpy(),
)
)
for input_texts, input_speaker_ids, input_language_ids in (
(
pb_utils.get_input_tensor_by_name(request, "INPUT_TEXT"),
pb_utils.get_input_tensor_by_name(request, "INPUT_SPEAKER_ID"),
pb_utils.get_input_tensor_by_name(request, "INPUT_LANGUAGE_ID"),
)
for request in requests
)
)
]
def finalize(self):
pass
name: "tts"
backend: "python"
max_batch_size: 64
input [
{
name: "INPUT_TEXT"
data_type: TYPE_STRING
dims: [ 1 ]
}
]
input [
{
name: "INPUT_SPEAKER_ID"
data_type: TYPE_STRING
dims: [ 1 ]
}
]
input [
{
name: "INPUT_LANGUAGE_ID"
data_type: TYPE_STRING
dims: [ 1 ]
}
]
output [
{
name: "OUTPUT_GENERATED_AUDIO"
data_type: TYPE_STRING
dims: [ 1 ]
}
]
dynamic_batching {}
instance_group [
{
count: 1
kind: KIND_GPU
}
]
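# Run the backend inside the conda-pack'd environment produced by envbuilder.sh.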
parameters: {
key: "EXECUTION_ENV_PATH",
value: {string_value: "$$TRITON_MODEL_DIRECTORY/tts.tar.gz"}
}
#!/bin/bash
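# Builds the packed conda runtime (tts.tar.gz) for the Triton Python backend,
# fetches NLTK data into the model directory, and compiles a matching
# python backend stub.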
wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh
sh Miniforge3-Linux-x86_64.sh -b -p ${HOME}/conda
rm Miniforge3-Linux-x86_64.sh
source "${HOME}/conda/etc/profile.d/conda.sh"
source "${HOME}/conda/etc/profile.d/mamba.sh"
conda install -y conda-pack
export PYTHONNOUSERSITE=True
conda create -y --name tts python=3.11 --no-default-packages
conda activate tts
mamba install -c "nvidia/label/cuda-11.8.0" libcublas libcufft cuda-cudart -y
git clone --recursive https://github.com/espnet/espnet_onnx.git
cd espnet_onnx && git apply /patches/espnet_onnx.patch && python setup.py bdist_wheel && cd ..
pip install -U numpy pandas nltk indic-num2words g2p_en "espnet_onnx/dist/espnet_onnx-0.2.0-py3-none-any.whl"
conda deactivate
conda pack -n tts
conda activate tts
mkdir -p /model_repo/tts/1/corpora && wget --directory-prefix=/model_repo/tts/1/corpora "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/cmudict.zip" && unzip /model_repo/tts/1/corpora/cmudict.zip -d /model_repo/tts/1/corpora
mkdir -p /model_repo/tts/1/taggers && wget --directory-prefix=/model_repo/tts/1/taggers https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger.zip && unzip /model_repo/tts/1/taggers/averaged_perceptron_tagger.zip -d /model_repo/tts/1/taggers
git clone https://github.com/triton-inference-server/python_backend -b r23.10
cd python_backend && mkdir build && cd build
cmake -DTRITON_ENABLE_GPU=ON -DTRITON_BACKEND_REPO_TAG=r23.10 -DTRITON_COMMON_REPO_TAG=r23.10 -DTRITON_CORE_REPO_TAG=r23.10 -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install ..
make -j$(nproc) triton-python-backend-stub && cd ../..
mv tts.tar.gz python_backend/build/triton_python_backend_stub /model_repo/tts