In [1]:
import os
import json
import yaml
import numpy
import torch
!mkdir -p ../wavs
import onnxruntime
from sys import path
from tqdm import tqdm
SAMPLING_RATE = 22050
os.chdir('../Fastspeech2_HS')
path.append("hifigan")
from env import AttrDict
from models import Generator
from IPython.display import Audio
from scipy.io.wavfile import write
from meldataset import MAX_WAV_VALUE
from espnet_onnx.export import TTSModelExport
from espnet2.bin.tts_inference import Text2Speech
from espnet_onnx import Text2Speech as Text2SpeechInference
from text_preprocess_for_inference import TTSDurAlignPreprocessor, CharTextPreprocessor, TTSPreprocessor

Loading G2P model... Done!
Loading G2P model... Done!
Loading G2P model... Done!
Loading G2P model... Done!
Loading G2P model... Done!
Loading G2P model... Done!
Loading G2P model... Done!
Loading G2P model... Done!
Loading G2P model... Done!
Loading G2P model... Done!
Loading G2P model... Done!


Original Inference
* uses the environment defined in the Fastspeech2_HS repo

In [2]:
def load_hifigan_vocoder(language, gender, family, device):
    """Build the HiFi-GAN generator for one voice and load its checkpoint.

    Reads ``vocoder/{gender}/{family}/hifigan/config.json`` and the sibling
    ``generator`` checkpoint, both relative to the current working directory.
    ``language`` is accepted for signature symmetry but is not used here.

    Args:
        language: Unused by this loader.
        gender: Voice gender, used as a path component.
        family: Voice family, used as a path component.
        device: Anything accepted by ``torch.device``.

    Returns:
        The ``Generator`` on ``device``, in eval mode, with weight norm removed.
    """
    hifigan_dir = f"vocoder/{gender}/{family}/hifigan"
    with open(f"{hifigan_dir}/config.json", 'r') as config_file:
        hparams = AttrDict(json.load(config_file))

    # Seed torch's RNG from the vocoder config before the network is built.
    torch.manual_seed(hparams.seed)
    target = torch.device(device)
    net = Generator(hparams).to(target)

    # The second positional argument of torch.load is map_location.
    checkpoint = torch.load(f"{hifigan_dir}/generator", target)
    net.load_state_dict(checkpoint['generator'])
    net.eval()
    net.remove_weight_norm()
    return net
def load_fastspeech2_model(language, gender, device):
    """Load the FastSpeech2 ``Text2Speech`` model for ``language``/``gender``.

    The model's ``config.yaml`` must point at absolute paths for the feature,
    pitch and energy statistics files. This function computes those paths from
    the current working directory and patches the config on disk.

    The config file is rewritten only when at least one stats path actually
    differs from what is already stored. This function is called once per
    synthesized utterance, so rewriting unconditionally would mutate the file
    on every call for no benefit.

    Args:
        language: Language directory name (relative to the working directory).
        gender: Gender sub-directory name.
        device: Device string forwarded to ``Text2Speech``.

    Returns:
        A ``Text2Speech`` instance built from the patched config and
        ``{language}/{gender}/model/model.pth``.

    Raises:
        KeyError: If one of the ``*normalize_conf`` sections is missing from
            the config.
    """
    tts_config = f"{language}/{gender}/model/config.yaml"
    tts_model = f"{language}/{gender}/model/model.pth"

    with open(tts_config, "r") as file:
        config = yaml.safe_load(file)

    current_working_directory = os.getcwd()
    stats_files = {
        "normalize_conf": "model/feats_stats.npz",
        "pitch_normalize_conf": "model/pitch_stats.npz",
        "energy_normalize_conf": "model/energy_stats.npz",
    }

    config_changed = False
    for section, relative_stats in stats_files.items():
        stats_path = os.path.join(current_working_directory, language, gender, relative_stats)
        if config[section].get("stats_file") != stats_path:
            config[section]["stats_file"] = stats_path
            config_changed = True

    # Only touch the file on disk when a path really had to be updated.
    if config_changed:
        with open(tts_config, "w") as file:
            yaml.dump(config, file)

    return Text2Speech(train_config=tts_config, model_file=tts_model, device=device)
def text_synthesis(language, gender, sample_text, vocoder, MAX_WAV_VALUE, device):
    """Synthesize ``sample_text`` into an int16 waveform with PyTorch models.

    Loads the FastSpeech2 model for ``language``/``gender`` on every call,
    runs it on ``sample_text``, feeds the scaled ``feat_gen_denorm`` output to
    ``vocoder`` and converts the result to a NumPy ``int16`` array.

    Args:
        language: Language name forwarded to ``load_fastspeech2_model``.
        gender: Gender name forwarded to ``load_fastspeech2_model``.
        sample_text: Already pre-processed text passed to the acoustic model.
        vocoder: Callable vocoder taking the feature tensor.
        MAX_WAV_VALUE: Multiplier applied to the vocoder output before the
            int16 cast.
        device: Device for the acoustic model and the vocoder input.

    Returns:
        ``numpy.ndarray`` of dtype ``int16`` holding the audio samples.
    """
    # Everything runs without autograd, including the final .numpy() call.
    with torch.no_grad():
        acoustic_model = load_fastspeech2_model(language, gender, device)
        features = acoustic_model(sample_text, decode_conf={"alpha": 1})["feat_gen_denorm"]

        # Transpose, add a leading axis and apply the fixed 2.3262 scale factor.
        vocoder_input = (features.T.unsqueeze(0) * 2.3262).to(device)

        waveform = vocoder(vocoder_input).squeeze() * MAX_WAV_VALUE
        return waveform.cpu().numpy().astype('int16')
def text2speech(language, gender, family, sample_text, device):
    """Run the full PyTorch TTS pipeline and write the result as a WAV file.

    Loads the vocoder, picks the text pre-processor by language (character
    based for Urdu/Punjabi, ``TTSPreprocessor`` for English,
    ``TTSDurAlignPreprocessor`` otherwise), synthesizes the audio and writes it
    to ``../wavs/{language}_{gender}-{family}_orig_output.wav`` at
    ``SAMPLING_RATE``.

    Returns:
        None. The only result is the file written to disk.
    """
    vocoder = load_hifigan_vocoder(language, gender, family, device)

    if language in ("urdu", "punjabi"):
        preprocessor = CharTextPreprocessor()
    elif language == "english":
        preprocessor = TTSPreprocessor()
    else:
        preprocessor = TTSDurAlignPreprocessor()

    # The second element returned by preprocess() is not needed here.
    tokens, _phrases = preprocessor.preprocess(sample_text, language, gender)
    joined_text = " ".join(tokens)

    audio = text_synthesis(language, gender, joined_text, vocoder, MAX_WAV_VALUE, device)
    write(f"../wavs/{language}_{gender}-{family}_orig_output.wav", SAMPLING_RATE, audio)

Original Inference Results

In [3]:
# Synthesize the reference sentence on CPU and print summary statistics
# (length, sample preview, absolute sum, min, max) for later comparison.
audio_orig = text_synthesis(
    'english', 'male', 'this is a sentence',
    load_hifigan_vocoder('english', 'male', 'aryan', 'cpu'),
    MAX_WAV_VALUE, 'cpu',
)
print('length:', len(audio_orig), 'array:', audio_orig)
print(
    'abssum:', numpy.abs(audio_orig).sum(),
    'min:', audio_orig.min(),
    'max:', audio_orig.max(),
)

Removing weight norm...
length: 26624 array: [945 894 605 ... 10 12 30]
abssum: 42271788 min: -16131 max: 10878


In [4]:
# Same reference sentence as the CPU run, this time with vocoder and model on CUDA.
audio_orig = text_synthesis(
    'english', 'male', 'this is a sentence',
    load_hifigan_vocoder('english', 'male', 'aryan', 'cuda'),
    MAX_WAV_VALUE, 'cuda',
)
print('length:', len(audio_orig), 'array:', audio_orig)
print(
    'abssum:', numpy.abs(audio_orig).sum(),
    'min:', audio_orig.min(),
    'max:', audio_orig.max(),
)

Removing weight norm...
length: 26624 array: [945 894 605 ... 10 12 30]
abssum: 42271796 min: -16131 max: 10878


Latest PyTorch Inference Results
* uses an environment similar to the one defined in scripts/perform_onnx_conversion

In [3]:
# CPU synthesis of the reference sentence; prints the same summary statistics
# as the other result cells so the numbers can be compared side by side.
audio_orig = text_synthesis(
    'english', 'male', 'this is a sentence',
    load_hifigan_vocoder('english', 'male', 'aryan', 'cpu'),
    MAX_WAV_VALUE, 'cpu',
)
print('length:', len(audio_orig), 'array:', audio_orig)
print(
    'abssum:', numpy.abs(audio_orig).sum(),
    'min:', audio_orig.min(),
    'max:', audio_orig.max(),
)



Removing weight norm...
length: 26624 array: [945 894 605 ... 10 12 30]
abssum: 42271783 min: -16131 max: 10878


In [None]:
# CUDA synthesis of the reference sentence with the same summary printout.
audio_orig = text_synthesis(
    'english', 'male', 'this is a sentence',
    load_hifigan_vocoder('english', 'male', 'aryan', 'cuda'),
    MAX_WAV_VALUE, 'cuda',
)
print('length:', len(audio_orig), 'array:', audio_orig)
print(
    'abssum:', numpy.abs(audio_orig).sum(),
    'min:', audio_orig.min(),
    'max:', audio_orig.max(),
)

ORT Conversion

In [4]:
!mkdir -p ../ort_models
def convert_to_ort(language, gender, family):
    """Export the FastSpeech2 model and the HiFi-GAN vocoder of a voice to ONNX.

    Both models are loaded on CPU. The FastSpeech2 model is exported through
    ``TTSModelExport`` under the tag ``{language}-{gender}-ort``. The vocoder
    is exported with ``torch.onnx.export`` to
    ``../ort_models/vocoders/{gender}-{family}-vocoder.onnx``, using features
    generated from a fixed sample sentence as the example input.

    The ``vocoders`` output directory is created if it does not exist.
    Creating only ``../ort_models`` beforehand is not sufficient for the
    export to succeed.

    Args:
        language: Language name; also selects the text pre-processor.
        gender: Voice gender.
        family: Voice family of the vocoder.

    Returns:
        None. The results are the exported model files.
    """
    vocoder = load_hifigan_vocoder(language, gender, family, 'cpu')
    model = load_fastspeech2_model(language, gender, 'cpu')

    if language in ("urdu", "punjabi"):
        preprocessor = CharTextPreprocessor()
    elif language == "english":
        preprocessor = TTSPreprocessor()
    else:
        preprocessor = TTSDurAlignPreprocessor()

    # The second element returned by preprocess() is not needed for export.
    preprocessed_text, _phrases = preprocessor.preprocess('this is a sentence', language, gender)
    preprocessed_text = " ".join(preprocessed_text)

    exporter = TTSModelExport()
    exporter.export(model, f'{language}-{gender}-ort', quantize=False)

    # Produce a real feature tensor to serve as the vocoder's example input.
    out = model(preprocessed_text, decode_conf={"alpha": 1})
    x = out["feat_gen_denorm"].T.unsqueeze(0) * 2.3262

    vocoder_path = f'../ort_models/vocoders/{gender}-{family}-vocoder.onnx'
    # The export does not create missing parent directories, so do it here.
    os.makedirs(os.path.dirname(vocoder_path), exist_ok=True)
    torch.onnx.export(
        vocoder,
        x,
        vocoder_path,
        input_names=['input'],
        output_names=['output'],
        dynamic_axes={'input': [0, 2], 'output': [0]},
    )

In [None]:
# Export the English male voice (FastSpeech2 model plus 'aryan' vocoder) to ONNX.
convert_to_ort(language='english', gender='male', family='aryan')

ORT Inference
* uses the environment defined in triton_models/tts/envbuilder.sh
* you can delete the ort_models folder

In [2]:
def load_hifigan_vocoder(language, gender, family, device):
    """Open the exported ONNX vocoder as an onnxruntime inference session.

    The model is read from
    ``../ort_models/vocoders/{gender}-{family}-vocoder.onnx``. ``language`` is
    accepted but not used. The CPU execution provider is selected when
    ``device == 'cpu'``; any other value selects the CUDA provider.
    """
    if device == 'cpu':
        execution_providers = ['CPUExecutionProvider']
    else:
        execution_providers = ['CUDAExecutionProvider']
    onnx_path = f"../ort_models/vocoders/{gender}-{family}-vocoder.onnx"
    return onnxruntime.InferenceSession(onnx_path, providers=execution_providers)
def load_fastspeech2_model(language, gender, device):
    """Load the exported FastSpeech2 model tagged ``{language}-{gender}-ort``.

    Returns a ``Text2SpeechInference`` instance. The CPU execution provider is
    selected when ``device == 'cpu'``; any other value selects the CUDA provider.
    """
    if device == 'cpu':
        execution_providers = ['CPUExecutionProvider']
    else:
        execution_providers = ['CUDAExecutionProvider']
    model_tag = f'{language}-{gender}-ort'
    return Text2SpeechInference(model_tag, providers=execution_providers)
def text_synthesis(language, gender, sample_text, vocoder, MAX_WAV_VALUE, device):
    """Synthesize ``sample_text`` into an int16 waveform with the ONNX models.

    Loads the exported FastSpeech2 model on every call, converts the text to
    token ids with the model's own tokenizer and id converter, runs the TTS
    model, post-processes its ``feat_gen`` output and feeds the scaled result
    to the onnxruntime ``vocoder`` session under the input name ``'input'``.

    Returns:
        ``numpy.ndarray`` of dtype ``int16`` holding the audio samples.
    """
    model = load_fastspeech2_model(language, gender, device)

    # Text -> tokens -> ids, using the front-end bundled with the model.
    frontend = model.preprocess
    tokens = frontend.tokenizer.text2tokens(sample_text)
    token_ids = frontend.token_id_converter.tokens2ids(tokens)

    features = model.postprocess(model.tts_model(token_ids)['feat_gen'])

    # Transpose, add a leading axis and apply the fixed 2.3262 scale factor.
    vocoder_input = numpy.expand_dims(features.T, axis=0) * 2.3262

    vocoder_output = vocoder.run(None, {'input': vocoder_input})[0]
    scaled = vocoder_output.squeeze() * MAX_WAV_VALUE
    return scaled.astype('int16')
def text2speech(language, gender, family, sample_text, device):
    """Run the ONNX TTS pipeline end to end and write the result as a WAV file.

    Opens the vocoder session, selects the text pre-processor class by
    language, synthesizes the audio and writes it to
    ``../wavs/{language}_{gender}-{family}_ort_output.wav`` at
    ``SAMPLING_RATE``.

    Returns:
        None. The only result is the file written to disk.
    """
    vocoder = load_hifigan_vocoder(language, gender, family, device)

    # Urdu/Punjabi use the character pre-processor, English its own one,
    # every other language the duration-alignment pre-processor.
    preprocessor_cls = (
        CharTextPreprocessor if language in ("urdu", "punjabi")
        else TTSPreprocessor if language == "english"
        else TTSDurAlignPreprocessor
    )
    preprocessor = preprocessor_cls()

    tokens, _phrases = preprocessor.preprocess(sample_text, language, gender)
    audio = text_synthesis(language, gender, " ".join(tokens), vocoder, MAX_WAV_VALUE, device)

    wav_path = f"../wavs/{language}_{gender}-{family}_ort_output.wav"
    write(wav_path, SAMPLING_RATE, audio)

ORT Inference Results

In [None]:
# ONNX Runtime synthesis on CPU with the usual summary printout.
# NOTE(review): the other result cells synthesize 'this is a sentence' while
# this one uses 'this', so the numbers are not directly comparable — confirm
# whether that is intentional.
audio_orig = text_synthesis(
    'english', 'male', 'this',
    load_hifigan_vocoder('english', 'male', 'aryan', 'cpu'),
    MAX_WAV_VALUE, 'cpu',
)
print('length:', len(audio_orig), 'array:', audio_orig)
print(
    'abssum:', numpy.abs(audio_orig).sum(),
    'min:', audio_orig.min(),
    'max:', audio_orig.max(),
)