Commit 2db3611d authored by Nikhilesh Bhatnagar

initial commit

parent 8e5e75e3
# Dhruva IITM ASR
## How to get the models
### For Indian Languages
Store the models and the respective dictionary files in the folder `asr/1/models`. The available languages are listed at `https://asr.iitm.ac.in/models`.
The base URL for the models is `https://asr.iitm.ac.in/SPRING_INX/models/fine_tuned/`, for example: `wget https://asr.iitm.ac.in/SPRING_INX/models/fine_tuned/SPRING_INX_data2vec_aqc_Bengali.pt`
The base URL for the dictionaries is `https://asr.iitm.ac.in/SPRING_INX/models/dictionaries/`, for example: `wget https://asr.iitm.ac.in/SPRING_INX/models/dictionaries/SPRING_INX_Urdu_dict.txt`. A scripted version is sketched below.
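As a minimal sketch, the same downloads can be scripted from Python, assuming the two example files above (substitute the model and dictionary names for your language):

```python
import urllib.request
from pathlib import Path

MODEL_BASE = "https://asr.iitm.ac.in/SPRING_INX/models/fine_tuned"
DICT_BASE = "https://asr.iitm.ac.in/SPRING_INX/models/dictionaries"
TARGET = Path("asr/1/models")  # repo-relative folder mentioned above
TARGET.mkdir(parents=True, exist_ok=True)

# Example artefacts taken from the wget commands above.
for url in (
    f"{MODEL_BASE}/SPRING_INX_data2vec_aqc_Bengali.pt",
    f"{DICT_BASE}/SPRING_INX_Urdu_dict.txt",
):
    dest = TARGET / url.rsplit("/", 1)[1]
    print(f"downloading {url} -> {dest}")
    urllib.request.urlretrieve(url, dest)
```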
### For English (Whisper)
Place the model files in `asr/1/whisper_models`.
* Install `faster-whisper`.
* In a Python interpreter, import `faster_whisper` and run `model = faster_whisper.WhisperModel('large-v3', device='cuda', compute_type="int8", device_index=1, download_root='/path/to/repo/asr/1/whisper_models')`.
* This downloads and stores the model files in that custom location; a short loading and transcription sketch follows below.
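Once downloaded, the model can be loaded back from that folder. A minimal sketch, assuming a local test file `sample.wav` (not part of this repo):

```python
from faster_whisper import WhisperModel

# Reuse the files downloaded into the custom location above.
model = WhisperModel(
    "large-v3",
    device="cuda",
    compute_type="int8",
    download_root="/path/to/repo/asr/1/whisper_models",
)

# transcribe() returns a lazy generator of segments plus metadata.
segments, info = model.transcribe("sample.wav", beam_size=5)
print("detected language:", info.language)
for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
```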
## To create the environment
* Install `conda-pack`.
* Create a new Python 3.10 environment.
* Clone `https://github.com/Speech-Lab-IITM/data2vec-aqc/tree/master` and apply the patch `aqc.patch` with `git apply`.
* Build the wheel with `python setup.py bdist_wheel`.
* Do the same for `https://github.com/Spijkervet/torchaudio-augmentations`.
* Do the same for `https://github.com/flashlight/sequence`.
* Then install everything: `pip install git+https://github.com/kpu/kenlm.git fast_pytorch_kmeans tensorboardX flashlight-text soundfile torchaudio data2vec-aqc/dist/fairseq-0.12.2-cp310-cp310-linux_x86_64.whl sequence/dist/flashlight_sequence-0.0.0+91e2b0f.d20240210-cp310-cp310-linux_x86_64.whl torchaudio-augmentations/dist/torchaudio_augmentations-0.2.4-py3-none-any.whl faster-whisper`
* Finally, use `conda-pack` to package the environment; see the sanity-check sketch below.
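Before packing, a quick check that the key packages import inside the new environment can save a round trip. A minimal sketch, assuming the import names below (some differ from the pip package names):

```python
import importlib

# Import names expected after the pip install step above; adjust if your
# wheel versions differ. These names are assumptions based on the package list.
MODULES = [
    "fairseq",
    "kenlm",
    "fast_pytorch_kmeans",
    "tensorboardX",
    "flashlight.lib.text",
    "flashlight.lib.sequence",
    "soundfile",
    "torchaudio",
    "torchaudio_augmentations",
    "faster_whisper",
]

for name in MODULES:
    try:
        importlib.import_module(name)
        print(f"OK   {name}")
    except Exception as exc:
        print(f"FAIL {name}: {exc}")
```

If everything imports, `conda pack -n <env_name> -o dhruva-iitm-asr-env.tar.gz` (name is a placeholder) produces the relocatable archive.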
diff --git a/examples/data2vec/models/data2vec_audio.py b/examples/data2vec/models/data2vec_audio.py
index 5cf5b53..ccaf4a4 100644
--- a/examples/data2vec/models/data2vec_audio.py
+++ b/examples/data2vec/models/data2vec_audio.py
@@ -423,7 +423,7 @@ class Data2VecAudioModel(BaseFairseqModel):
# extracting both the source and the augmented audios from source
source_audios = source[0]
- aug_audios = source[1]
+ aug_audios = source[0]
if self.feature_grad_mult > 0:
features = self.feature_extractor(source_audios) #features
diff --git a/setup.py b/setup.py
index a7ce61a..3dc7b44 100644
--- a/setup.py
+++ b/setup.py
@@ -105,12 +105,12 @@ try:
"fairseq/clib/libnat/edit_dist.cpp",
],
),
- cpp_extension.CppExtension(
- "alignment_train_cpu_binding",
- sources=[
- "examples/operators/alignment_train_cpu.cpp",
- ],
- ),
+ # cpp_extension.CppExtension(
+ # "alignment_train_cpu_binding",
+ # sources=[
+ # "examples/operators/alignment_train_cpu.cpp",
+ # ],
+ # ),
]
)
if "CUDA_HOME" in os.environ:
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
try:
from fairseq.version import __version__ # noqa
except ImportError:
pass
# data2vec-aqc
## Training a new speech model with the CLI tools
Given a directory containing wav files to be used for pretraining (we recommend splitting each file into separate files 10 to 30 seconds in length):
### Prepare training data manifest:
First, install the `soundfile` library:
```shell script
pip install soundfile
```
Next, run:
```shell script
$ python examples/wav2vec/wav2vec_manifest.py /path/to/waves --dest /manifest/path --ext $ext --valid-percent $valid
```
$ext should be set to flac, wav, or whatever format your dataset happens to use that soundfile can read.
$valid should be set to some reasonable percentage (like 0.01) of training data to use for validation.
To use a pre-defined validation set (like dev-other from librispeech), set it to 0 and then overwrite valid.tsv with a
separately pre-processed manifest file.
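For reference, the manifest written by `wav2vec_manifest.py` is a plain TSV: the root directory on the first line, then one relative path and its frame count per line. A minimal sketch that produces the same layout (paths are placeholders):

```python
import os
import soundfile as sf

root = "/path/to/waves"            # same directory passed to wav2vec_manifest.py
dest = "/manifest/path/train.tsv"

with open(dest, "w") as out:
    print(root, file=out)
    for dirpath, _, files in os.walk(root):
        for name in files:
            if not name.endswith(".wav"):   # use your dataset's extension
                continue
            path = os.path.join(dirpath, name)
            frames = sf.info(path).frames   # length in samples
            print(f"{os.path.relpath(path, root)}\t{frames}", file=out)
```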
### Train a data2vec-aqc Base model:
This configuration was used for the base model trained on the Librispeech dataset in the data2vec-aqc paper.
Note that the input is expected to be single-channel audio sampled at 16 kHz.
```shell script
$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/audio/pretraining \
--config-name base_librispeech task.data=/path/to/manifests common.user_dir=examples/data2vec
```
Note: you can simulate 16 GPUs by using k GPUs and adding command line parameters
`distributed_training.distributed_world_size=k` `+optimization.update_freq='[x]'` where x = 16/k
### Parameters of interest
* The `cluster_factor` and `scale_factor` parameters (for the clustering module) can be modified in the `model` section of the pre-training configs, which can be found in the [pre-training config directory](https://github.com/Speech-Lab-IITM/data2vec-aqc/tree/master/examples/data2vec/config/audio/pretraining).
* The augmentations used for data2vec-aqc require the noise set of the MUSAN dataset. The path to it must be specified in the `path_to_musan_noise_set` variable of the `__getitem__` method of the [raw_audio_dataset](https://github.com/Speech-Lab-IITM/data2vec-aqc/blob/master/fairseq/data/audio/raw_audio_dataset.py) file.
### Fine-tune a pre-trained model with CTC:
Fine-tuning a model requires parallel audio and label files, as well as a vocabulary file in fairseq format.
A letter vocabulary can be downloaded [here](https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt).
An example [script](../wav2vec/libri_labels.py) that generates labels for the Librispeech dataset from the tsv file produced by wav2vec_manifest.py can be used as follows:
```shell script
split=train
$ python libri_labels.py /path/to/tsv --output-dir /output/dir --output-name $split
```
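For context, the generated labels are plain text: a `.wrd` file with the word transcript and a `.ltr` file in which characters are space-separated and word boundaries become `|`. A minimal sketch of that letter conversion (illustrative, not the script itself):

```python
def to_letter_target(transcript: str) -> str:
    """'HELLO WORLD' -> 'H E L L O | W O R L D |' (letter-target format)."""
    words = transcript.strip().upper().split()
    return " ".join(" ".join(word) + " |" for word in words)

print(to_letter_target("hello world"))  # H E L L O | W O R L D |
```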
Fine-tuning on 100h of Librispeech with letter targets:
```shell script
$ fairseq-hydra-train \
distributed_training.distributed_port=$PORT \
task.data=/path/to/data \
model.w2v_path=/path/to/model.pt \
--config-dir /path/to/fairseq-py/examples/wav2vec/config/finetuning \
--config-name base_100h common.user_dir=examples/data2vec
```
There are other config files in the config/finetuning directory that can be used to fine-tune on other splits.
You can specify the right config via the `--config-name` parameter.
Decoding with a language model during training requires the flashlight [python bindings](https://github.com/facebookresearch/flashlight/tree/master/bindings/python) (previously called [wav2letter](https://github.com/facebookresearch/wav2letter)).
If you want to use a language model, add `+criterion.wer_args='[/path/to/kenlm, /path/to/lexicon, 2, -1]'` to the command line.
### Evaluating a CTC model:
Evaluating a CTC model with a language model requires the [flashlight python bindings](https://github.com/facebookresearch/flashlight/tree/master/bindings/python) (previously called [wav2letter](https://github.com/facebookresearch/wav2letter)) to be installed.
The Fairseq transformer language model used in the wav2vec 2.0 paper can be obtained from the [wav2letter model repository](https://github.com/facebookresearch/wav2letter/tree/master/recipes/sota/2019).
Be sure to upper-case the language model vocab after downloading it.
Letter dictionary for pre-trained models can be found [here](https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt).
Next, run the evaluation command:
```shell script
python examples/speech_recognition/new/infer.py --config-dir examples/speech_recognition/new/conf \
--config-name infer task=audio_finetuning task.data=/path/to/manifests common.user_dir=examples/data2vec \
task.labels=ltr decoding.type=kenlm \
decoding.lmweight=${lmweight} decoding.wordscore=${wordscore} decoding.silweight=${silscore} \
decoding.lexicon=/path/to/lexicon \
decoding.lmpath=/path/to/lm decoding.unique_wer_file=True \
dataset.gen_subset=dev_clean,dev_other,test_clean,test_other \
common_eval.path=/path/to/checkpoint.pt decoding.beam=1500 distributed_training.distributed_world_size=${num_gpus}
```
To get raw numbers, use `decoding.type=viterbi` and omit the lexicon. To use the transformer language model, use `decoding.type=fairseqlm`.
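The evaluation reports word error rate (WER): the word-level edit distance between hypothesis and reference divided by the reference length. A self-contained sketch of that metric (not fairseq's own implementation):

```python
def wer(reference: str, hypothesis: str) -> float:
    """Word error rate via dynamic-programming edit distance."""
    ref, hyp = reference.split(), hypothesis.split()
    # dp[i][j] = edits needed to turn the first i reference words
    # into the first j hypothesis words.
    dp = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        dp[i][0] = i
    for j in range(len(hyp) + 1):
        dp[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            substitution = dp[i - 1][j - 1] + (ref[i - 1] != hyp[j - 1])
            dp[i][j] = min(substitution, dp[i - 1][j] + 1, dp[i][j - 1] + 1)
    return dp[-1][-1] / max(len(ref), 1)

print(wer("the cat sat", "the cat sat down"))  # 0.33
```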
from . import models
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
tensorboard_logdir: tb
checkpoint:
save_interval: 5
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
task:
_name: audio_pretraining
data: ???
max_sample_size: 320000
min_sample_size: 32000
normalize: true
dataset:
num_workers: 6
max_tokens: 3800000
skip_invalid_size_inputs_valid_test: true
validate_interval: 5
required_batch_size_multiple: 1
disable_validation: false
distributed_training:
distributed_world_size: 16
ddp_backend: legacy_ddp
criterion:
_name: model
log_keys:
- ema_decay
- target_var
- pred_var
optimization:
max_update: 400000
lr: [0.0005]
optimizer:
_name: adam
adam_betas: (0.9,0.98)
adam_eps: 1e-06
weight_decay: 0.01
lr_scheduler:
_name: tri_stage
phase_ratio: [0.03,0.9,0.07]
model:
_name: data2vec_audio
extractor_mode: layer_norm
encoder_layerdrop: 0.05
dropout_input: 0.0
dropout_features: 0.0
feature_grad_mult: 1.0
encoder_embed_dim: 768
mask_prob: 0.65
mask_length: 10
loss_beta: 0
loss_scale: null
instance_norm_target_layer: true
average_top_k_layers: 8
pos_conv_depth: 5
conv_pos: 95
ema_decay: 0.999
ema_end_decay: 0.9999
ema_anneal_end_step: 30000
ema_transformer_only: true
ema_layers_only: true
require_same_masks: true
mask_dropout: 0
final_proj_dim: 256
cluster_factor: 16
scale_factor: 0.3
num_negatives: 100
cross_sample_negatives: 0
import importlib
import os
for file in sorted(os.listdir(os.path.dirname(__file__))):
if file.endswith(".py") and not file.startswith("_"):
model_name = file[: file.find(".py")]
importlib.import_module("examples.data2vec.models." + model_name)
def index_put_scale(tensor, indices, value):
tensor[indices] *= value
return tensor
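# Illustrative usage sketch (added as documentation, not part of the original file):
#
#   import torch
#   x = torch.ones(5)
#   mask = torch.tensor([True, False, True, False, True])
#   index_put_scale(x, mask, 0.5)  # -> tensor([0.5, 1.0, 0.5, 1.0, 0.5])
#
# The masked positions are scaled in place and the same tensor is returned.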
# Adaptive Input Representations for Neural Language Modeling (Baevski and Auli, 2018)
## Pre-trained models
Description | Parameters | Dataset | Model and Test set(s)
---|---:|---|---
Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) | 1026M | [Google Billion Words](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2)
Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) | 247M | [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2)
## Training an LM with adaptive inputs
First, see the general [language modeling README](README.md) for instructions on
preprocessing the WikiText-103 data.
Then use the following training command to train a model with adaptive inputs
using the `transformer_lm_wiki103` model architecture:
```bash
fairseq-train --task language_modeling \
data-bin/wikitext-103 \
--save-dir checkpoints/transformer_wikitext-103 \
--arch transformer_lm_wiki103 \
--max-update 286000 --lr 1.0 --t-mult 2 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 \
--warmup-updates 16000 --warmup-init-lr 1e-07 --stop-min-lr 1e-09 --optimizer nag --min-lr 0.0001 --clip-norm 0.1 \
--criterion adaptive_loss --max-tokens 3072 --update-freq 3 --tokens-per-sample 3072 --seed 1 \
--sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=legacy_ddp
```
## Citation
```bibtex
@inproceedings{
baevski2018adaptive,
title={Adaptive Input Representations for Neural Language Modeling},
author={Alexei Baevski and Michael Auli},
booktitle={International Conference on Learning Representations},
year={2019},
url={https://openreview.net/forum?id=ByxZX20qFQ},
}
```
# Language Modeling with Gated Convolutional Networks (Dauphin et al., 2017)
## Example usage
First download and preprocess the data following the main [language modeling README](README.md).
Then to train a convolutional LM using the `fconv_lm_dauphin_wikitext103`
architecture:
```bash
fairseq-train --task language_modeling \
data-bin/wikitext-103 \
--save-dir checkpoints/fconv_wikitext-103 \
--arch fconv_lm_dauphin_wikitext103 \
--adaptive-softmax-cutoff 10000,20000,200000 \
--dropout 0.2 \
--criterion adaptive_loss \
--optimizer nag --clip-norm 0.1 --weight-decay 5e-06 \
--lr 1.0 --lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 \
--max-tokens 1024 --tokens-per-sample 1024 \
--ddp-backend legacy_ddp \
--max-epoch 35
```
And evaluate with:
```bash
fairseq-eval-lm data-bin/wikitext-103 --path checkpoints/fconv_wikitext-103/checkpoint_best.pt
```
## Citation
```bibtex
@inproceedings{dauphin2017language,
title={Language Modeling with Gated Convolutional Networks},
author={Dauphin, Yann N and Fan, Angela and Auli, Michael and Grangier, David},
booktitle={Proceedings of the 34th International Conference on Machine Learning-Volume 70},
pages={933--941},
year={2017},
organization={JMLR}
}
```
# Neural Language Modeling
## Pre-trained models
Model | Description | Dataset | Download
---|---|---|---
`transformer_lm.gbw.adaptive_huge` | Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) <br> 1026M params | [Google Billion Words](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2)
`transformer_lm.wiki103.adaptive` | Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) <br> 247M params | [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2)
`transformer_lm.wmt19.en` | English LM <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.gz)
`transformer_lm.wmt19.de` | German LM <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.gz)
`transformer_lm.wmt19.ru` | Russian LM <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.gz)
## Example usage
We require a few additional Python dependencies for preprocessing:
```bash
pip install fastBPE sacremoses
```
To sample from a language model using PyTorch Hub:
```python
import torch
# List available models
torch.hub.list('pytorch/fairseq') # [..., 'transformer_lm.wmt19.en', ...]
# Load an English LM trained on WMT'19 News Crawl data
en_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.en', tokenizer='moses', bpe='fastbpe')
en_lm.eval() # disable dropout
# Move model to GPU
en_lm.cuda()
# Sample from the language model
en_lm.sample('Barack Obama', beam=1, sampling=True, sampling_topk=10, temperature=0.8)
# "Barack Obama is coming to Sydney and New Zealand (...)"
# Compute perplexity for a sequence
en_lm.score('Barack Obama is coming to Sydney and New Zealand')['positional_scores'].mean().neg().exp()
# tensor(15.1474)
# The same interface can be used with custom models as well
from fairseq.models.transformer_lm import TransformerLanguageModel
custom_lm = TransformerLanguageModel.from_pretrained('/path/to/model/dir', 'checkpoint100.pt', tokenizer='moses', bpe='fastbpe')
custom_lm.sample('Barack Obama', beam=5)
# "Barack Obama (...)"
```
## Training a transformer language model with the CLI tools
### 1) Preprocess the data
First download and prepare the [WikiText-103 dataset](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/):
```bash
cd examples/language_model/
bash prepare-wikitext-103.sh
cd ../..
```
Next preprocess/binarize the data:
```bash
TEXT=examples/language_model/wikitext-103
fairseq-preprocess \
--only-source \
--trainpref $TEXT/wiki.train.tokens \
--validpref $TEXT/wiki.valid.tokens \
--testpref $TEXT/wiki.test.tokens \
--destdir data-bin/wikitext-103 \
--workers 20
```
### 2) Train a language model
Next we'll train a basic transformer language model on wikitext-103. For more
advanced usage, see the [adaptive inputs README](README.adaptive_inputs.md).
To train a basic LM (assumes 2 GPUs):
```
$ fairseq-train --task language_modeling \
data-bin/wikitext-103 \
--save-dir checkpoints/transformer_wikitext-103 \
--arch transformer_lm --share-decoder-input-output-embed \
--dropout 0.1 \
--optimizer adam --adam-betas '(0.9, 0.98)' --weight-decay 0.01 --clip-norm 0.0 \
--lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
--tokens-per-sample 512 --sample-break-mode none \
--max-tokens 2048 --update-freq 16 \
--fp16 \
--max-update 50000
```
If you run out of memory, try reducing `--max-tokens` (max number of tokens per
batch) or `--tokens-per-sample` (max sequence length). You can also adjust
`--update-freq` to accumulate gradients and simulate training on a different
number of GPUs.
### 3) Evaluate
```bash
fairseq-eval-lm data-bin/wikitext-103 \
    --path checkpoints/transformer_wikitext-103/checkpoint_best.pt \
--batch-size 2 \
--tokens-per-sample 512 \
--context-window 400
# | Evaluated 245569 tokens in 56.1s (4379.02 tokens/s)
# | Loss: 3.4164, Perplexity: 30.46
```
*Note:* The `--context-window` option controls how much context is provided to
each token when computing perplexity. When the window size is 0, the dataset is
chunked into segments of length 512 and perplexity is computed over each segment
normally. However, this results in worse (higher) perplexity since tokens that
appear earlier in each segment have less conditioning. When the maximum window
size is used (511 in this case), then we compute perplexity for each token
fully conditioned on 511 tokens of context. This slows down evaluation
significantly, since we must run a separate forward pass for every token in the
dataset, but results in better (lower) perplexity.
## Convolutional language models
Please see the [convolutional LM README](README.conv.md) for instructions on
training convolutional language models.
#!/bin/bash
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
URLS=(
"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip"
)
FILES=(
"wikitext-103-v1.zip"
)
for ((i=0;i<${#URLS[@]};++i)); do
file=${FILES[i]}
if [ -f $file ]; then
echo "$file already exists, skipping download"
else
url=${URLS[i]}
wget "$url"
if [ -f $file ]; then
echo "$url successfully downloaded."
else
echo "$url not successfully downloaded."
exit -1
fi
if [ ${file: -4} == ".tgz" ]; then
tar zxvf $file
elif [ ${file: -4} == ".tar" ]; then
tar xvf $file
elif [ ${file: -4} == ".zip" ]; then
unzip $file
fi
fi
done
cd ..
### 2021 Update: We are merging this example into the [S2T framework](../speech_to_text), which supports more generic speech-to-text tasks (e.g. speech translation) and more flexible data processing pipelines. Please stay tuned.
# Speech Recognition
`examples/speech_recognition` implements the ASR task in Fairseq, along with the features, datasets, models and loss functions needed to train and run inference for the model described in [Transformers with convolutional context for ASR (Abdelrahman Mohamed et al., 2019)](https://arxiv.org/abs/1904.11660).
## Additional dependencies
On top of the main fairseq dependencies, there are a couple of additional requirements.
1) Please follow the instructions to install [torchaudio](https://github.com/pytorch/audio). This is required to compute audio fbank features.
2) [Sclite](http://www1.icsi.berkeley.edu/Speech/docs/sctk-1.2/sclite.htm#sclite_name_0) is used to measure WER. Sclite can be downloaded and built from source as part of the sctk package [here](http://www.openslr.org/4/). Training and inference do not require the Sclite dependency.
3) [sentencepiece](https://github.com/google/sentencepiece) is required to create datasets with word-piece targets.
## Preparing librispeech data
```
./examples/speech_recognition/datasets/prepare-librispeech.sh $DIR_TO_SAVE_RAW_DATA $DIR_FOR_PREPROCESSED_DATA
```
## Training librispeech data
```
python train.py $DIR_FOR_PREPROCESSED_DATA --save-dir $MODEL_PATH --max-epoch 80 --task speech_recognition --arch vggtransformer_2 --optimizer adadelta --lr 1.0 --adadelta-eps 1e-8 --adadelta-rho 0.95 --clip-norm 10.0 --max-tokens 5000 --log-format json --log-interval 1 --criterion cross_entropy_acc --user-dir examples/speech_recognition/
```
## Inference for librispeech
`$SET` can be `test_clean` or `test_other`.
Any checkpoint in `$MODEL_PATH` can be selected; in this example we use `checkpoint_last.pt`.
```
python examples/speech_recognition/infer.py $DIR_FOR_PREPROCESSED_DATA --task speech_recognition --max-tokens 25000 --nbest 1 --path $MODEL_PATH/checkpoint_last.pt --beam 20 --results-path $RES_DIR --batch-size 40 --gen-subset $SET --user-dir examples/speech_recognition/
```
## Computing WER with Sclite
```
sclite -r ${RES_DIR}/ref.word-checkpoint_last.pt-${SET}.txt -h ${RES_DIR}/hypo.word-checkpoint_last.pt-${SET}.txt -i rm -o all stdout > $RES_REPORT
```
The `Sum/Avg` row in the first table of the report contains the WER.
## Using flashlight (previously called [wav2letter](https://github.com/facebookresearch/wav2letter)) components
[flashlight](https://github.com/facebookresearch/flashlight) now has integration with fairseq. Currently this includes:
* AutoSegmentationCriterion (ASG)
* flashlight-style Conv/GLU model
* flashlight's beam search decoder
To use these, follow the instructions on [this page](https://github.com/facebookresearch/flashlight/tree/master/bindings/python) to install python bindings.
## Training librispeech data (flashlight style, Conv/GLU + ASG loss)
Training command:
```
python train.py $DIR_FOR_PREPROCESSED_DATA --save-dir $MODEL_PATH --max-epoch 100 --task speech_recognition --arch w2l_conv_glu_enc --batch-size 4 --optimizer sgd --lr 0.3,0.8 --momentum 0.8 --clip-norm 0.2 --max-tokens 50000 --log-format json --log-interval 100 --num-workers 0 --sentence-avg --criterion asg_loss --asg-transitions-init 5 --max-replabel 2 --linseg-updates 8789 --user-dir examples/speech_recognition
```
Note that ASG loss currently doesn't do well with word-pieces. You should prepare a dataset with character targets by setting `nbpe=31` in `prepare-librispeech.sh`.
## Inference for librispeech (flashlight decoder, n-gram LM)
Inference command:
```
python examples/speech_recognition/infer.py $DIR_FOR_PREPROCESSED_DATA --task speech_recognition --seed 1 --nbest 1 --path $MODEL_PATH/checkpoint_last.pt --gen-subset $SET --results-path $RES_DIR --w2l-decoder kenlm --kenlm-model $KENLM_MODEL_PATH --lexicon $LEXICON_PATH --beam 200 --beam-threshold 15 --lm-weight 1.5 --word-score 1.5 --sil-weight -0.3 --criterion asg_loss --max-replabel 2 --user-dir examples/speech_recognition
```
`$KENLM_MODEL_PATH` should be a standard n-gram language model file. `$LEXICON_PATH` should be a flashlight-style lexicon (list of known words and their spellings). For ASG inference, a lexicon line should look like this (note the repetition labels):
```
doorbell D O 1 R B E L 1 ▁
```
For CTC inference with word-pieces, repetition labels are not used and the lexicon should contain the most common spellings for each word (one can use sentencepiece's `NBestEncodeAsPieces` for this):
```
doorbell ▁DOOR BE LL
doorbell ▁DOOR B E LL
doorbell ▁DO OR BE LL
doorbell ▁DOOR B EL L
doorbell ▁DOOR BE L L
doorbell ▁DO OR B E LL
doorbell ▁DOOR B E L L
doorbell ▁DO OR B EL L
doorbell ▁DO O R BE LL
doorbell ▁DO OR BE L L
```
Lowercase vs. uppercase matters: the *word* should match the case of the n-gram language model (i.e. `$KENLM_MODEL_PATH`), while the *spelling* should match the case of the token dictionary (i.e. `$DIR_FOR_PREPROCESSED_DATA/dict.txt`).
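A hedged sketch of generating such a word-piece lexicon with sentencepiece (`spm.model`, the word list and the output path are placeholders; mind the case conventions described above):

```python
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("spm.model")  # e.g. the model produced by prepare-librispeech.sh

words = ["DOORBELL", "DOORMAT"]  # in practice: the vocabulary of the n-gram LM
with open("lexicon.txt", "w") as out:
    for word in words:
        # NBestEncodeAsPieces returns several alternative segmentations,
        # written as one lexicon line per spelling.
        for pieces in sp.NBestEncodeAsPieces(word, 10):
            print(word, " ".join(pieces), file=out)
```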
## Inference for librispeech (flashlight decoder, viterbi only)
Inference command:
```
python examples/speech_recognition/infer.py $DIR_FOR_PREPROCESSED_DATA --task speech_recognition --seed 1 --nbest 1 --path $MODEL_PATH/checkpoint_last.pt --gen-subset $SET --results-path $RES_DIR --w2l-decoder viterbi --criterion asg_loss --max-replabel 2 --user-dir examples/speech_recognition
```
from . import criterions, models, tasks # noqa
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
from examples.speech_recognition.data.replabels import pack_replabels
from fairseq import utils
from fairseq.criterions import FairseqCriterion, register_criterion
@register_criterion("asg_loss")
class ASGCriterion(FairseqCriterion):
@staticmethod
def add_args(parser):
group = parser.add_argument_group("ASG Loss")
group.add_argument(
"--asg-transitions-init",
help="initial diagonal value of transition matrix",
type=float,
default=0.0,
)
group.add_argument(
"--max-replabel", help="maximum # of replabels", type=int, default=2
)
group.add_argument(
"--linseg-updates",
help="# of training updates to use LinSeg initialization",
type=int,
default=0,
)
group.add_argument(
"--hide-linseg-messages",
help="hide messages about LinSeg initialization",
action="store_true",
)
def __init__(
self,
task,
silence_token,
asg_transitions_init,
max_replabel,
linseg_updates,
hide_linseg_messages,
):
from flashlight.lib.sequence.criterion import ASGLoss, CriterionScaleMode
super().__init__(task)
self.tgt_dict = task.target_dictionary
self.eos = self.tgt_dict.eos()
self.silence = (
self.tgt_dict.index(silence_token)
if silence_token in self.tgt_dict
else None
)
self.max_replabel = max_replabel
num_labels = len(self.tgt_dict)
self.asg = ASGLoss(num_labels, scale_mode=CriterionScaleMode.TARGET_SZ_SQRT)
self.asg.trans = torch.nn.Parameter(
asg_transitions_init * torch.eye(num_labels), requires_grad=True
)
self.linseg_progress = torch.nn.Parameter(
torch.tensor([0], dtype=torch.int), requires_grad=False
)
self.linseg_maximum = linseg_updates
self.linseg_message_state = "none" if hide_linseg_messages else "start"
@classmethod
def build_criterion(cls, args, task):
return cls(
task,
args.silence_token,
args.asg_transitions_init,
args.max_replabel,
args.linseg_updates,
args.hide_linseg_messages,
)
def linseg_step(self):
if not self.training:
return False
if self.linseg_progress.item() < self.linseg_maximum:
if self.linseg_message_state == "start":
print("| using LinSeg to initialize ASG")
self.linseg_message_state = "finish"
self.linseg_progress.add_(1)
return True
elif self.linseg_message_state == "finish":
print("| finished LinSeg initialization")
self.linseg_message_state = "none"
return False
def replace_eos_with_silence(self, tgt):
if tgt[-1] != self.eos:
return tgt
elif self.silence is None or (len(tgt) > 1 and tgt[-2] == self.silence):
return tgt[:-1]
else:
return tgt[:-1] + [self.silence]
def forward(self, model, sample, reduce=True):
"""Compute the loss for the given sample.
Returns a tuple with three elements:
1) the loss
2) the sample size, which is used as the denominator for the gradient
3) logging outputs to display while training
"""
net_output = model(**sample["net_input"])
emissions = net_output["encoder_out"].transpose(0, 1).contiguous()
B = emissions.size(0)
T = emissions.size(1)
device = emissions.device
target = torch.IntTensor(B, T)
target_size = torch.IntTensor(B)
using_linseg = self.linseg_step()
for b in range(B):
initial_target_size = sample["target_lengths"][b].item()
if initial_target_size == 0:
raise ValueError("target size cannot be zero")
tgt = sample["target"][b, :initial_target_size].tolist()
tgt = self.replace_eos_with_silence(tgt)
tgt = pack_replabels(tgt, self.tgt_dict, self.max_replabel)
tgt = tgt[:T]
if using_linseg:
tgt = [tgt[t * len(tgt) // T] for t in range(T)]
target[b][: len(tgt)] = torch.IntTensor(tgt)
target_size[b] = len(tgt)
loss = self.asg.forward(emissions, target.to(device), target_size.to(device))
if reduce:
loss = torch.sum(loss)
sample_size = (
sample["target"].size(0) if self.args.sentence_avg else sample["ntokens"]
)
logging_output = {
"loss": utils.item(loss.data) if reduce else loss.data,
"ntokens": sample["ntokens"],
"nsentences": sample["target"].size(0),
"sample_size": sample_size,
}
return loss, sample_size, logging_output
@staticmethod
def aggregate_logging_outputs(logging_outputs):
"""Aggregate logging outputs from data parallel training."""
loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
agg_output = {
"loss": loss_sum / nsentences,
"ntokens": ntokens,
"nsentences": nsentences,
"sample_size": sample_size,
}
return agg_output
import importlib
import os
# ASG loss requires flashlight bindings
files_to_skip = set()
try:
import flashlight.lib.sequence.criterion
except ImportError:
files_to_skip.add("ASG_loss.py")
for file in sorted(os.listdir(os.path.dirname(__file__))):
if file.endswith(".py") and not file.startswith("_") and file not in files_to_skip:
criterion_name = file[: file.find(".py")]
importlib.import_module(
"examples.speech_recognition.criterions." + criterion_name
)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import math
import torch
import torch.nn.functional as F
from fairseq import utils
from fairseq.criterions import FairseqCriterion, register_criterion
@register_criterion("cross_entropy_acc")
class CrossEntropyWithAccCriterion(FairseqCriterion):
def __init__(self, task, sentence_avg):
super().__init__(task)
self.sentence_avg = sentence_avg
def compute_loss(self, model, net_output, target, reduction, log_probs):
# N, T -> N * T
target = target.view(-1)
lprobs = model.get_normalized_probs(net_output, log_probs=log_probs)
if not hasattr(lprobs, "batch_first"):
logging.warning(
"ERROR: we need to know whether "
"batch first for the net output; "
"you need to set batch_first attribute for the return value of "
"model.get_normalized_probs. Now, we assume this is true, but "
"in the future, we will raise exception instead. "
)
batch_first = getattr(lprobs, "batch_first", True)
if not batch_first:
lprobs = lprobs.transpose(0, 1)
# N, T, D -> N * T, D
lprobs = lprobs.view(-1, lprobs.size(-1))
loss = F.nll_loss(
lprobs, target, ignore_index=self.padding_idx, reduction=reduction
)
return lprobs, loss
def get_logging_output(self, sample, target, lprobs, loss):
target = target.view(-1)
mask = target != self.padding_idx
correct = torch.sum(
lprobs.argmax(1).masked_select(mask) == target.masked_select(mask)
)
total = torch.sum(mask)
sample_size = (
sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
)
logging_output = {
"loss": utils.item(loss.data), # * sample['ntokens'],
"ntokens": sample["ntokens"],
"nsentences": sample["target"].size(0),
"sample_size": sample_size,
"correct": utils.item(correct.data),
"total": utils.item(total.data),
"nframes": torch.sum(sample["net_input"]["src_lengths"]).item(),
}
return sample_size, logging_output
def forward(self, model, sample, reduction="sum", log_probs=True):
"""Computes the cross entropy with accuracy metric for the given sample.
This is similar to CrossEntropyCriterion in fairseq, but also
computes accuracy metrics as part of logging
Args:
logprobs (Torch.tensor) of shape N, T, D i.e.
batchsize, timesteps, dimensions
targets (Torch.tensor) of shape N, T i.e batchsize, timesteps
Returns:
tuple: With three elements:
1) the loss
2) the sample size, which is used as the denominator for the gradient
3) logging outputs to display while training
TODO:
* Currently this Criterion will only work with LSTMEncoderModels or
FairseqModels which have decoder, or Models which return TorchTensor
as net_output.
We need to make a change to support all FairseqEncoder models.
"""
net_output = model(**sample["net_input"])
target = model.get_targets(sample, net_output)
lprobs, loss = self.compute_loss(
model, net_output, target, reduction, log_probs
)
sample_size, logging_output = self.get_logging_output(
sample, target, lprobs, loss
)
return loss, sample_size, logging_output
@staticmethod
def aggregate_logging_outputs(logging_outputs):
"""Aggregate logging outputs from data parallel training."""
correct_sum = sum(log.get("correct", 0) for log in logging_outputs)
total_sum = sum(log.get("total", 0) for log in logging_outputs)
loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
nframes = sum(log.get("nframes", 0) for log in logging_outputs)
agg_output = {
"loss": loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.0,
# if args.sentence_avg, then sample_size is nsentences, then loss
# is per-sentence loss; else sample_size is ntokens, the loss
# becomes per-output token loss
"ntokens": ntokens,
"nsentences": nsentences,
"nframes": nframes,
"sample_size": sample_size,
"acc": correct_sum * 100.0 / total_sum if total_sum > 0 else 0.0,
"correct": correct_sum,
"total": total_sum,
# total is the number of validate tokens
}
if sample_size != ntokens:
agg_output["nll_loss"] = loss_sum / ntokens / math.log(2)
# loss: per output token loss
# nll_loss: per sentence loss
return agg_output
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from .asr_dataset import AsrDataset
__all__ = [
"AsrDataset",
]
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import numpy as np
from fairseq.data import FairseqDataset
from . import data_utils
from .collaters import Seq2SeqCollater
class AsrDataset(FairseqDataset):
"""
A dataset representing speech and corresponding transcription.
Args:
aud_paths: (List[str]): A list of str with paths to audio files.
aud_durations_ms (List[int]): A list of int containing the durations of
audio files.
tgt (List[torch.LongTensor]): A list of LongTensors containing the indices
of target transcriptions.
tgt_dict (~fairseq.data.Dictionary): target vocabulary.
ids (List[str]): A list of utterance IDs.
speakers (List[str]): A list of speakers corresponding to utterances.
num_mel_bins (int): Number of triangular mel-frequency bins (default: 80)
frame_length (float): Frame length in milliseconds (default: 25.0)
frame_shift (float): Frame shift in milliseconds (default: 10.0)
"""
def __init__(
self,
aud_paths,
aud_durations_ms,
tgt,
tgt_dict,
ids,
speakers,
num_mel_bins=80,
frame_length=25.0,
frame_shift=10.0,
):
assert frame_length > 0
assert frame_shift > 0
assert all(x > frame_length for x in aud_durations_ms)
self.frame_sizes = [
int(1 + (d - frame_length) / frame_shift) for d in aud_durations_ms
]
assert len(aud_paths) > 0
assert len(aud_paths) == len(aud_durations_ms)
assert len(aud_paths) == len(tgt)
assert len(aud_paths) == len(ids)
assert len(aud_paths) == len(speakers)
self.aud_paths = aud_paths
self.tgt_dict = tgt_dict
self.tgt = tgt
self.ids = ids
self.speakers = speakers
self.num_mel_bins = num_mel_bins
self.frame_length = frame_length
self.frame_shift = frame_shift
self.s2s_collater = Seq2SeqCollater(
0,
1,
pad_index=self.tgt_dict.pad(),
eos_index=self.tgt_dict.eos(),
move_eos_to_beginning=True,
)
def __getitem__(self, index):
import torchaudio
import torchaudio.compliance.kaldi as kaldi
tgt_item = self.tgt[index] if self.tgt is not None else None
path = self.aud_paths[index]
if not os.path.exists(path):
raise FileNotFoundError("Audio file not found: {}".format(path))
sound, sample_rate = torchaudio.load_wav(path)
output = kaldi.fbank(
sound,
num_mel_bins=self.num_mel_bins,
frame_length=self.frame_length,
frame_shift=self.frame_shift,
)
output_cmvn = data_utils.apply_mv_norm(output)
return {"id": index, "data": [output_cmvn.detach(), tgt_item]}
def __len__(self):
return len(self.aud_paths)
def collater(self, samples):
"""Merge a list of samples to form a mini-batch.
Args:
samples (List[int]): sample indices to collate
Returns:
dict: a mini-batch suitable for forwarding with a Model
"""
return self.s2s_collater.collate(samples)
def num_tokens(self, index):
return self.frame_sizes[index]
def size(self, index):
"""Return an example's size as a float or tuple. This value is used when
filtering a dataset with ``--max-positions``."""
return (
self.frame_sizes[index],
len(self.tgt[index]) if self.tgt is not None else 0,
)
def ordered_indices(self):
"""Return an ordered list of indices. Batches will be constructed based
on this order."""
return np.arange(len(self))
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
This module contains a collection of classes which implement
collate functionality for various tasks.
Collaters should know what data to expect for each sample
and they should pack / collate them into batches
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import torch
from fairseq.data import data_utils as fairseq_data_utils
class Seq2SeqCollater(object):
"""
Implements collate function mainly for seq2seq tasks
This expects each sample to contain feature (src_tokens) and
targets.
This collator is also used for aligned training task.
"""
def __init__(
self,
feature_index=0,
label_index=1,
pad_index=1,
eos_index=2,
move_eos_to_beginning=True,
):
self.feature_index = feature_index
self.label_index = label_index
self.pad_index = pad_index
self.eos_index = eos_index
self.move_eos_to_beginning = move_eos_to_beginning
def _collate_frames(self, frames):
"""Convert a list of 2d frames into a padded 3d tensor
Args:
frames (list): list of 2d frames of size L[i]*f_dim. Where L[i] is
length of i-th frame and f_dim is static dimension of features
Returns:
3d tensor of size len(frames)*len_max*f_dim where len_max is max of L[i]
"""
len_max = max(frame.size(0) for frame in frames)
f_dim = frames[0].size(1)
res = frames[0].new(len(frames), len_max, f_dim).fill_(0.0)
for i, v in enumerate(frames):
res[i, : v.size(0)] = v
return res
def collate(self, samples):
"""
utility function to collate samples into batch for speech recognition.
"""
if len(samples) == 0:
return {}
# parse samples into torch tensors
parsed_samples = []
for s in samples:
# skip invalid samples
if s["data"][self.feature_index] is None:
continue
source = s["data"][self.feature_index]
if isinstance(source, (np.ndarray, np.generic)):
source = torch.from_numpy(source)
target = s["data"][self.label_index]
if isinstance(target, (np.ndarray, np.generic)):
target = torch.from_numpy(target).long()
elif isinstance(target, list):
target = torch.LongTensor(target)
parsed_sample = {"id": s["id"], "source": source, "target": target}
parsed_samples.append(parsed_sample)
samples = parsed_samples
id = torch.LongTensor([s["id"] for s in samples])
frames = self._collate_frames([s["source"] for s in samples])
# sort samples by descending number of frames
frames_lengths = torch.LongTensor([s["source"].size(0) for s in samples])
frames_lengths, sort_order = frames_lengths.sort(descending=True)
id = id.index_select(0, sort_order)
frames = frames.index_select(0, sort_order)
target = None
target_lengths = None
prev_output_tokens = None
if samples[0].get("target", None) is not None:
ntokens = sum(len(s["target"]) for s in samples)
target = fairseq_data_utils.collate_tokens(
[s["target"] for s in samples],
self.pad_index,
self.eos_index,
left_pad=False,
move_eos_to_beginning=False,
)
target = target.index_select(0, sort_order)
target_lengths = torch.LongTensor(
[s["target"].size(0) for s in samples]
).index_select(0, sort_order)
prev_output_tokens = fairseq_data_utils.collate_tokens(
[s["target"] for s in samples],
self.pad_index,
self.eos_index,
left_pad=False,
move_eos_to_beginning=self.move_eos_to_beginning,
)
prev_output_tokens = prev_output_tokens.index_select(0, sort_order)
else:
ntokens = sum(len(s["source"]) for s in samples)
batch = {
"id": id,
"ntokens": ntokens,
"net_input": {"src_tokens": frames, "src_lengths": frames_lengths},
"target": target,
"target_lengths": target_lengths,
"nsentences": len(samples),
}
if prev_output_tokens is not None:
batch["net_input"]["prev_output_tokens"] = prev_output_tokens
return batch
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
def calc_mean_invstddev(feature):
if len(feature.size()) != 2:
raise ValueError("We expect the input feature to be 2-D tensor")
mean = feature.mean(0)
var = feature.var(0)
# avoid division by ~zero
eps = 1e-8
if (var < eps).any():
return mean, 1.0 / (torch.sqrt(var) + eps)
return mean, 1.0 / torch.sqrt(var)
def apply_mv_norm(features):
    # If there are fewer than 2 frames, the variance cannot be computed (it is NaN)
    # and normalization is not possible, so return the input as it is
if features.size(0) < 2:
return features
mean, invstddev = calc_mean_invstddev(features)
res = (features - mean) * invstddev
return res
def lengths_to_encoder_padding_mask(lengths, batch_first=False):
"""
convert lengths (a 1-D Long/Int tensor) to 2-D binary tensor
Args:
lengths: a (B, )-shaped tensor
Return:
max_length: maximum length of B sequences
encoder_padding_mask: a (max_length, B) binary mask, where
[t, b] = 0 for t < lengths[b] and 1 otherwise
TODO:
kernelize this function if benchmarking shows this function is slow
"""
max_lengths = torch.max(lengths).item()
bsz = lengths.size(0)
encoder_padding_mask = torch.arange(
max_lengths
).to( # a (T, ) tensor with [0, ..., T-1]
lengths.device
).view( # move to the right device
1, max_lengths
).expand( # reshape to (1, T)-shaped tensor
bsz, -1
) >= lengths.view( # expand to (B, T)-shaped tensor
bsz, 1
).expand(
-1, max_lengths
)
if not batch_first:
return encoder_padding_mask.t(), max_lengths
else:
return encoder_padding_mask, max_lengths
def encoder_padding_mask_to_lengths(
encoder_padding_mask, max_lengths, batch_size, device
):
"""
convert encoder_padding_mask (2-D binary tensor) to a 1-D tensor
    Conventionally, encoder output contains an encoder_padding_mask, which is
    a 2-D mask in a shape (T, B), whose (t, b) element indicates whether
    encoder_out[t, b] is a valid output (=0) or not (=1). Occasionally, we
    need to convert this mask tensor to a 1-D tensor in shape (B, ), where
    [b] denotes the valid length of the b-th sequence
Args:
encoder_padding_mask: a (T, B)-shaped binary tensor or None; if None,
indicating all are valid
Return:
seq_lengths: a (B,)-shaped tensor, where its (b, )-th element is the
number of valid elements of b-th sequence
max_lengths: maximum length of all sequence, if encoder_padding_mask is
not None, max_lengths must equal to encoder_padding_mask.size(0)
batch_size: batch size; if encoder_padding_mask is
not None, max_lengths must equal to encoder_padding_mask.size(1)
device: which device to put the result on
"""
if encoder_padding_mask is None:
return torch.Tensor([max_lengths] * batch_size).to(torch.int32).to(device)
assert encoder_padding_mask.size(0) == max_lengths, "max_lengths does not match"
assert encoder_padding_mask.size(1) == batch_size, "batch_size does not match"
return max_lengths - torch.sum(encoder_padding_mask, dim=0)
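# Illustrative usage sketch (added as documentation, not part of the original
# file): normalize a random feature matrix and round-trip a padding mask.
# torch is already imported at the top of this file.
if __name__ == "__main__":
    feats = torch.randn(100, 80)               # (frames, mel bins)
    normed = apply_mv_norm(feats)
    print(normed.mean(0).abs().max())          # ~0: per-dimension mean removed
    print(normed.std(0).mean())                # ~1: per-dimension variance normalized

    lengths = torch.tensor([7, 3, 5])
    mask, max_len = lengths_to_encoder_padding_mask(lengths, batch_first=True)
    print(mask.int())                          # 1 marks padded positions
    # Converting the (T, B) mask back recovers the original lengths.
    print(encoder_padding_mask_to_lengths(mask.t(), max_len, 3, mask.device))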
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Replabel transforms for use with flashlight's ASG criterion.
"""
def replabel_symbol(i):
"""
Replabel symbols used in flashlight, currently just "1", "2", ...
This prevents training with numeral tokens, so this might change in the future
"""
return str(i)
def pack_replabels(tokens, dictionary, max_reps):
"""
Pack a token sequence so that repeated symbols are replaced by replabels
"""
if len(tokens) == 0 or max_reps <= 0:
return tokens
replabel_value_to_idx = [0] * (max_reps + 1)
for i in range(1, max_reps + 1):
replabel_value_to_idx[i] = dictionary.index(replabel_symbol(i))
result = []
prev_token = -1
num_reps = 0
for token in tokens:
if token == prev_token and num_reps < max_reps:
num_reps += 1
else:
if num_reps > 0:
result.append(replabel_value_to_idx[num_reps])
num_reps = 0
result.append(token)
prev_token = token
if num_reps > 0:
result.append(replabel_value_to_idx[num_reps])
return result
def unpack_replabels(tokens, dictionary, max_reps):
"""
Unpack a token sequence so that replabels are replaced by repeated symbols
"""
if len(tokens) == 0 or max_reps <= 0:
return tokens
replabel_idx_to_value = {}
for i in range(1, max_reps + 1):
replabel_idx_to_value[dictionary.index(replabel_symbol(i))] = i
result = []
prev_token = -1
for token in tokens:
try:
for _ in range(replabel_idx_to_value[token]):
result.append(prev_token)
prev_token = -1
except KeyError:
result.append(token)
prev_token = token
return result
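# Illustrative round-trip sketch (added as documentation, not part of the
# original file): pack repeated tokens into replabels and unpack them again.
if __name__ == "__main__":
    from fairseq.data import Dictionary

    d = Dictionary()
    for sym in ["a", "b", replabel_symbol(1), replabel_symbol(2)]:
        d.add_symbol(sym)

    tokens = [d.index("a")] * 3 + [d.index("b")]
    packed = pack_replabels(tokens, d, 2)
    # "a a a b" becomes "a 2 b": the two extra repeats collapse into replabel "2".
    assert unpack_replabels(packed, d, 2) == tokens
    print(tokens, "->", packed)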
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import concurrent.futures
import json
import multiprocessing
import os
from collections import namedtuple
from itertools import chain
import sentencepiece as spm
from fairseq.data import Dictionary
MILLISECONDS_TO_SECONDS = 0.001
def process_sample(aud_path, label, utt_id, sp, tgt_dict):
    import torchaudio

    input = {}
    output = {}
    si, ei = torchaudio.info(aud_path)
    input["length_ms"] = int(
        si.length / si.channels / si.rate / MILLISECONDS_TO_SECONDS
    )
    input["path"] = aud_path
    token = " ".join(sp.EncodeAsPieces(label))
    ids = tgt_dict.encode_line(token, append_eos=False)
    output["text"] = label
    output["token"] = token
    output["tokenid"] = ", ".join(map(str, [t.tolist() for t in ids]))
    return {utt_id: {"input": input, "output": output}}
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--audio-dirs",
nargs="+",
default=["-"],
required=True,
help="input directories with audio files",
)
parser.add_argument(
"--labels",
required=True,
help="aggregated input labels with format <ID LABEL> per line",
type=argparse.FileType("r", encoding="UTF-8"),
)
parser.add_argument(
"--spm-model",
required=True,
help="sentencepiece model to use for encoding",
type=argparse.FileType("r", encoding="UTF-8"),
)
parser.add_argument(
"--dictionary",
required=True,
help="file to load fairseq dictionary from",
type=argparse.FileType("r", encoding="UTF-8"),
)
parser.add_argument("--audio-format", choices=["flac", "wav"], default="wav")
parser.add_argument(
"--output",
required=True,
type=argparse.FileType("w"),
help="path to save json output",
)
args = parser.parse_args()
sp = spm.SentencePieceProcessor()
sp.Load(args.spm_model.name)
tgt_dict = Dictionary.load(args.dictionary)
labels = {}
for line in args.labels:
(utt_id, label) = line.split(" ", 1)
labels[utt_id] = label
if len(labels) == 0:
raise Exception("No labels found in ", args.labels_path)
Sample = namedtuple("Sample", "aud_path utt_id")
samples = []
for path, _, files in chain.from_iterable(
os.walk(path) for path in args.audio_dirs
):
for f in files:
if f.endswith(args.audio_format):
if len(os.path.splitext(f)) != 2:
raise Exception("Expect <utt_id.extension> file name. Got: ", f)
utt_id = os.path.splitext(f)[0]
if utt_id not in labels:
continue
samples.append(Sample(os.path.join(path, f), utt_id))
utts = {}
num_cpu = multiprocessing.cpu_count()
with concurrent.futures.ThreadPoolExecutor(max_workers=num_cpu) as executor:
future_to_sample = {
executor.submit(
process_sample, s.aud_path, labels[s.utt_id], s.utt_id, sp, tgt_dict
): s
for s in samples
}
for future in concurrent.futures.as_completed(future_to_sample):
try:
data = future.result()
except Exception as exc:
print("generated an exception: ", exc)
else:
utts.update(data)
json.dump({"utts": utts}, args.output, indent=4)
if __name__ == "__main__":
main()
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Prepare librispeech dataset
base_url=www.openslr.org/resources/12
train_dir=train_960
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <download_dir> <out_dir>"
echo "e.g.: $0 /tmp/librispeech_raw/ ~/data/librispeech_final"
exit 1
fi
download_dir=${1%/}
out_dir=${2%/}
fairseq_root=~/fairseq-py/
mkdir -p ${out_dir}
cd ${out_dir} || exit
nbpe=5000
bpemode=unigram
if [ ! -d "$fairseq_root" ]; then
echo "$0: Please set correct fairseq_root"
exit 1
fi
echo "Data Download"
for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
url=$base_url/$part.tar.gz
if ! wget -P $download_dir $url; then
echo "$0: wget failed for $url"
exit 1
fi
if ! tar -C $download_dir -xvzf $download_dir/$part.tar.gz; then
echo "$0: error un-tarring archive $download_dir/$part.tar.gz"
exit 1
fi
done
echo "Merge all train packs into one"
mkdir -p ${download_dir}/LibriSpeech/${train_dir}/
for part in train-clean-100 train-clean-360 train-other-500; do
mv ${download_dir}/LibriSpeech/${part}/* $download_dir/LibriSpeech/${train_dir}/
done
echo "Merge train text"
find ${download_dir}/LibriSpeech/${train_dir}/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/${train_dir}/text
# Use combined dev-clean and dev-other as validation set
find ${download_dir}/LibriSpeech/dev-clean/ ${download_dir}/LibriSpeech/dev-other/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/valid_text
find ${download_dir}/LibriSpeech/test-clean/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/test-clean/text
find ${download_dir}/LibriSpeech/test-other/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/test-other/text
dict=data/lang_char/${train_dir}_${bpemode}${nbpe}_units.txt
encoded=data/lang_char/${train_dir}_${bpemode}${nbpe}_encoded.txt
fairseq_dict=data/lang_char/${train_dir}_${bpemode}${nbpe}_fairseq_dict.txt
bpemodel=data/lang_char/${train_dir}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
echo "Dictionary preparation"
mkdir -p data/lang_char/
echo "<unk> 3" > ${dict}
echo "</s> 2" >> ${dict}
echo "<pad> 1" >> ${dict}
cut -f 2- -d" " ${download_dir}/LibriSpeech/${train_dir}/text > data/lang_char/input.txt
spm_train --input=data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --unk_id=3 --eos_id=2 --pad_id=1 --bos_id=-1 --character_coverage=1
spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_char/input.txt > ${encoded}
cat ${encoded} | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+3}' >> ${dict}
cat ${encoded} | tr ' ' '\n' | sort | uniq -c | awk '{print $2 " " $1}' > ${fairseq_dict}
wc -l ${dict}
echo "Prepare train and test jsons"
for part in train_960 test-other test-clean; do
python ${fairseq_root}/examples/speech_recognition/datasets/asr_prep_json.py --audio-dirs ${download_dir}/LibriSpeech/${part} --labels ${download_dir}/LibriSpeech/${part}/text --spm-model ${bpemodel}.model --audio-format flac --dictionary ${fairseq_dict} --output ${part}.json
done
# fairseq expects to find train.json and valid.json during training
mv train_960.json train.json
echo "Prepare valid json"
python ${fairseq_root}/examples/speech_recognition/datasets/asr_prep_json.py --audio-dirs ${download_dir}/LibriSpeech/dev-clean ${download_dir}/LibriSpeech/dev-other --labels ${download_dir}/LibriSpeech/valid_text --spm-model ${bpemodel}.model --audio-format flac --dictionary ${fairseq_dict} --output valid.json
cp ${fairseq_dict} ./dict.txt
cp ${bpemodel}.model ./spm.model
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <iostream>
#include "fstext/fstext-lib.h" // @manual
#include "util/common-utils.h" // @manual
/*
* This program is to modify a FST without self-loop by:
* for each incoming arc with non-eps input symbol, add a self-loop arc
* with that non-eps symbol as input and eps as output.
*
* This is to make sure the resultant FST can do deduplication for repeated
* symbols, which is very common in acoustic model
*
*/
namespace {
int32 AddSelfLoopsSimple(fst::StdVectorFst* fst) {
typedef fst::MutableArcIterator<fst::StdVectorFst> IterType;
int32 num_states_before = fst->NumStates();
fst::MakePrecedingInputSymbolsSame(false, fst);
int32 num_states_after = fst->NumStates();
KALDI_LOG << "There are " << num_states_before
<< " states in the original FST; "
<< " after MakePrecedingInputSymbolsSame, there are "
<< num_states_after << " states " << std::endl;
auto weight_one = fst::StdArc::Weight::One();
int32 num_arc_added = 0;
fst::StdArc self_loop_arc;
self_loop_arc.weight = weight_one;
int32 num_states = fst->NumStates();
std::vector<std::set<int32>> incoming_non_eps_label_per_state(num_states);
for (int32 state = 0; state < num_states; state++) {
for (IterType aiter(fst, state); !aiter.Done(); aiter.Next()) {
fst::StdArc arc(aiter.Value());
if (arc.ilabel != 0) {
incoming_non_eps_label_per_state[arc.nextstate].insert(arc.ilabel);
}
}
}
for (int32 state = 0; state < num_states; state++) {
if (!incoming_non_eps_label_per_state[state].empty()) {
auto& ilabel_set = incoming_non_eps_label_per_state[state];
for (auto it = ilabel_set.begin(); it != ilabel_set.end(); it++) {
self_loop_arc.ilabel = *it;
self_loop_arc.olabel = 0;
self_loop_arc.nextstate = state;
fst->AddArc(state, self_loop_arc);
num_arc_added++;
}
}
}
return num_arc_added;
}
void print_usage() {
std::cout << "add-self-loop-simple usage:\n"
"\tadd-self-loop-simple <in-fst> <out-fst> \n";
}
} // namespace
int main(int argc, char** argv) {
if (argc != 3) {
print_usage();
exit(1);
}
auto input = argv[1];
auto output = argv[2];
auto fst = fst::ReadFstKaldi(input);
auto num_states = fst->NumStates();
KALDI_LOG << "Loading FST from " << input << " with " << num_states
<< " states." << std::endl;
int32 num_arc_added = AddSelfLoopsSimple(fst);
KALDI_LOG << "Adding " << num_arc_added << " self-loop arcs " << std::endl;
fst::WriteFstKaldi(*fst, std::string(output));
KALDI_LOG << "Writing FST to " << output << std::endl;
delete fst;
}
# @package _group_
data_dir: ???
fst_dir: ???
in_labels: ???
kaldi_root: ???
lm_arpa: ???
blank_symbol: <s>
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from concurrent.futures import ThreadPoolExecutor
import logging
from omegaconf import MISSING
import os
import torch
from typing import Optional
import warnings
from dataclasses import dataclass
from fairseq.dataclass import FairseqDataclass
from .kaldi_initializer import KaldiInitializerConfig, initalize_kaldi
logger = logging.getLogger(__name__)
@dataclass
class KaldiDecoderConfig(FairseqDataclass):
hlg_graph_path: Optional[str] = None
output_dict: str = MISSING
kaldi_initializer_config: Optional[KaldiInitializerConfig] = None
acoustic_scale: float = 0.5
max_active: int = 10000
beam_delta: float = 0.5
hash_ratio: float = 2.0
is_lattice: bool = False
lattice_beam: float = 10.0
prune_interval: int = 25
determinize_lattice: bool = True
prune_scale: float = 0.1
max_mem: int = 0
phone_determinize: bool = True
word_determinize: bool = True
minimize: bool = True
num_threads: int = 1
class KaldiDecoder(object):
def __init__(
self,
cfg: KaldiDecoderConfig,
beam: int,
nbest: int = 1,
):
try:
from kaldi.asr import FasterRecognizer, LatticeFasterRecognizer
from kaldi.base import set_verbose_level
from kaldi.decoder import (
FasterDecoder,
FasterDecoderOptions,
LatticeFasterDecoder,
LatticeFasterDecoderOptions,
)
from kaldi.lat.functions import DeterminizeLatticePhonePrunedOptions
from kaldi.fstext import read_fst_kaldi, SymbolTable
except:
warnings.warn(
"pykaldi is required for this functionality. Please install from https://github.com/pykaldi/pykaldi"
)
# set_verbose_level(2)
self.acoustic_scale = cfg.acoustic_scale
self.nbest = nbest
if cfg.hlg_graph_path is None:
assert (
cfg.kaldi_initializer_config is not None
), "Must provide hlg graph path or kaldi initializer config"
cfg.hlg_graph_path = initalize_kaldi(cfg.kaldi_initializer_config)
assert os.path.exists(cfg.hlg_graph_path), cfg.hlg_graph_path
if cfg.is_lattice:
self.dec_cls = LatticeFasterDecoder
opt_cls = LatticeFasterDecoderOptions
self.rec_cls = LatticeFasterRecognizer
else:
assert self.nbest == 1, "nbest > 1 requires lattice decoder"
self.dec_cls = FasterDecoder
opt_cls = FasterDecoderOptions
self.rec_cls = FasterRecognizer
self.decoder_options = opt_cls()
self.decoder_options.beam = beam
self.decoder_options.max_active = cfg.max_active
self.decoder_options.beam_delta = cfg.beam_delta
self.decoder_options.hash_ratio = cfg.hash_ratio
if cfg.is_lattice:
self.decoder_options.lattice_beam = cfg.lattice_beam
self.decoder_options.prune_interval = cfg.prune_interval
self.decoder_options.determinize_lattice = cfg.determinize_lattice
self.decoder_options.prune_scale = cfg.prune_scale
det_opts = DeterminizeLatticePhonePrunedOptions()
det_opts.max_mem = cfg.max_mem
det_opts.phone_determinize = cfg.phone_determinize
det_opts.word_determinize = cfg.word_determinize
det_opts.minimize = cfg.minimize
self.decoder_options.det_opts = det_opts
self.output_symbols = {}
with open(cfg.output_dict, "r") as f:
for line in f:
items = line.rstrip().split()
assert len(items) == 2
self.output_symbols[int(items[1])] = items[0]
logger.info(f"Loading FST from {cfg.hlg_graph_path}")
self.fst = read_fst_kaldi(cfg.hlg_graph_path)
self.symbol_table = SymbolTable.read_text(cfg.output_dict)
self.executor = ThreadPoolExecutor(max_workers=cfg.num_threads)
def generate(self, models, sample, **unused):
"""Generate a batch of inferences."""
# model.forward normally channels prev_output_tokens into the decoder
# separately, but SequenceGenerator directly calls model.encoder
encoder_input = {
k: v for k, v in sample["net_input"].items() if k != "prev_output_tokens"
}
emissions, padding = self.get_emissions(models, encoder_input)
return self.decode(emissions, padding)
def get_emissions(self, models, encoder_input):
"""Run encoder and normalize emissions"""
model = models[0]
all_encoder_out = [m(**encoder_input) for m in models]
if len(all_encoder_out) > 1:
if "encoder_out" in all_encoder_out[0]:
encoder_out = {
"encoder_out": sum(e["encoder_out"] for e in all_encoder_out)
/ len(all_encoder_out),
"encoder_padding_mask": all_encoder_out[0]["encoder_padding_mask"],
}
padding = encoder_out["encoder_padding_mask"]
else:
encoder_out = {
"logits": sum(e["logits"] for e in all_encoder_out)
/ len(all_encoder_out),
"padding_mask": all_encoder_out[0]["padding_mask"],
}
padding = encoder_out["padding_mask"]
else:
encoder_out = all_encoder_out[0]
padding = (
encoder_out["padding_mask"]
if "padding_mask" in encoder_out
else encoder_out["encoder_padding_mask"]
)
if hasattr(model, "get_logits"):
emissions = model.get_logits(encoder_out, normalize=True)
else:
emissions = model.get_normalized_probs(encoder_out, log_probs=True)
return (
emissions.cpu().float().transpose(0, 1),
padding.cpu() if padding is not None and padding.any() else None,
)
def decode_one(self, logits, padding):
from kaldi.matrix import Matrix
decoder = self.dec_cls(self.fst, self.decoder_options)
asr = self.rec_cls(
decoder, self.symbol_table, acoustic_scale=self.acoustic_scale
)
if padding is not None:
logits = logits[~padding]
mat = Matrix(logits.numpy())
out = asr.decode(mat)
if self.nbest > 1:
from kaldi.fstext import shortestpath
from kaldi.fstext.utils import (
convert_compact_lattice_to_lattice,
convert_lattice_to_std,
convert_nbest_to_list,
get_linear_symbol_sequence,
)
lat = out["lattice"]
sp = shortestpath(lat, nshortest=self.nbest)
sp = convert_compact_lattice_to_lattice(sp)
sp = convert_lattice_to_std(sp)
seq = convert_nbest_to_list(sp)
results = []
for s in seq:
_, o, w = get_linear_symbol_sequence(s)
words = list(self.output_symbols[z] for z in o)
results.append(
{
"tokens": words,
"words": words,
"score": w.value,
"emissions": logits,
}
)
return results
else:
words = out["text"].split()
return [
{
"tokens": words,
"words": words,
"score": out["likelihood"],
"emissions": logits,
}
]
def decode(self, emissions, padding):
if padding is None:
padding = [None] * len(emissions)
ret = list(
map(
lambda e, p: self.executor.submit(self.decode_one, e, p),
emissions,
padding,
)
)
return ret
import importlib
import os
for file in sorted(os.listdir(os.path.dirname(__file__))):
if file.endswith(".py") and not file.startswith("_"):
model_name = file[: file.find(".py")]
importlib.import_module("examples.speech_recognition.models." + model_name)