Commit 0eb35dda authored by Nikhilesh Bhatnagar's avatar Nikhilesh Bhatnagar

Added 11 more language pairs. Support for greater than 1 sentences per...

Added 11 more language pairs. Support for greater than 1 sentences per request. Support for different language pairs in the same request.
parent 19adc2a4
ssmt_triton_repo
himangy_triton_repo
\ No newline at end of file
......@@ -11,20 +11,18 @@ Quantization disabled until qualitative testing is performed. For now, the argum
```bash
git clone https://ssmt.iiit.ac.in/meitygit/ssmt/mt-model-deploy-dhruva.git
cd mt-model-deploy-dhruva
bash make_triton_model_repo.sh "https://ssmt.iiit.ac.in/uploads/data_mining/models.zip" "int8"
docker build -t dhruva/ssmt-model-server:1 .
nvidia-docker run --gpus=all --rm --shm-size 5g --network=host --name dhruva-ssmt-triton-server -v./ssmt_triton_repo:/models dhruva/ssmt-model-server:1
bash make_triton_model_repo.sh
docker build -t dhruva/himangy-model-server:1 .
nvidia-docker run --gpus=all --rm --shm-size 5g --network=host --name dhruva-himangy-triton-server -v./himangy_triton_repo:/models dhruva/himangy-model-server:1
```
## What this repo does
* This repo contains the templates and component triton models for the SSMT project.
* Also contained is a Dockerfile to construct the triton server instance.
* Given a URL and quantization method (those supported by CTranslate2 i.e. `int8`, `int8_float16`, `int8_bfloat16`, `int16`, `float16` and `bfloat16`) it will download, quantize and construct the SSMT Triton Repository in `./ssmt_triton_repo` (int8 is the most efficient in size and speed on NVIDIA T4).
* Dynamic batching and caching is supported and enabled by default.
* The repository folder can me mounted to the dhruva ssmt triton server on `/models` and can be queried via a client.
* The repository folder can me mounted to the dhruva himangy triton server on `/models` and can be queried via a client.
* Sample client code is also given as an ipython notebook.
* The `model.zip` package needs to contain a folder of `.pt` and `.src` files named `1` through `9` with each file corresponding to the following mapping: `{'en-hi': 1, 'hi-en': 2, 'te-en': 4, 'hi-te': 6, 'te-hi': 7, 'en-gu': 8, 'gu-en': 9}`
## Architecture of the pipeline
......@@ -39,15 +37,15 @@ One can construct the triton repo like so:
```bash
git clone https://ssmt.iiit.ac.in/meitygit/ssmt/mt-model-deploy-dhruva.git
cd mt-model-deploy-dhruva
bash make_triton_model_repo.sh "https://ssmt.iiit.ac.in/uploads/data_mining/models.zip" "int8"
bash make_triton_model_repo.sh
```
## Starting the triton server
We customize the tritonserver image with the required python packages in a venv and enable the cache in the startup command. After the model repo has beeen built, one can build and run the server like so:
```bash
docker build -t dhruva/ssmt-model-server:1 .
nvidia-docker run --gpus=all --rm --shm-size 5g --network=host --name dhruva-ssmt-triton-server -v./ssmt_triton_repo:/models dhruva/ssmt-model-server:1
docker build -t dhruva/himangy-model-server:1 .
nvidia-docker run --gpus=all --rm --shm-size 5g --network=host --name dhruva-himangy-triton-server -v./himangy_triton_repo:/models dhruva/himangy-model-server:1
```
## Querying the triton server
......
This diff is collapsed.
import json
import numpy
import asyncio
import triton_python_backend_utils as pb_utils
class TritonPythonModel:
def initialize(self, args): self.target_dtype = pb_utils.triton_string_to_numpy(pb_utils.get_output_config_by_name(json.loads(args['model_config']), 'OUTPUT_TEXT')['data_type'])
async def execute(self, requests): return [pb_utils.InferenceResponse(output_tensors=[pb_utils.Tensor('OUTPUT_TEXT', numpy.array([[pb_utils.get_output_tensor_by_name(result, 'OUTPUT_SENT').as_numpy()[0, 0].decode('utf-8')] for result in (await asyncio.gather(*awaits))], dtype=self.target_dtype))]) for awaits in [[pb_utils.InferenceRequest(model_name=f"himangy-{input_language_id[0].decode('utf-8')}-{output_language_id[0].decode('utf-8')}", requested_output_names=['OUTPUT_SENT'], inputs=[pb_utils.Tensor('INPUT_SENT_TOKENIZED', numpy.array([[input_text_tokenized[0].decode('utf-8')]], dtype='object'))]).async_exec() for input_text_tokenized, input_language_id, output_language_id in zip(pb_utils.get_input_tensor_by_name(request, 'INPUT_TEXT_TOKENIZED').as_numpy(), pb_utils.get_input_tensor_by_name(request, 'INPUT_LANGUAGE_ID').as_numpy(), pb_utils.get_input_tensor_by_name(request, 'OUTPUT_LANGUAGE_ID').as_numpy())] for request in requests]]
def finalize(self): pass
\ No newline at end of file
name: "ssmt_model_demuxer"
name: "demuxer"
backend: "python"
max_batch_size: 4096
......@@ -40,7 +40,3 @@ instance_group [
kind: KIND_CPU
}
]
\ No newline at end of file
response_cache {
enable: true
}
......@@ -8,25 +8,24 @@ import triton_python_backend_utils as pb_utils
class TritonPythonModel:
def initialize(self, args):
current_path = os.path.dirname(os.path.abspath(__file__))
self.source_lang, self.target_lang = input_lang, output_lang
self.model_config = json.loads(args["model_config"])
self.device_id = int(json.loads(args['model_instance_device_id']))
target_config = pb_utils.get_output_config_by_name(self.model_config, "OUTPUT_TEXT")
target_config = pb_utils.get_output_config_by_name(self.model_config, "OUTPUT_SENT")
self.target_dtype = pb_utils.triton_string_to_numpy(target_config["data_type"])
try: self.translator = Translator(f"{os.path.join(current_path, 'translator')}", device="cuda", intra_threads=1, inter_threads=1, device_index=[self.device_id])
except: self.translator = Translator(f"{os.path.join(current_path, 'translator')}", device="cpu", intra_threads=4)
def clean_output(self, text):
text = text.replace('@@ ', '')
text = text.replace('\u200c', '')
if text.startswith('<to-gu> '): text = text[8:]
if text.endswith(' <to-gu>'): text = text[:-8]
return text
def execute(self, requests):
source_list = [pb_utils.get_input_tensor_by_name(request, "INPUT_TEXT_TOKENIZED") for request in requests]
source_list = [pb_utils.get_input_tensor_by_name(request, "INPUT_SENT_TOKENIZED") for request in requests]
bsize_list = [source.as_numpy().shape[0] for source in source_list]
src_sentences = [s[0].decode('utf-8').strip().split(' ') for source in source_list for s in source.as_numpy()]
tgt_sentences = [self.clean_output(' '.join(result.hypotheses[0])) for result in self.translator.translate_iterable(src_sentences, max_batch_size=128, max_input_length=100, max_decoding_length=100)]
responses = [pb_utils.InferenceResponse(output_tensors=[pb_utils.Tensor("OUTPUT_TEXT", numpy.array([[s]for s in islice(tgt_sentences, bsize)], dtype='object').astype(self.target_dtype))]) for bsize in bsize_list]
responses = [pb_utils.InferenceResponse(output_tensors=[pb_utils.Tensor("OUTPUT_SENT", numpy.array([[s]for s in islice(tgt_sentences, bsize)], dtype='object').astype(self.target_dtype))]) for bsize in bsize_list]
return responses
def finalize(self): self.translator.unload_model()
\ No newline at end of file
......@@ -4,14 +4,15 @@ max_batch_size: 512
input [
{
name: "INPUT_TEXT_TOKENIZED"
name: "INPUT_SENT_TOKENIZED"
data_type: TYPE_STRING
dims: [ 1 ]
}
]
output [
{
name: "OUTPUT_TEXT"
name: "OUTPUT_SENT"
data_type: TYPE_STRING
dims: [ 1 ]
}
......
import os
import json
import numpy
from itertools import islice
from argparse import Namespace
import triton_python_backend_utils as pb_utils
from onmt.translate.translator import build_translator
class TritonPythonModel:
def initialize(self, args):
current_path = os.path.dirname(os.path.abspath(__file__))
self.source_lang, self.target_lang = input_lang, output_lang
self.model_config = json.loads(args["model_config"])
self.device_id = int(json.loads(args['model_instance_device_id']))
target_config = pb_utils.get_output_config_by_name(self.model_config, "OUTPUT_SENT")
self.target_dtype = pb_utils.triton_string_to_numpy(target_config["data_type"])
try: self.translator = build_translator(Namespace(tgt_prefix=False, alpha=0.0, batch_type='sents', beam_size=5, beta=-0.0, block_ngram_repeat=0, coverage_penalty='none', data_type='text', dump_beam='', fp32=True, gpu=self.device_id, ignore_when_blocking=[], length_penalty='none', max_length=100, max_sent_length=None, min_length=0, models=[f"{os.path.join(current_path, 'translator.pt')}"], n_best=1, output='/dev/null', phrase_table='', random_sampling_temp=1.0, random_sampling_topk=1, ratio=-0.0, replace_unk=False, report_align=False, report_time=False, seed=829, stepwise_penalty=False, tgt=None, verbose=False), report_score=False)
except: self.translator = build_translator(Namespace(tgt_prefix=False, alpha=0.0, batch_type='sents', beam_size=5, beta=-0.0, block_ngram_repeat=0, coverage_penalty='none', data_type='text', dump_beam='', fp32=True, gpu=-1, ignore_when_blocking=[], length_penalty='none', max_length=100, max_sent_length=None, min_length=0, models=[f"{os.path.join(current_path, 'translator.pt')}"], n_best=1, output='/dev/null', phrase_table='', random_sampling_temp=1.0, random_sampling_topk=1, ratio=-0.0, replace_unk=False, report_align=False, report_time=False, seed=829, stepwise_penalty=False, tgt=None, verbose=False), report_score=False)
def clean_output(self, text):
text = text.replace('@@ ', '')
text = text.replace('\u200c', '')
if text.startswith('<to-gu> '): text = text[8:]
if text.endswith(' <to-gu>'): text = text[:-8]
return text
def execute(self, requests):
source_list = [pb_utils.get_input_tensor_by_name(request, "INPUT_SENT_TOKENIZED") for request in requests]
bsize_list = [source.as_numpy().shape[0] for source in source_list]
src_sentences = [s[0].decode('utf-8').strip().split(' ') for source in source_list for s in source.as_numpy()]
tgt_sentences = [self.clean_output(result[0]) for result in self.translator.translate(src_sentences, batch_size=128)[1]]
responses = [pb_utils.InferenceResponse(output_tensors=[pb_utils.Tensor("OUTPUT_SENT", numpy.array([[s]for s in islice(tgt_sentences, bsize)], dtype='object').astype(self.target_dtype))]) for bsize in bsize_list]
return responses
def finalize(self): del self.translator
\ No newline at end of file
name: "model_name"
backend: "python"
max_batch_size: 512
input [
{
name: "INPUT_SENT_TOKENIZED"
data_type: TYPE_STRING
dims: [ 1 ]
}
]
output [
{
name: "OUTPUT_SENT"
data_type: TYPE_STRING
dims: [ 1 ]
}
]
dynamic_batching {}
instance_group [
{
count: 1
kind: KIND_GPU
}
]
response_cache {
enable: true
}
\ No newline at end of file
......@@ -35,7 +35,7 @@ output [
ensemble_scheduling {
step [
{
model_name: "ssmt_tokenizer"
model_name: "tokenizer"
model_version: 1
input_map {
key: "INPUT_TEXT"
......@@ -55,7 +55,7 @@ ensemble_scheduling {
}
},
{
model_name: "ssmt_model_demuxer"
model_name: "demuxer"
model_version: 1
input_map {
key: "INPUT_TEXT_TOKENIZED"
......@@ -76,7 +76,3 @@ ensemble_scheduling {
}
]
}
\ No newline at end of file
response_cache {
enable: true
}
import json
import asyncio
import triton_python_backend_utils as pb_utils
class TritonPythonModel:
def initialize(self, args):
self.model_config = json.loads(args["model_config"])
target_config = pb_utils.get_output_config_by_name(self.model_config, "OUTPUT_TEXT")
self.target_dtype = pb_utils.triton_string_to_numpy(target_config["data_type"])
self.lang_pair_map = {'en-hi': 1, 'hi-en': 2, 'te-en': 4, 'hi-te': 6, 'te-hi': 7, 'en-gu': 8, 'gu-en': 9}
async def execute(self, requests):
responses = []
infer_response_awaits = []
for request in requests:
language_pair = f"{pb_utils.get_input_tensor_by_name(request, 'INPUT_LANGUAGE_ID').as_numpy()[0, 0].decode('utf-8')}-{pb_utils.get_input_tensor_by_name(request, 'OUTPUT_LANGUAGE_ID').as_numpy()[0, 0].decode('utf-8')}"
inference_request = pb_utils.InferenceRequest(model_name=f'ssmt_{self.lang_pair_map[language_pair]}_ct2', requested_output_names=['OUTPUT_TEXT'], inputs=[pb_utils.get_input_tensor_by_name(request, 'INPUT_TEXT_TOKENIZED')])
infer_response_awaits.append(inference_request.async_exec())
responses = await asyncio.gather(*infer_response_awaits)
return responses
def finalize(self): pass
import os
import json
import numpy
from .apply_bpe import BPE
from ilstokenizer import tokenizer
import triton_python_backend_utils as pb_utils
class TritonPythonModel:
def initialize(self, args):
current_path = os.path.dirname(os.path.abspath(__file__))
self.model_config = json.loads(args["model_config"])
target_config = pb_utils.get_output_config_by_name(self.model_config, "INPUT_TEXT_TOKENIZED")
self.target_dtype = pb_utils.triton_string_to_numpy(target_config["data_type"])
self.lang_pair_map = {'en-hi': 1, 'hi-en': 2, 'en-te': 3, 'te-en': 4, 'hi-te': 6, 'te-hi': 7, 'en-gu': 8, 'gu-en': 9}
self.bpes = {lang_pair: BPE(open(os.path.join(current_path, f'bpe_src/{model_id}.src'), encoding='utf-8')) for lang_pair, model_id in self.lang_pair_map.items()}
def tokenize_and_segment(self, input_text, source_lang, target_lang):
tokenized_text = tokenizer.tokenize(input_text)
if source_lang == 'en' and target_lang == 'gu': tokenized_text = f'<to-gu> {tokenized_text} <to-gu>'
return self.bpes[f'{source_lang}-{target_lang}'].segment(tokenized_text).strip()
def execute(self, requests):
source_gen = ((pb_utils.get_input_tensor_by_name(request, "INPUT_TEXT"), pb_utils.get_input_tensor_by_name(request, "INPUT_LANGUAGE_ID"), pb_utils.get_input_tensor_by_name(request, "OUTPUT_LANGUAGE_ID")) for request in requests)
tokenized_gen = ((self.tokenize_and_segment(input_text[0].decode('utf-8'), input_language_id[0].decode('utf-8'), output_language_id[0].decode('utf-8')) for input_text, input_language_id, output_language_id in zip(input_texts.as_numpy(), input_language_ids.as_numpy(), output_language_ids.as_numpy())) for input_texts, input_language_ids, output_language_ids in source_gen)
responses = [pb_utils.InferenceResponse(output_tensors=[pb_utils.Tensor("INPUT_TEXT_TOKENIZED", numpy.array([[tokenized_sent] for tokenized_sent in tokenized_sents], dtype=self.target_dtype))]) for tokenized_sents in tokenized_gen]
return responses
def finalize(self): pass
import os
import json
import numpy
from glob import iglob
from .apply_bpe import BPE
from ilstokenizer import tokenizer
import triton_python_backend_utils as pb_utils
class TritonPythonModel:
def initialize(self, args): self.target_dtype, self.bpes = pb_utils.triton_string_to_numpy(pb_utils.get_output_config_by_name(json.loads(args["model_config"]), "INPUT_TEXT_TOKENIZED")["data_type"]), {fname.rsplit('/', maxsplit=1)[-1][:-len('.src')]: BPE(open(fname, 'r', encoding='utf-8')) for fname in iglob(f'{os.path.dirname(os.path.abspath(__file__))}/bpe_src/*.src')}
def preprocess_text(self, text, source_lang, target_lang): return f"<to-gu> {text} <to-gu>" if source_lang == 'en' and target_lang == 'gu' else text
def execute(self, requests): return [pb_utils.InferenceResponse(output_tensors=[pb_utils.Tensor("INPUT_TEXT_TOKENIZED", numpy.array([[tokenized_sent] for tokenized_sent in tokenized_sents], dtype=self.target_dtype))]) for tokenized_sents in ((self.bpes[f"{input_language_id[0].decode('utf-8')}-{output_language_id[0].decode('utf-8')}"].segment(self.preprocess_text(tokenizer.tokenize(input_text[0].decode('utf-8').lower()), input_language_id[0].decode('utf-8'), output_language_id[0].decode('utf-8'))).strip() for input_text, input_language_id, output_language_id in zip(input_texts.as_numpy(), input_language_ids.as_numpy(), output_language_ids.as_numpy())) for input_texts, input_language_ids, output_language_ids in ((pb_utils.get_input_tensor_by_name(request, "INPUT_TEXT"), pb_utils.get_input_tensor_by_name(request, "INPUT_LANGUAGE_ID"), pb_utils.get_input_tensor_by_name(request, "OUTPUT_LANGUAGE_ID")) for request in requests))]
def finalize(self): pass
\ No newline at end of file
name: "ssmt_tokenizer"
name: "tokenizer"
backend: "python"
max_batch_size: 4096
......@@ -40,7 +40,3 @@ instance_group [
kind: KIND_CPU
}
]
\ No newline at end of file
response_cache {
enable: true
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment