From 32d219e6ef66acbc487b65635799568c6bdc1bb9 Mon Sep 17 00:00:00 2001 From: Nikhilesh Bhatnagar Date: Tue, 1 Aug 2023 13:03:39 +0000 Subject: [PATCH] Quantization and initial performance benchmarking --- README.md | 6 ++-- make_triton_model_repo.sh | 18 +++++------ triton_client.ipynb | 64 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 71 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 313cbd5..c3996bd 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Prerequisites: `python3.xx-venv`, `nvidia-docker`, `bash` ```bash git clone https://ssmt.iiit.ac.in/meitygit/ssmt/mt-model-deploy-dhruva.git cd mt-model-deploy-dhruva -bash make_triton_model_repo.sh "https://ssmt.iiit.ac.in/uploads/data_mining/models.zip" "float16" +bash make_triton_model_repo.sh "https://ssmt.iiit.ac.in/uploads/data_mining/models.zip" "int8" docker build -t dhruva/ssmt-model-server:1 . nvidia-docker run --gpus=all --rm --shm-size 5g --network=host --name dhruva-ssmt-triton-server -v./ssmt_triton_repo:/models dhruva/ssmt-model-server:1 ``` @@ -18,7 +18,7 @@ nvidia-docker run --gpus=all --rm --shm-size 5g --network=host --name dhruva-ssm * This repo contains the templates and component triton models for the SSMT project. * Also contained is a Dockerfile to construct the triton server instance. -* Given a URL and quantization method (those supported by CTranslate2 i.e. `int8`, `int8_float16`, `int8_bfloat16`, `int16`, `float16` and `bfloat16`) it will download, quantize and construct the SSMT Triton Repository in `./ssmt_triton_repo` (disabled, will be enabled once testing is performed on representative hardware). +* Given a URL and quantization method (those supported by CTranslate2 i.e. `int8`, `int8_float16`, `int8_bfloat16`, `int16`, `float16` and `bfloat16`) it will download, quantize and construct the SSMT Triton Repository in `./ssmt_triton_repo` (int8 is the most efficient in size and speed on NVIDIA T4).
* Dynamic batching and caching is supported and enabled by default. * The repository folder can me mounted to the dhruva ssmt triton server on `/models` and can be queried via a client. * Sample client code is also given as an ipython notebook. @@ -37,7 +37,7 @@ One can construct the triton repo like so: ```bash git clone https://ssmt.iiit.ac.in/meitygit/ssmt/mt-model-deploy-dhruva.git cd mt-model-deploy-dhruva -bash make_triton_model_repo.sh "https://ssmt.iiit.ac.in/uploads/data_mining/models.zip" "float16" +bash make_triton_model_repo.sh "https://ssmt.iiit.ac.in/uploads/data_mining/models.zip" "int8" ``` ## Starting the triton server diff --git a/make_triton_model_repo.sh b/make_triton_model_repo.sh index cbfff7b..5fd264e 100644 --- a/make_triton_model_repo.sh +++ b/make_triton_model_repo.sh @@ -7,14 +7,14 @@ python3 -m venv ./ssmt_ct2 source ./ssmt_ct2/bin/activate pip install ctranslate2 "OpenNMT-py==1.2.0" cd models -ct2-opennmt-py-converter --model_path 1.pt --output_dir ./1_ct2 -ct2-opennmt-py-converter --model_path 2.pt --output_dir ./2_ct2 -# ct2-opennmt-py-converter --model_path 3.pt --output_dir ./3_ct2 -ct2-opennmt-py-converter --model_path 4.pt --output_dir ./4_ct2 -ct2-opennmt-py-converter --model_path 6.pt --output_dir ./6_ct2 -ct2-opennmt-py-converter --model_path 7.pt --output_dir ./7_ct2 -ct2-opennmt-py-converter --model_path 8.pt --output_dir ./8_ct2 -ct2-opennmt-py-converter --model_path 9.pt --output_dir ./9_ct2 +ct2-opennmt-py-converter --model_path 1.pt --quantization $QUANTIZATION --output_dir ./1_ct2 +ct2-opennmt-py-converter --model_path 2.pt --quantization $QUANTIZATION --output_dir ./2_ct2 +# ct2-opennmt-py-converter --model_path 3.pt --quantization $QUANTIZATION --output_dir ./3_ct2 +ct2-opennmt-py-converter --model_path 4.pt --quantization $QUANTIZATION --output_dir ./4_ct2 +ct2-opennmt-py-converter --model_path 6.pt --quantization $QUANTIZATION --output_dir ./6_ct2 +ct2-opennmt-py-converter --model_path 7.pt --quantization 
$QUANTIZATION --output_dir ./7_ct2 +ct2-opennmt-py-converter --model_path 8.pt --quantization $QUANTIZATION --output_dir ./8_ct2 +ct2-opennmt-py-converter --model_path 9.pt --quantization $QUANTIZATION --output_dir ./9_ct2 cd .. mkdir ssmt_triton_repo cd ssmt_triton_repo @@ -47,7 +47,7 @@ cp -r ../triton_models/ssmt_template_model_repo ssmt_9_ct2 cp -r ../models/9_ct2 ssmt_9_ct2/1/translator sed -i 's/model_name/ssmt_9_ct2/' ssmt_9_ct2/config.pbtxt cd .. -source ./ssmt_ct2/bin/activate +deactivate rm -rf ssmt_ct2 rm -f models.zip rm -rf models diff --git a/triton_client.ipynb b/triton_client.ipynb index ff202ae..ed5e2a6 100644 --- a/triton_client.ipynb +++ b/triton_client.ipynb @@ -100,21 +100,75 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:06<00:00, 5.36it/s]\n" + "100%|██████████| 1000/1000 [07:11<00:00, 2.32it/s]\n" ] } ], "source": [ - "with ThreadPool(100) as pool:\n", + "with ThreadPool(100) as pool: # float32 cpu load - 5.7 ram - 10.2 G gpu util - 100% vram - 4.7 G gpu wattage - 70 W\n", + " for output in tqdm(pool.imap_unordered(task, range(1000), chunksize=1), total=1000): pass" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1000/1000 [05:09<00:00, 3.23it/s]\n" + ] + } + ], + "source": [ + "with ThreadPool(100) as pool: # float16 cpu load - 5.2 ram - 10.3 G gpu util - 99% vram - 3.5 G gpu wattage - 65 W\n", + " for output in tqdm(pool.imap_unordered(task, range(1000), chunksize=1), total=1000): pass" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1000/1000 [05:20<00:00, 3.12it/s]\n" + ] + } + ], + 
"source": [ + "with ThreadPool(100) as pool: # int8_float16 cpu load - 5.7 ram - 10.3 G gpu util - 98% vram - 2.5 G gpu wattage - 61 W\n", + " for output in tqdm(pool.imap_unordered(task, range(1000), chunksize=1), total=1000): pass" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1000/1000 [05:09<00:00, 3.23it/s]\n" + ] + } + ], + "source": [ + "with ThreadPool(100) as pool: # int8 cpu load - 5.1 ram - 10.3 G gpu util - 97% vram - 2.5 G gpu wattage - 60 W\n", " for output in tqdm(pool.imap_unordered(task, range(1000), chunksize=1), total=1000): pass" ] } ], "metadata": { "kernelspec": { - "display_name": "mt-model-deploy-dhruva", + "display_name": ".venv", "language": "python", - "name": "mt-model-deploy-dhruva" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -126,7 +180,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.4" } }, "nbformat": 4, -- GitLab