In [1]:
import numpy as np
import wonderwords
from tqdm import tqdm
from tritonclient.utils import *
from random import choice, randrange
import tritonclient.http as httpclient
from multiprocessing.pool import ThreadPool

In [2]:
# NOTE(review): `shape` is not referenced by any of the visible cells; kept in
# case something outside this view still reads it.
shape = [1]

# Bounds for the number of random words per generated sentence.  `task` feeds
# them to randrange(MIN_WORDS, MAX_WORDS), so the upper bound is exclusive.
MIN_WORDS = 4
MAX_WORDS = 20

# Name of the model every inference request is addressed to.
model_name = "nmt"

# Shared random-word generator used to build the request sentences.
rs = wonderwords.RandomWord()

* Hit the Triton server with random sentences, each sent to a random model (i.e. with a randomly chosen language pair)
* See https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/metrics.html for metrics

In [3]:
def task(x):
    """Send one burst of 10 asynchronous translation requests to the Triton server.

    Each request carries a freshly generated random sentence plus a randomly
    chosen source/target language pair, and is addressed to the model named by
    the module-level ``model_name``.  A dedicated HTTP client connected to
    ``localhost:8000`` is opened for the burst and closed when it is done.

    Args:
        x: Index of this task as handed out by the worker pool.  It is only
            used to give every request a unique ``request_id``.

    Returns:
        Always ``0``, once every request of the burst has completed.

    Raises:
        Propagates whatever ``get_result(timeout=10)`` raises when a request
        fails or does not complete in time.
    """
    # Supported translation directions, written as "<source>-<target>".
    lang_pairs = ('en-hi', 'hi-en', 'te-en', 'hi-te', 'te-hi', 'en-gu', 'gu-en')
    with httpclient.InferenceServerClient("localhost:8000") as client:
        pending = []
        for i in range(10):
            # Random words make every sentence different; use a constant
            # sentence (e.g. 'this is a sentence.') if you want to hit the cache.
            n_words = randrange(MIN_WORDS, MAX_WORDS)  # upper bound is exclusive
            sentence = ' '.join(rs.random_words(n_words) + ['.'])
            src_lang, tgt_lang = choice(lang_pairs).split('-')

            # All three inputs are single-element string tensors of shape (1, 1).
            tensors = {
                "INPUT_TEXT": sentence,
                "INPUT_LANGUAGE_ID": src_lang,
                "OUTPUT_LANGUAGE_ID": tgt_lang,
            }
            inputs = []
            for name, value in tensors.items():
                data = np.array([[value]], dtype='object')
                infer_input = httpclient.InferInput(name, data.shape, np_to_triton_dtype(data.dtype))
                infer_input.set_data_from_numpy(data)
                inputs.append(infer_input)

            outputs = [httpclient.InferRequestedOutput("OUTPUT_TEXT")]
            # A unique id per request (task index + position in the burst)
            # keeps requests distinguishable in responses and server logs.
            pending.append(
                client.async_infer(model_name, inputs, request_id=f"{x}-{i}", outputs=outputs)
            )

        # Wait for the whole burst; a failed or timed-out request raises here.
        for request in pending:
            request.get_result(timeout=10)
    return 0

In [4]:
# Load test labelled "float32" (presumably the precision the server-side model
# was running with -- confirm).
# Recorded during the run: cpu load 5.7 | ram 10.2 G | gpu util 100% | vram 4.7 G | gpu wattage 70 W
with ThreadPool(100) as pool:
    # 1000 calls to `task` spread over 100 worker threads; the results are
    # consumed only to advance the progress bar.
    for output in tqdm(
        pool.imap_unordered(task, range(1000), chunksize=1),
        total=1000,
    ):
        pass

100%|██████████| 1000/1000 [07:11<00:00,  2.32it/s]


In [4]:
# Load test labelled "float16" (presumably the precision the server-side model
# was running with -- confirm).
# Recorded during the run: cpu load 5.2 | ram 10.3 G | gpu util 99% | vram 3.5 G | gpu wattage 65 W
with ThreadPool(100) as pool:
    # 1000 calls to `task` spread over 100 worker threads; the results are
    # consumed only to advance the progress bar.
    for output in tqdm(
        pool.imap_unordered(task, range(1000), chunksize=1),
        total=1000,
    ):
        pass

100%|██████████| 1000/1000 [05:09<00:00,  3.23it/s]


In [4]:
# Load test labelled "int8_float16" (presumably the precision the server-side
# model was running with -- confirm).
# Recorded during the run: cpu load 5.7 | ram 10.3 G | gpu util 98% | vram 2.5 G | gpu wattage 61 W
with ThreadPool(100) as pool:
    # 1000 calls to `task` spread over 100 worker threads; the results are
    # consumed only to advance the progress bar.
    for output in tqdm(
        pool.imap_unordered(task, range(1000), chunksize=1),
        total=1000,
    ):
        pass

100%|██████████| 1000/1000 [05:20<00:00,  3.12it/s]


In [4]:
# Load test labelled "int8" (presumably the precision the server-side model
# was running with -- confirm).
# Recorded during the run: cpu load 5.1 | ram 10.3 G | gpu util 97% | vram 2.5 G | gpu wattage 60 W
with ThreadPool(100) as pool:
    # 1000 calls to `task` spread over 100 worker threads; the results are
    # consumed only to advance the progress bar.
    for output in tqdm(
        pool.imap_unordered(task, range(1000), chunksize=1),
        total=1000,
    ):
        pass

100%|██████████| 1000/1000 [05:09<00:00,  3.23it/s]
