ssmt / mt-model-deploy-dhruva · Commits
Commit bfd804f7
authored Aug 17, 2023 by Nikhilesh Bhatnagar
dynamic model loading prototype

parent 734aca61
Showing 2 changed files with 104 additions and 10 deletions:

  Dockerfile                                           +1    -1
  triton_models/ssmt_template_model_repo/1/model.py    +103  -9
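In short, the commit swaps the always-resident CTranslate2 `Translator` for a `DynamicModel` wrapper that lazily loads the model on use and unloads it after an idle timeout. As a rough orientation before the diff itself, here is a minimal, generic sketch of that timer-based idle-unload pattern; the class and callable names are illustrative, not from the repo:

```python
from threading import Lock, Timer

class IdleUnloader:
    """Release an expensive resource after `timeout` seconds of inactivity."""

    def __init__(self, acquire, release, timeout=5.0):
        self._acquire, self._release, self._timeout = acquire, release, timeout
        self._lock = Lock()
        self._loaded = False
        self._timer = None

    def use(self):
        # Load on demand, then push the eviction deadline back.
        with self._lock:
            if not self._loaded:
                self._acquire()
                self._loaded = True
            if self._timer is not None:
                self._timer.cancel()
            self._timer = Timer(self._timeout, self._evict)
            self._timer.start()

    def _evict(self):
        # Runs on the Timer's thread once the deadline passes untouched.
        with self._lock:
            if self._loaded:
                self._release()
                self._loaded = False

# Example: wrap arbitrary load/free callables.
cache = IdleUnloader(lambda: print("load"), lambda: print("free"), timeout=2.0)
cache.use()  # prints "load"; "free" follows ~2 s later if use() is not called again
```

The committed `DynamicModel` below follows the same shape, with tenacity retries layered on top of load and translate.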
Dockerfile
@@ -4,6 +4,6 @@ RUN apt-get update && apt-get install -y python3.10-venv
 ENV VIRTUAL_ENV=/opt/dhruva-mt
 RUN python3 -m venv $VIRTUAL_ENV
 ENV PATH="$VIRTUAL_ENV/bin:$PATH"
-RUN pip install -U ctranslate2 OpenNMT-py==1.2.0 git+https://github.com/vmujadia/tokenizer.git
+RUN pip install -U ctranslate2 OpenNMT-py==1.2.0 git+https://github.com/vmujadia/tokenizer.git tenacity
 CMD ["tritonserver", "--model-repository=/models", "--cache-config=local,size=1048576"]
 EXPOSE 8000
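For context, once this image is built the server listens on the Triton HTTP port 8000 exposed above. A minimal client sketch against that endpoint, assuming the `tritonclient` package is installed and using a hypothetical model name `ssmt_model` (the real name comes from the model repository layout, not from this diff):

```python
import numpy as np
import tritonclient.http as http_client

MODEL_NAME = "ssmt_model"  # hypothetical; substitute the directory name under /models

client = http_client.InferenceServerClient(url="localhost:8000")

# One already-tokenized sentence, shaped [batch, 1] as the backend in
# model.py expects for INPUT_TEXT_TOKENIZED.
batch = np.array([["this is a tok@@ enized sentence".encode("utf-8")]], dtype=object)
inp = http_client.InferInput("INPUT_TEXT_TOKENIZED", list(batch.shape), "BYTES")
inp.set_data_from_numpy(batch)

result = client.infer(MODEL_NAME, inputs=[inp])
print(result.as_numpy("OUTPUT_TEXT"))
```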
triton_models/ssmt_template_model_repo/1/model.py
 import os
 import json
 import numpy
+from time import time
 from itertools import islice
+from threading import Lock, Timer
 from ctranslate2 import Translator
 import triton_python_backend_utils as pb_utils
+from tenacity import retry, wait_random_exponential
+
+
+class DynamicModel(object):
+    def __init__(self, path, device, device_index=None, timeout=5, timer_min_delta=0.01):
+        self.model, self.model_path, self.model_device, self.model_device_index = (
+            None,
+            path,
+            device,
+            device_index,
+        )
+        self.model_lock, self.timer_lock = Lock(), Lock()
+        self.timeout, self.timer_min_delta = timeout, timer_min_delta
+        self.initialize()
+
+    @retry(wait=wait_random_exponential(multiplier=0.5, max=10, exp_base=1.2))
+    def initialize(self):
+        self.model = Translator(
+            self.model_path,
+            device=self.model_device,
+            intra_threads=1,
+            inter_threads=1,
+            device_index=self.model_device_index,
+        )
+        self.timer = Timer(1, self.unload)
+        self.timer.start_time = time()
+        self.timer.start()
+
+    def restart_timer(self):
+        with self.timer_lock:
+            if time() - self.timer.start_time >= self.timer_min_delta:
+                self.timer.cancel()
+                self.timer = Timer(self.timeout, self.unload)
+                self.timer.start_time = time()
+                self.timer.start()
+
+    @retry(wait=wait_random_exponential(multiplier=0.5, max=20, exp_base=1.2))
+    def load(self, reset_timer=True):
+        with self.timer_lock:
+            self.timer.cancel()
+        with self.model_lock:
+            self.model.load_model()
+        if reset_timer:
+            self.restart_timer()
+
+    def unload(self):
+        with self.model_lock:
+            self.model.unload_model()
+
+    @retry(wait=wait_random_exponential(multiplier=0.5, max=20, exp_base=1.2))
+    def translate(self, *args, **kwargs):
+        if not self.model.model_is_loaded:
+            self.load(reset_timer=False)
+        results = list(self.model.translate_iterable(*args, **kwargs))
+        self.restart_timer()
+        return results
 
 
 class TritonPythonModel:
     def initialize(self, args):
         current_path = os.path.dirname(os.path.abspath(__file__))
         self.model_config = json.loads(args["model_config"])
-        self.device_id = int(json.loads(args['model_instance_device_id']))
-        target_config = pb_utils.get_output_config_by_name(self.model_config, "OUTPUT_TEXT")
+        self.device_id = int(json.loads(args["model_instance_device_id"]))
+        target_config = pb_utils.get_output_config_by_name(
+            self.model_config, "OUTPUT_TEXT"
+        )
         self.target_dtype = pb_utils.triton_string_to_numpy(target_config["data_type"])
-        try:
-            self.translator = Translator(f"{os.path.join(current_path, 'translator')}", device="cuda", intra_threads=1, inter_threads=1, device_index=[self.device_id])
-        except:
-            self.translator = Translator(f"{os.path.join(current_path, 'translator')}", device="cpu", intra_threads=4)
+        self.translator = DynamicModel(
+            f"{os.path.join(current_path, 'translator')}",
+            device="cuda",
+            device_index=[self.device_id],
+        )
 
     def execute(self, requests):
-        source_list = [pb_utils.get_input_tensor_by_name(request, "INPUT_TEXT_TOKENIZED") for request in requests]
+        source_list = [
+            pb_utils.get_input_tensor_by_name(request, "INPUT_TEXT_TOKENIZED")
+            for request in requests
+        ]
         bsize_list = [source.as_numpy().shape[0] for source in source_list]
-        src_sentences = [s[0].decode('utf-8').strip().split(' ') for source in source_list for s in source.as_numpy()]
-        tgt_sentences = [' '.join(result.hypotheses[0]).replace('@@ ', '') for result in self.translator.translate_iterable(src_sentences, max_batch_size=128, max_input_length=100, max_decoding_length=100)]
-        responses = [pb_utils.InferenceResponse(output_tensors=[pb_utils.Tensor("OUTPUT_TEXT", numpy.array([[s] for s in islice(tgt_sentences, bsize)], dtype='object').astype(self.target_dtype))]) for bsize in bsize_list]
+        src_sentences = [
+            s[0].decode("utf-8").strip().split(" ")
+            for source in source_list
+            for s in source.as_numpy()
+        ]
+        tgt_sentences = [
+            " ".join(result.hypotheses[0]).replace("@@ ", "")
+            for result in self.translator.translate(
+                src_sentences,
+                max_batch_size=128,
+                max_input_length=100,
+                max_decoding_length=100,
+            )
+        ]
+        responses = [
+            pb_utils.InferenceResponse(
+                output_tensors=[
+                    pb_utils.Tensor(
+                        "OUTPUT_TEXT",
+                        numpy.array(
+                            [[s] for s in islice(tgt_sentences, bsize)], dtype="object"
+                        ).astype(self.target_dtype),
+                    )
+                ]
+            )
+            for bsize in bsize_list
+        ]
         return responses
 
     def finalize(self):
-        self.translator.unload_model()
+        self.translator.unload()
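As a rough illustration of the lifecycle the new wrapper implements, here is a standalone sketch of the load/idle/unload cycle. Assumptions are flagged explicitly: `DynamicModel` has been copied out of model.py into a plain script (the module itself imports triton_python_backend_utils, which only exists inside the Triton server process), a converted CTranslate2 model directory exists at `./translator`, and the token sequence is purely illustrative:

```python
from time import sleep

# Hypothetical setup: DynamicModel copied out of model.py; ./translator is
# an assumed CTranslate2 model directory. CPU device keeps the sketch
# hardware-independent; device_index=0 mirrors ctranslate2's default.
model = DynamicModel("./translator", device="cpu", device_index=0, timeout=5)

results = model.translate([["this", "is", "a", "test"]], max_batch_size=8)
print(results[0].hypotheses[0])

print(model.model.model_is_loaded)  # True: just used, so the timer was restarted
sleep(6)                            # stay idle past the 5 s timeout
print(model.model.model_is_loaded)  # False: the Timer callback unloaded the weights
```

The first `translate` call after an idle unload pays the reload cost; the tenacity `@retry` decorators appear intended to smooth over transient failures when a load races with an in-flight unload.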