Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
M
mt-model-deploy-dhruva
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Environments
Packages & Registries
Packages & Registries
Package Registry
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ssmt
mt-model-deploy-dhruva
Commits
d0049da2
Commit
d0049da2
authored
Sep 04, 2023
by
Nikhilesh Bhatnagar
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Formatting pass.
parent
f61cdc30
Changes
10
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
436 additions
and
112 deletions
+436
-112
.gitignore
.gitignore
+0
-1
triton_models/demuxer/1/model.py
triton_models/demuxer/1/model.py
+63
-3
triton_models/demuxer/config.pbtxt
triton_models/demuxer/config.pbtxt
+1
-1
triton_models/model_ct2/1/model.py
triton_models/model_ct2/1/model.py
+61
-13
triton_models/model_ct2/config.pbtxt
triton_models/model_ct2/config.pbtxt
+1
-1
triton_models/model_onmt/1/model.py
triton_models/model_onmt/1/model.py
+114
-13
triton_models/model_onmt/config.pbtxt
triton_models/model_onmt/config.pbtxt
+1
-1
triton_models/nmt/config.pbtxt
triton_models/nmt/config.pbtxt
+1
-1
triton_models/tokenizer/1/apply_bpe.py
triton_models/tokenizer/1/apply_bpe.py
+125
-74
triton_models/tokenizer/1/model.py
triton_models/tokenizer/1/model.py
+69
-4
No files found.
.gitignore
View file @
d0049da2
ssmt_triton_repo
himangy_triton_repo
himangy_triton_repo
\ No newline at end of file
triton_models/demuxer/1/model.py
View file @
d0049da2
...
@@ -3,7 +3,67 @@ import numpy
...
@@ -3,7 +3,67 @@ import numpy
import
asyncio
import
asyncio
import
triton_python_backend_utils
as
pb_utils
import
triton_python_backend_utils
as
pb_utils
class
TritonPythonModel
:
class
TritonPythonModel
:
def
initialize
(
self
,
args
):
self
.
target_dtype
=
pb_utils
.
triton_string_to_numpy
(
pb_utils
.
get_output_config_by_name
(
json
.
loads
(
args
[
'model_config'
]),
'OUTPUT_TEXT'
)[
'data_type'
])
def
initialize
(
self
,
args
):
async
def
execute
(
self
,
requests
):
return
[
pb_utils
.
InferenceResponse
(
output_tensors
=
[
pb_utils
.
Tensor
(
'OUTPUT_TEXT'
,
numpy
.
array
([[
pb_utils
.
get_output_tensor_by_name
(
result
,
'OUTPUT_SENT'
).
as_numpy
()[
0
,
0
].
decode
(
'utf-8'
)]
for
result
in
(
await
asyncio
.
gather
(
*
awaits
))],
dtype
=
self
.
target_dtype
))])
for
awaits
in
[[
pb_utils
.
InferenceRequest
(
model_name
=
f"himangy-
{
input_language_id
[
0
].
decode
(
'utf-8'
)
}
-
{
output_language_id
[
0
].
decode
(
'utf-8'
)
}
"
,
requested_output_names
=
[
'OUTPUT_SENT'
],
inputs
=
[
pb_utils
.
Tensor
(
'INPUT_SENT_TOKENIZED'
,
numpy
.
array
([[
input_text_tokenized
[
0
].
decode
(
'utf-8'
)]],
dtype
=
'object'
))]).
async_exec
()
for
input_text_tokenized
,
input_language_id
,
output_language_id
in
zip
(
pb_utils
.
get_input_tensor_by_name
(
request
,
'INPUT_TEXT_TOKENIZED'
).
as_numpy
(),
pb_utils
.
get_input_tensor_by_name
(
request
,
'INPUT_LANGUAGE_ID'
).
as_numpy
(),
pb_utils
.
get_input_tensor_by_name
(
request
,
'OUTPUT_LANGUAGE_ID'
).
as_numpy
())]
for
request
in
requests
]]
self
.
target_dtype
=
pb_utils
.
triton_string_to_numpy
(
def
finalize
(
self
):
pass
pb_utils
.
get_output_config_by_name
(
\ No newline at end of file
json
.
loads
(
args
[
"model_config"
]),
"OUTPUT_TEXT"
)[
"data_type"
]
)
async
def
execute
(
self
,
requests
):
return
[
pb_utils
.
InferenceResponse
(
output_tensors
=
[
pb_utils
.
Tensor
(
"OUTPUT_TEXT"
,
numpy
.
array
(
[
[
pb_utils
.
get_output_tensor_by_name
(
result
,
"OUTPUT_SENT"
)
.
as_numpy
()[
0
,
0
]
.
decode
(
"utf-8"
)
]
for
result
in
(
await
asyncio
.
gather
(
*
awaits
))
],
dtype
=
self
.
target_dtype
,
),
)
]
)
for
awaits
in
[
[
pb_utils
.
InferenceRequest
(
model_name
=
f"himangy-
{
input_language_id
[
0
].
decode
(
'utf-8'
)
}
-
{
output_language_id
[
0
].
decode
(
'utf-8'
)
}
"
,
requested_output_names
=
[
"OUTPUT_SENT"
],
inputs
=
[
pb_utils
.
Tensor
(
"INPUT_SENT_TOKENIZED"
,
numpy
.
array
(
[[
input_text_tokenized
[
0
].
decode
(
"utf-8"
)]],
dtype
=
"object"
,
),
)
],
).
async_exec
()
for
input_text_tokenized
,
input_language_id
,
output_language_id
in
zip
(
pb_utils
.
get_input_tensor_by_name
(
request
,
"INPUT_TEXT_TOKENIZED"
).
as_numpy
(),
pb_utils
.
get_input_tensor_by_name
(
request
,
"INPUT_LANGUAGE_ID"
).
as_numpy
(),
pb_utils
.
get_input_tensor_by_name
(
request
,
"OUTPUT_LANGUAGE_ID"
).
as_numpy
(),
)
]
for
request
in
requests
]
]
def
finalize
(
self
):
pass
triton_models/demuxer/config.pbtxt
View file @
d0049da2
triton_models/model_ct2/1/model.py
View file @
d0049da2
...
@@ -5,27 +5,75 @@ from itertools import islice
...
@@ -5,27 +5,75 @@ from itertools import islice
from
ctranslate2
import
Translator
from
ctranslate2
import
Translator
import
triton_python_backend_utils
as
pb_utils
import
triton_python_backend_utils
as
pb_utils
class
TritonPythonModel
:
class
TritonPythonModel
:
def
initialize
(
self
,
args
):
def
initialize
(
self
,
args
):
current_path
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
current_path
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
self
.
source_lang
,
self
.
target_lang
=
input_lang
,
output_lang
self
.
source_lang
,
self
.
target_lang
=
input_lang
,
output_lang
self
.
model_config
=
json
.
loads
(
args
[
"model_config"
])
self
.
model_config
=
json
.
loads
(
args
[
"model_config"
])
self
.
device_id
=
int
(
json
.
loads
(
args
[
'model_instance_device_id'
]))
self
.
device_id
=
int
(
json
.
loads
(
args
[
"model_instance_device_id"
]))
target_config
=
pb_utils
.
get_output_config_by_name
(
self
.
model_config
,
"OUTPUT_SENT"
)
target_config
=
pb_utils
.
get_output_config_by_name
(
self
.
model_config
,
"OUTPUT_SENT"
)
self
.
target_dtype
=
pb_utils
.
triton_string_to_numpy
(
target_config
[
"data_type"
])
self
.
target_dtype
=
pb_utils
.
triton_string_to_numpy
(
target_config
[
"data_type"
])
try
:
self
.
translator
=
Translator
(
f"
{
os
.
path
.
join
(
current_path
,
'translator'
)
}
"
,
device
=
"cuda"
,
intra_threads
=
1
,
inter_threads
=
1
,
device_index
=
[
self
.
device_id
])
try
:
except
:
self
.
translator
=
Translator
(
f"
{
os
.
path
.
join
(
current_path
,
'translator'
)
}
"
,
device
=
"cpu"
,
intra_threads
=
4
)
self
.
translator
=
Translator
(
f"
{
os
.
path
.
join
(
current_path
,
'translator'
)
}
"
,
device
=
"cuda"
,
intra_threads
=
1
,
inter_threads
=
1
,
device_index
=
[
self
.
device_id
],
)
except
:
self
.
translator
=
Translator
(
f"
{
os
.
path
.
join
(
current_path
,
'translator'
)
}
"
,
device
=
"cpu"
,
intra_threads
=
4
,
)
def
clean_output
(
self
,
text
):
def
clean_output
(
self
,
text
):
text
=
text
.
replace
(
'@@ '
,
''
)
text
=
text
.
replace
(
"@@ "
,
""
)
text
=
text
.
replace
(
'
\u200c
'
,
''
)
text
=
text
.
replace
(
"
\u200c
"
,
""
)
if
text
.
startswith
(
'<to-gu> '
):
text
=
text
[
8
:]
if
text
.
startswith
(
"<to-gu> "
):
if
text
.
endswith
(
' <to-gu>'
):
text
=
text
[:
-
8
]
text
=
text
[
8
:]
if
text
.
endswith
(
" <to-gu>"
):
text
=
text
[:
-
8
]
return
text
return
text
def
execute
(
self
,
requests
):
def
execute
(
self
,
requests
):
source_list
=
[
pb_utils
.
get_input_tensor_by_name
(
request
,
"INPUT_SENT_TOKENIZED"
)
for
request
in
requests
]
source_list
=
[
pb_utils
.
get_input_tensor_by_name
(
request
,
"INPUT_SENT_TOKENIZED"
)
for
request
in
requests
]
bsize_list
=
[
source
.
as_numpy
().
shape
[
0
]
for
source
in
source_list
]
bsize_list
=
[
source
.
as_numpy
().
shape
[
0
]
for
source
in
source_list
]
src_sentences
=
[
s
[
0
].
decode
(
'utf-8'
).
strip
().
split
(
' '
)
for
source
in
source_list
for
s
in
source
.
as_numpy
()]
src_sentences
=
[
tgt_sentences
=
[
self
.
clean_output
(
' '
.
join
(
result
.
hypotheses
[
0
]))
for
result
in
self
.
translator
.
translate_iterable
(
src_sentences
,
max_batch_size
=
128
,
max_input_length
=
100
,
max_decoding_length
=
100
)]
s
[
0
].
decode
(
"utf-8"
).
strip
().
split
(
" "
)
responses
=
[
pb_utils
.
InferenceResponse
(
output_tensors
=
[
pb_utils
.
Tensor
(
"OUTPUT_SENT"
,
numpy
.
array
([[
s
]
for
s
in
islice
(
tgt_sentences
,
bsize
)],
dtype
=
'object'
).
astype
(
self
.
target_dtype
))])
for
bsize
in
bsize_list
]
for
source
in
source_list
for
s
in
source
.
as_numpy
()
]
tgt_sentences
=
[
self
.
clean_output
(
" "
.
join
(
result
.
hypotheses
[
0
]))
for
result
in
self
.
translator
.
translate_iterable
(
src_sentences
,
max_batch_size
=
128
,
max_input_length
=
100
,
max_decoding_length
=
100
,
)
]
responses
=
[
pb_utils
.
InferenceResponse
(
output_tensors
=
[
pb_utils
.
Tensor
(
"OUTPUT_SENT"
,
numpy
.
array
(
[[
s
]
for
s
in
islice
(
tgt_sentences
,
bsize
)],
dtype
=
"object"
).
astype
(
self
.
target_dtype
),
)
]
)
for
bsize
in
bsize_list
]
return
responses
return
responses
def
finalize
(
self
):
self
.
translator
.
unload_model
()
\ No newline at end of file
def
finalize
(
self
):
self
.
translator
.
unload_model
()
triton_models/model_ct2/config.pbtxt
View file @
d0049da2
triton_models/model_onmt/1/model.py
View file @
d0049da2
...
@@ -6,27 +6,128 @@ from argparse import Namespace
...
@@ -6,27 +6,128 @@ from argparse import Namespace
import
triton_python_backend_utils
as
pb_utils
import
triton_python_backend_utils
as
pb_utils
from
onmt.translate.translator
import
build_translator
from
onmt.translate.translator
import
build_translator
class
TritonPythonModel
:
class
TritonPythonModel
:
def
initialize
(
self
,
args
):
def
initialize
(
self
,
args
):
current_path
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
current_path
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
self
.
source_lang
,
self
.
target_lang
=
input_lang
,
output_lang
self
.
source_lang
,
self
.
target_lang
=
input_lang
,
output_lang
self
.
model_config
=
json
.
loads
(
args
[
"model_config"
])
self
.
model_config
=
json
.
loads
(
args
[
"model_config"
])
self
.
device_id
=
int
(
json
.
loads
(
args
[
'model_instance_device_id'
]))
self
.
device_id
=
int
(
json
.
loads
(
args
[
"model_instance_device_id"
]))
target_config
=
pb_utils
.
get_output_config_by_name
(
self
.
model_config
,
"OUTPUT_SENT"
)
target_config
=
pb_utils
.
get_output_config_by_name
(
self
.
model_config
,
"OUTPUT_SENT"
)
self
.
target_dtype
=
pb_utils
.
triton_string_to_numpy
(
target_config
[
"data_type"
])
self
.
target_dtype
=
pb_utils
.
triton_string_to_numpy
(
target_config
[
"data_type"
])
try
:
self
.
translator
=
build_translator
(
Namespace
(
tgt_prefix
=
False
,
alpha
=
0.0
,
batch_type
=
'sents'
,
beam_size
=
5
,
beta
=-
0.0
,
block_ngram_repeat
=
0
,
coverage_penalty
=
'none'
,
data_type
=
'text'
,
dump_beam
=
''
,
fp32
=
True
,
gpu
=
self
.
device_id
,
ignore_when_blocking
=
[],
length_penalty
=
'none'
,
max_length
=
100
,
max_sent_length
=
None
,
min_length
=
0
,
models
=
[
f"
{
os
.
path
.
join
(
current_path
,
'translator.pt'
)
}
"
],
n_best
=
1
,
output
=
'/dev/null'
,
phrase_table
=
''
,
random_sampling_temp
=
1.0
,
random_sampling_topk
=
1
,
ratio
=-
0.0
,
replace_unk
=
False
,
report_align
=
False
,
report_time
=
False
,
seed
=
829
,
stepwise_penalty
=
False
,
tgt
=
None
,
verbose
=
False
),
report_score
=
False
)
try
:
except
:
self
.
translator
=
build_translator
(
Namespace
(
tgt_prefix
=
False
,
alpha
=
0.0
,
batch_type
=
'sents'
,
beam_size
=
5
,
beta
=-
0.0
,
block_ngram_repeat
=
0
,
coverage_penalty
=
'none'
,
data_type
=
'text'
,
dump_beam
=
''
,
fp32
=
True
,
gpu
=-
1
,
ignore_when_blocking
=
[],
length_penalty
=
'none'
,
max_length
=
100
,
max_sent_length
=
None
,
min_length
=
0
,
models
=
[
f"
{
os
.
path
.
join
(
current_path
,
'translator.pt'
)
}
"
],
n_best
=
1
,
output
=
'/dev/null'
,
phrase_table
=
''
,
random_sampling_temp
=
1.0
,
random_sampling_topk
=
1
,
ratio
=-
0.0
,
replace_unk
=
False
,
report_align
=
False
,
report_time
=
False
,
seed
=
829
,
stepwise_penalty
=
False
,
tgt
=
None
,
verbose
=
False
),
report_score
=
False
)
self
.
translator
=
build_translator
(
Namespace
(
tgt_prefix
=
False
,
alpha
=
0.0
,
batch_type
=
"sents"
,
beam_size
=
5
,
beta
=-
0.0
,
block_ngram_repeat
=
0
,
coverage_penalty
=
"none"
,
data_type
=
"text"
,
dump_beam
=
""
,
fp32
=
True
,
gpu
=
self
.
device_id
,
ignore_when_blocking
=
[],
length_penalty
=
"none"
,
max_length
=
100
,
max_sent_length
=
None
,
min_length
=
0
,
models
=
[
f"
{
os
.
path
.
join
(
current_path
,
'translator.pt'
)
}
"
],
n_best
=
1
,
output
=
"/dev/null"
,
phrase_table
=
""
,
random_sampling_temp
=
1.0
,
random_sampling_topk
=
1
,
ratio
=-
0.0
,
replace_unk
=
False
,
report_align
=
False
,
report_time
=
False
,
seed
=
829
,
stepwise_penalty
=
False
,
tgt
=
None
,
verbose
=
False
,
),
report_score
=
False
,
)
except
:
self
.
translator
=
build_translator
(
Namespace
(
tgt_prefix
=
False
,
alpha
=
0.0
,
batch_type
=
"sents"
,
beam_size
=
5
,
beta
=-
0.0
,
block_ngram_repeat
=
0
,
coverage_penalty
=
"none"
,
data_type
=
"text"
,
dump_beam
=
""
,
fp32
=
True
,
gpu
=-
1
,
ignore_when_blocking
=
[],
length_penalty
=
"none"
,
max_length
=
100
,
max_sent_length
=
None
,
min_length
=
0
,
models
=
[
f"
{
os
.
path
.
join
(
current_path
,
'translator.pt'
)
}
"
],
n_best
=
1
,
output
=
"/dev/null"
,
phrase_table
=
""
,
random_sampling_temp
=
1.0
,
random_sampling_topk
=
1
,
ratio
=-
0.0
,
replace_unk
=
False
,
report_align
=
False
,
report_time
=
False
,
seed
=
829
,
stepwise_penalty
=
False
,
tgt
=
None
,
verbose
=
False
,
),
report_score
=
False
,
)
def
clean_output
(
self
,
text
):
def
clean_output
(
self
,
text
):
text
=
text
.
replace
(
'@@ '
,
''
)
text
=
text
.
replace
(
"@@ "
,
""
)
text
=
text
.
replace
(
'
\u200c
'
,
''
)
text
=
text
.
replace
(
"
\u200c
"
,
""
)
if
text
.
startswith
(
'<to-gu> '
):
text
=
text
[
8
:]
if
text
.
startswith
(
"<to-gu> "
):
if
text
.
endswith
(
' <to-gu>'
):
text
=
text
[:
-
8
]
text
=
text
[
8
:]
if
text
.
endswith
(
" <to-gu>"
):
text
=
text
[:
-
8
]
return
text
return
text
def
execute
(
self
,
requests
):
def
execute
(
self
,
requests
):
source_list
=
[
pb_utils
.
get_input_tensor_by_name
(
request
,
"INPUT_SENT_TOKENIZED"
)
for
request
in
requests
]
source_list
=
[
pb_utils
.
get_input_tensor_by_name
(
request
,
"INPUT_SENT_TOKENIZED"
)
for
request
in
requests
]
bsize_list
=
[
source
.
as_numpy
().
shape
[
0
]
for
source
in
source_list
]
bsize_list
=
[
source
.
as_numpy
().
shape
[
0
]
for
source
in
source_list
]
src_sentences
=
[
s
[
0
].
decode
(
'utf-8'
).
strip
().
split
(
' '
)
for
source
in
source_list
for
s
in
source
.
as_numpy
()]
src_sentences
=
[
tgt_sentences
=
[
self
.
clean_output
(
result
[
0
])
for
result
in
self
.
translator
.
translate
(
src_sentences
,
batch_size
=
128
)[
1
]]
s
[
0
].
decode
(
"utf-8"
).
strip
().
split
(
" "
)
responses
=
[
pb_utils
.
InferenceResponse
(
output_tensors
=
[
pb_utils
.
Tensor
(
"OUTPUT_SENT"
,
numpy
.
array
([[
s
]
for
s
in
islice
(
tgt_sentences
,
bsize
)],
dtype
=
'object'
).
astype
(
self
.
target_dtype
))])
for
bsize
in
bsize_list
]
for
source
in
source_list
for
s
in
source
.
as_numpy
()
]
tgt_sentences
=
[
self
.
clean_output
(
result
[
0
])
for
result
in
self
.
translator
.
translate
(
src_sentences
,
batch_size
=
128
)[
1
]
]
responses
=
[
pb_utils
.
InferenceResponse
(
output_tensors
=
[
pb_utils
.
Tensor
(
"OUTPUT_SENT"
,
numpy
.
array
(
[[
s
]
for
s
in
islice
(
tgt_sentences
,
bsize
)],
dtype
=
"object"
).
astype
(
self
.
target_dtype
),
)
]
)
for
bsize
in
bsize_list
]
return
responses
return
responses
def
finalize
(
self
):
del
self
.
translator
\ No newline at end of file
def
finalize
(
self
):
del
self
.
translator
triton_models/model_onmt/config.pbtxt
View file @
d0049da2
triton_models/nmt/config.pbtxt
View file @
d0049da2
triton_models/tokenizer/1/apply_bpe.py
View file @
d0049da2
...
@@ -25,18 +25,21 @@ from collections import defaultdict
...
@@ -25,18 +25,21 @@ from collections import defaultdict
# hack for python2/3 compatibility
# hack for python2/3 compatibility
from
io
import
open
from
io
import
open
argparse
.
open
=
open
argparse
.
open
=
open
class
BPE
(
object
):
class
BPE
(
object
):
def
__init__
(
self
,
codes
,
separator
=
"@@"
,
vocab
=
None
,
glossaries
=
None
):
def
__init__
(
self
,
codes
,
separator
=
'@@'
,
vocab
=
None
,
glossaries
=
None
):
# check version information
# check version information
firstline
=
codes
.
readline
()
firstline
=
codes
.
readline
()
if
firstline
.
startswith
(
'#version:'
):
if
firstline
.
startswith
(
"#version:"
):
self
.
version
=
tuple
([
int
(
x
)
for
x
in
re
.
sub
(
self
.
version
=
tuple
(
r'(\.0+)*$'
,
''
,
firstline
.
split
()[
-
1
]).
split
(
"."
)])
[
int
(
x
)
for
x
in
re
.
sub
(
r"(\.0+)*$"
,
""
,
firstline
.
split
()[
-
1
]).
split
(
"."
)
]
)
else
:
else
:
self
.
version
=
(
0
,
1
)
self
.
version
=
(
0
,
1
)
codes
.
seek
(
0
)
codes
.
seek
(
0
)
...
@@ -45,10 +48,12 @@ class BPE(object):
...
@@ -45,10 +48,12 @@ class BPE(object):
# some hacking to deal with duplicates (only consider first instance)
# some hacking to deal with duplicates (only consider first instance)
self
.
bpe_codes
=
dict
(
self
.
bpe_codes
=
dict
(
[(
code
,
i
)
for
(
i
,
code
)
in
reversed
(
list
(
enumerate
(
self
.
bpe_codes
)))])
[(
code
,
i
)
for
(
i
,
code
)
in
reversed
(
list
(
enumerate
(
self
.
bpe_codes
)))]
)
self
.
bpe_codes_reverse
=
dict
(
self
.
bpe_codes_reverse
=
dict
(
[(
pair
[
0
]
+
pair
[
1
],
pair
)
for
pair
,
i
in
self
.
bpe_codes
.
items
()])
[(
pair
[
0
]
+
pair
[
1
],
pair
)
for
pair
,
i
in
self
.
bpe_codes
.
items
()]
)
self
.
separator
=
separator
self
.
separator
=
separator
...
@@ -62,63 +67,99 @@ class BPE(object):
...
@@ -62,63 +67,99 @@ class BPE(object):
"""segment single sentence (whitespace-tokenized string) with BPE encoding"""
"""segment single sentence (whitespace-tokenized string) with BPE encoding"""
output
=
[]
output
=
[]
for
word
in
sentence
.
split
():
for
word
in
sentence
.
split
():
new_word
=
[
out
for
segment
in
self
.
_isolate_glossaries
(
word
)
new_word
=
[
for
out
in
encode
(
segment
,
out
for
segment
in
self
.
_isolate_glossaries
(
word
)
for
out
in
encode
(
segment
,
self
.
bpe_codes
,
self
.
bpe_codes
,
self
.
bpe_codes_reverse
,
self
.
bpe_codes_reverse
,
self
.
vocab
,
self
.
vocab
,
self
.
separator
,
self
.
separator
,
self
.
version
,
self
.
version
,
self
.
cache
,
self
.
cache
,
self
.
glossaries
)]
self
.
glossaries
,
)
]
for
item
in
new_word
[:
-
1
]:
for
item
in
new_word
[:
-
1
]:
output
.
append
(
item
+
self
.
separator
)
output
.
append
(
item
+
self
.
separator
)
output
.
append
(
new_word
[
-
1
])
output
.
append
(
new_word
[
-
1
])
return
' '
.
join
(
output
)
return
" "
.
join
(
output
)
def
_isolate_glossaries
(
self
,
word
):
def
_isolate_glossaries
(
self
,
word
):
word_segments
=
[
word
]
word_segments
=
[
word
]
for
gloss
in
self
.
glossaries
:
for
gloss
in
self
.
glossaries
:
word_segments
=
[
out_segments
for
segment
in
word_segments
word_segments
=
[
for
out_segments
in
isolate_glossary
(
segment
,
gloss
)]
out_segments
for
segment
in
word_segments
for
out_segments
in
isolate_glossary
(
segment
,
gloss
)
]
return
word_segments
return
word_segments
def
create_parser
():
def
create_parser
():
parser
=
argparse
.
ArgumentParser
(
parser
=
argparse
.
ArgumentParser
(
formatter_class
=
argparse
.
RawDescriptionHelpFormatter
,
formatter_class
=
argparse
.
RawDescriptionHelpFormatter
,
description
=
"learn BPE-based word segmentation"
)
description
=
"learn BPE-based word segmentation"
,
)
parser
.
add_argument
(
parser
.
add_argument
(
'--input'
,
'-i'
,
type
=
argparse
.
FileType
(
'r'
),
default
=
sys
.
stdin
,
"--input"
,
metavar
=
'PATH'
,
"-i"
,
help
=
"Input file (default: standard input)."
)
type
=
argparse
.
FileType
(
"r"
),
default
=
sys
.
stdin
,
metavar
=
"PATH"
,
help
=
"Input file (default: standard input)."
,
)
parser
.
add_argument
(
parser
.
add_argument
(
'--codes'
,
'-c'
,
type
=
argparse
.
FileType
(
'r'
),
metavar
=
'PATH'
,
"--codes"
,
"-c"
,
type
=
argparse
.
FileType
(
"r"
),
metavar
=
"PATH"
,
required
=
True
,
required
=
True
,
help
=
"File with BPE codes (created by learn_bpe.py)."
)
help
=
"File with BPE codes (created by learn_bpe.py)."
,
)
parser
.
add_argument
(
parser
.
add_argument
(
'--output'
,
'-o'
,
type
=
argparse
.
FileType
(
'w'
),
default
=
sys
.
stdout
,
"--output"
,
metavar
=
'PATH'
,
"-o"
,
help
=
"Output file (default: standard output)"
)
type
=
argparse
.
FileType
(
"w"
),
default
=
sys
.
stdout
,
metavar
=
"PATH"
,
help
=
"Output file (default: standard output)"
,
)
parser
.
add_argument
(
parser
.
add_argument
(
'--separator'
,
'-s'
,
type
=
str
,
default
=
'@@'
,
metavar
=
'STR'
,
"--separator"
,
help
=
"Separator between non-final subword units (default: '%(default)s'))"
)
"-s"
,
type
=
str
,
default
=
"@@"
,
metavar
=
"STR"
,
help
=
"Separator between non-final subword units (default: '%(default)s'))"
,
)
parser
.
add_argument
(
parser
.
add_argument
(
'--vocabulary'
,
type
=
argparse
.
FileType
(
'r'
),
default
=
None
,
"--vocabulary"
,
type
=
argparse
.
FileType
(
"r"
),
default
=
None
,
metavar
=
"PATH"
,
metavar
=
"PATH"
,
help
=
"Vocabulary file (built with get_vocab.py). If provided, this script reverts any merge operations that produce an OOV."
)
help
=
"Vocabulary file (built with get_vocab.py). If provided, this script reverts any merge operations that produce an OOV."
,
)
parser
.
add_argument
(
parser
.
add_argument
(
'--vocabulary-threshold'
,
type
=
int
,
default
=
None
,
"--vocabulary-threshold"
,
type
=
int
,
default
=
None
,
metavar
=
"INT"
,
metavar
=
"INT"
,
help
=
"Vocabulary threshold. If vocabulary is provided, any word with frequency < threshold will be treated as OOV"
)
help
=
"Vocabulary threshold. If vocabulary is provided, any word with frequency < threshold will be treated as OOV"
,
)
parser
.
add_argument
(
parser
.
add_argument
(
'--glossaries'
,
type
=
str
,
nargs
=
'+'
,
default
=
None
,
"--glossaries"
,
type
=
str
,
nargs
=
"+"
,
default
=
None
,
metavar
=
"STR"
,
metavar
=
"STR"
,
help
=
"Glossaries. The strings provided in glossaries will not be affected"
+
help
=
"Glossaries. The strings provided in glossaries will not be affected"
"by the BPE (i.e. they will neither be broken into subwords, nor concatenated with other subwords"
)
+
"by the BPE (i.e. they will neither be broken into subwords, nor concatenated with other subwords"
,
)
return
parser
return
parser
...
@@ -136,9 +177,17 @@ def get_pairs(word):
...
@@ -136,9 +177,17 @@ def get_pairs(word):
return
pairs
return
pairs
def
encode
(
orig
,
bpe_codes
,
bpe_codes_reverse
,
vocab
,
separator
,
version
,
cache
,
glossaries
=
None
):
def
encode
(
"""Encode word based on list of BPE merge operations, which are applied consecutively
orig
,
"""
bpe_codes
,
bpe_codes_reverse
,
vocab
,
separator
,
version
,
cache
,
glossaries
=
None
,
):
"""Encode word based on list of BPE merge operations, which are applied consecutively"""
if
orig
in
cache
:
if
orig
in
cache
:
return
cache
[
orig
]
return
cache
[
orig
]
...
@@ -148,9 +197,9 @@ def encode(orig, bpe_codes, bpe_codes_reverse, vocab, separator, version, cache,
...
@@ -148,9 +197,9 @@ def encode(orig, bpe_codes, bpe_codes_reverse, vocab, separator, version, cache,
return
(
orig
,)
return
(
orig
,)
if
version
==
(
0
,
1
):
if
version
==
(
0
,
1
):
word
=
tuple
(
orig
)
+
(
'</w>'
,)
word
=
tuple
(
orig
)
+
(
"</w>"
,)
elif
version
==
(
0
,
2
):
# more consistent handling of word-final segments
elif
version
==
(
0
,
2
):
# more consistent handling of word-final segments
word
=
tuple
(
orig
[:
-
1
])
+
(
orig
[
-
1
]
+
'</w>'
,)
word
=
tuple
(
orig
[:
-
1
])
+
(
orig
[
-
1
]
+
"</w>"
,)
else
:
else
:
raise
NotImplementedError
raise
NotImplementedError
...
@@ -160,7 +209,7 @@ def encode(orig, bpe_codes, bpe_codes_reverse, vocab, separator, version, cache,
...
@@ -160,7 +209,7 @@ def encode(orig, bpe_codes, bpe_codes_reverse, vocab, separator, version, cache,
return
orig
return
orig
while
True
:
while
True
:
bigram
=
min
(
pairs
,
key
=
lambda
pair
:
bpe_codes
.
get
(
pair
,
float
(
'inf'
)))
bigram
=
min
(
pairs
,
key
=
lambda
pair
:
bpe_codes
.
get
(
pair
,
float
(
"inf"
)))
if
bigram
not
in
bpe_codes
:
if
bigram
not
in
bpe_codes
:
break
break
first
,
second
=
bigram
first
,
second
=
bigram
...
@@ -189,10 +238,10 @@ def encode(orig, bpe_codes, bpe_codes_reverse, vocab, separator, version, cache,
...
@@ -189,10 +238,10 @@ def encode(orig, bpe_codes, bpe_codes_reverse, vocab, separator, version, cache,
pairs
=
get_pairs
(
word
)
pairs
=
get_pairs
(
word
)
# don't print end-of-word symbols
# don't print end-of-word symbols
if
word
[
-
1
]
==
'</w>'
:
if
word
[
-
1
]
==
"</w>"
:
word
=
word
[:
-
1
]
word
=
word
[:
-
1
]
elif
word
[
-
1
].
endswith
(
'</w>'
):
elif
word
[
-
1
].
endswith
(
"</w>"
):
word
=
word
[:
-
1
]
+
(
word
[
-
1
].
replace
(
'</w>'
,
''
),)
word
=
word
[:
-
1
]
+
(
word
[
-
1
].
replace
(
"</w>"
,
""
),)
if
vocab
:
if
vocab
:
word
=
check_vocab_and_split
(
word
,
bpe_codes_reverse
,
vocab
,
separator
)
word
=
check_vocab_and_split
(
word
,
bpe_codes_reverse
,
vocab
,
separator
)
...
@@ -207,12 +256,12 @@ def recursive_split(segment, bpe_codes, vocab, separator, final=False):
...
@@ -207,12 +256,12 @@ def recursive_split(segment, bpe_codes, vocab, separator, final=False):
try
:
try
:
if
final
:
if
final
:
left
,
right
=
bpe_codes
[
segment
+
'</w>'
]
left
,
right
=
bpe_codes
[
segment
+
"</w>"
]
right
=
right
[:
-
4
]
right
=
right
[:
-
4
]
else
:
else
:
left
,
right
=
bpe_codes
[
segment
]
left
,
right
=
bpe_codes
[
segment
]
except
:
except
:
#sys.stderr.write('cannot split {0} further.\n'.format(segment))
#
sys.stderr.write('cannot split {0} further.\n'.format(segment))
yield
segment
yield
segment
return
return
...
@@ -239,7 +288,7 @@ def check_vocab_and_split(orig, bpe_codes, vocab, separator):
...
@@ -239,7 +288,7 @@ def check_vocab_and_split(orig, bpe_codes, vocab, separator):
if
segment
+
separator
in
vocab
:
if
segment
+
separator
in
vocab
:
out
.
append
(
segment
)
out
.
append
(
segment
)
else
:
else
:
#sys.stderr.write('OOV: {0}\n'.format(segment))
#
sys.stderr.write('OOV: {0}\n'.format(segment))
for
item
in
recursive_split
(
segment
,
bpe_codes
,
vocab
,
separator
,
False
):
for
item
in
recursive_split
(
segment
,
bpe_codes
,
vocab
,
separator
,
False
):
out
.
append
(
item
)
out
.
append
(
item
)
...
@@ -247,7 +296,7 @@ def check_vocab_and_split(orig, bpe_codes, vocab, separator):
...
@@ -247,7 +296,7 @@ def check_vocab_and_split(orig, bpe_codes, vocab, separator):
if
segment
in
vocab
:
if
segment
in
vocab
:
out
.
append
(
segment
)
out
.
append
(
segment
)
else
:
else
:
#sys.stderr.write('OOV: {0}\n'.format(segment))
#
sys.stderr.write('OOV: {0}\n'.format(segment))
for
item
in
recursive_split
(
segment
,
bpe_codes
,
vocab
,
separator
,
True
):
for
item
in
recursive_split
(
segment
,
bpe_codes
,
vocab
,
separator
,
True
):
out
.
append
(
item
)
out
.
append
(
item
)
...
@@ -255,8 +304,7 @@ def check_vocab_and_split(orig, bpe_codes, vocab, separator):
...
@@ -255,8 +304,7 @@ def check_vocab_and_split(orig, bpe_codes, vocab, separator):
def
read_vocabulary
(
vocab_file
,
threshold
):
def
read_vocabulary
(
vocab_file
,
threshold
):
"""read vocabulary file produced by get_vocab.py, and filter according to frequency threshold.
"""read vocabulary file produced by get_vocab.py, and filter according to frequency threshold."""
"""
vocabulary
=
set
()
vocabulary
=
set
()
...
@@ -282,39 +330,42 @@ def isolate_glossary(word, glossary):
...
@@ -282,39 +330,42 @@ def isolate_glossary(word, glossary):
return
[
word
]
return
[
word
]
else
:
else
:
splits
=
word
.
split
(
glossary
)
splits
=
word
.
split
(
glossary
)
segments
=
[
segment
.
strip
()
for
split
in
splits
[:
-
1
]
segments
=
[
for
segment
in
[
split
,
glossary
]
if
segment
!=
''
]
segment
.
strip
()
return
segments
+
[
splits
[
-
1
].
strip
()]
if
splits
[
-
1
]
!=
''
else
segments
for
split
in
splits
[:
-
1
]
for
segment
in
[
split
,
glossary
]
if
segment
!=
""
]
return
segments
+
[
splits
[
-
1
].
strip
()]
if
splits
[
-
1
]
!=
""
else
segments
if
__name__
==
'__main__'
:
if
__name__
==
"__main__"
:
# python 2/3 compatibility
# python 2/3 compatibility
if
sys
.
version_info
<
(
3
,
0
):
if
sys
.
version_info
<
(
3
,
0
):
sys
.
stderr
=
codecs
.
getwriter
(
'UTF-8'
)(
sys
.
stderr
)
sys
.
stderr
=
codecs
.
getwriter
(
"UTF-8"
)(
sys
.
stderr
)
sys
.
stdout
=
codecs
.
getwriter
(
'UTF-8'
)(
sys
.
stdout
)
sys
.
stdout
=
codecs
.
getwriter
(
"UTF-8"
)(
sys
.
stdout
)
sys
.
stdin
=
codecs
.
getreader
(
'UTF-8'
)(
sys
.
stdin
)
sys
.
stdin
=
codecs
.
getreader
(
"UTF-8"
)(
sys
.
stdin
)
else
:
else
:
sys
.
stdin
=
io
.
TextIOWrapper
(
sys
.
stdin
.
buffer
,
encoding
=
'utf-8'
)
sys
.
stdin
=
io
.
TextIOWrapper
(
sys
.
stdin
.
buffer
,
encoding
=
"utf-8"
)
sys
.
stderr
=
io
.
TextIOWrapper
(
sys
.
stderr
.
buffer
,
encoding
=
'utf-8'
)
sys
.
stderr
=
io
.
TextIOWrapper
(
sys
.
stderr
.
buffer
,
encoding
=
"utf-8"
)
sys
.
stdout
=
io
.
TextIOWrapper
(
sys
.
stdout
=
io
.
TextIOWrapper
(
sys
.
stdout
.
buffer
,
encoding
=
'utf-8'
,
write_through
=
True
,
line_buffering
=
True
)
sys
.
stdout
.
buffer
,
encoding
=
"utf-8"
,
write_through
=
True
,
line_buffering
=
True
)
parser
=
create_parser
()
parser
=
create_parser
()
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
# read/write files as UTF-8
# read/write files as UTF-8
args
.
codes
=
codecs
.
open
(
args
.
codes
.
name
,
encoding
=
'utf-8'
)
args
.
codes
=
codecs
.
open
(
args
.
codes
.
name
,
encoding
=
"utf-8"
)
if
args
.
input
.
name
!=
'<stdin>'
:
if
args
.
input
.
name
!=
"<stdin>"
:
args
.
input
=
codecs
.
open
(
args
.
input
.
name
,
encoding
=
'utf-8'
)
args
.
input
=
codecs
.
open
(
args
.
input
.
name
,
encoding
=
"utf-8"
)
if
args
.
output
.
name
!=
'<stdout>'
:
if
args
.
output
.
name
!=
"<stdout>"
:
args
.
output
=
codecs
.
open
(
args
.
output
.
name
,
'w'
,
encoding
=
'utf-8'
)
args
.
output
=
codecs
.
open
(
args
.
output
.
name
,
"w"
,
encoding
=
"utf-8"
)
if
args
.
vocabulary
:
if
args
.
vocabulary
:
args
.
vocabulary
=
codecs
.
open
(
args
.
vocabulary
.
name
,
encoding
=
'utf-8'
)
args
.
vocabulary
=
codecs
.
open
(
args
.
vocabulary
.
name
,
encoding
=
"utf-8"
)
if
args
.
vocabulary
:
if
args
.
vocabulary
:
vocabulary
=
read_vocabulary
(
vocabulary
=
read_vocabulary
(
args
.
vocabulary
,
args
.
vocabulary_threshold
)
args
.
vocabulary
,
args
.
vocabulary_threshold
)
else
:
else
:
vocabulary
=
None
vocabulary
=
None
...
@@ -322,4 +373,4 @@ if __name__ == '__main__':
...
@@ -322,4 +373,4 @@ if __name__ == '__main__':
for
line
in
args
.
input
:
for
line
in
args
.
input
:
args
.
output
.
write
(
bpe
.
segment
(
line
).
strip
())
args
.
output
.
write
(
bpe
.
segment
(
line
).
strip
())
args
.
output
.
write
(
'
\n
'
)
args
.
output
.
write
(
"
\n
"
)
triton_models/tokenizer/1/model.py
View file @
d0049da2
...
@@ -6,8 +6,73 @@ from .apply_bpe import BPE
...
@@ -6,8 +6,73 @@ from .apply_bpe import BPE
from
ilstokenizer
import
tokenizer
from
ilstokenizer
import
tokenizer
import
triton_python_backend_utils
as
pb_utils
import
triton_python_backend_utils
as
pb_utils
class
TritonPythonModel
:
class
TritonPythonModel
:
def
initialize
(
self
,
args
):
self
.
target_dtype
,
self
.
bpes
=
pb_utils
.
triton_string_to_numpy
(
pb_utils
.
get_output_config_by_name
(
json
.
loads
(
args
[
"model_config"
]),
"INPUT_TEXT_TOKENIZED"
)[
"data_type"
]),
{
fname
.
rsplit
(
'/'
,
maxsplit
=
1
)[
-
1
][:
-
len
(
'.src'
)]:
BPE
(
open
(
fname
,
'r'
,
encoding
=
'utf-8'
))
for
fname
in
iglob
(
f'
{
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
}
/bpe_src/*.src'
)}
def
initialize
(
self
,
args
):
def
preprocess_text
(
self
,
text
,
source_lang
,
target_lang
):
return
f"<to-gu>
{
text
}
<to-gu>"
if
source_lang
==
'en'
and
target_lang
==
'gu'
else
text
self
.
target_dtype
,
self
.
bpes
=
pb_utils
.
triton_string_to_numpy
(
def
execute
(
self
,
requests
):
return
[
pb_utils
.
InferenceResponse
(
output_tensors
=
[
pb_utils
.
Tensor
(
"INPUT_TEXT_TOKENIZED"
,
numpy
.
array
([[
tokenized_sent
]
for
tokenized_sent
in
tokenized_sents
],
dtype
=
self
.
target_dtype
))])
for
tokenized_sents
in
((
self
.
bpes
[
f"
{
input_language_id
[
0
].
decode
(
'utf-8'
)
}
-
{
output_language_id
[
0
].
decode
(
'utf-8'
)
}
"
].
segment
(
self
.
preprocess_text
(
tokenizer
.
tokenize
(
input_text
[
0
].
decode
(
'utf-8'
).
lower
()),
input_language_id
[
0
].
decode
(
'utf-8'
),
output_language_id
[
0
].
decode
(
'utf-8'
))).
strip
()
for
input_text
,
input_language_id
,
output_language_id
in
zip
(
input_texts
.
as_numpy
(),
input_language_ids
.
as_numpy
(),
output_language_ids
.
as_numpy
()))
for
input_texts
,
input_language_ids
,
output_language_ids
in
((
pb_utils
.
get_input_tensor_by_name
(
request
,
"INPUT_TEXT"
),
pb_utils
.
get_input_tensor_by_name
(
request
,
"INPUT_LANGUAGE_ID"
),
pb_utils
.
get_input_tensor_by_name
(
request
,
"OUTPUT_LANGUAGE_ID"
))
for
request
in
requests
))]
pb_utils
.
get_output_config_by_name
(
def
finalize
(
self
):
pass
json
.
loads
(
args
[
"model_config"
]),
"INPUT_TEXT_TOKENIZED"
\ No newline at end of file
)[
"data_type"
]
),
{
fname
.
rsplit
(
"/"
,
maxsplit
=
1
)[
-
1
][:
-
len
(
".src"
)]:
BPE
(
open
(
fname
,
"r"
,
encoding
=
"utf-8"
)
)
for
fname
in
iglob
(
f"
{
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
}
/bpe_src/*.src"
)
}
def
preprocess_text
(
self
,
text
,
source_lang
,
target_lang
):
return
(
f"<to-gu>
{
text
}
<to-gu>"
if
source_lang
==
"en"
and
target_lang
==
"gu"
else
text
)
def
execute
(
self
,
requests
):
return
[
pb_utils
.
InferenceResponse
(
output_tensors
=
[
pb_utils
.
Tensor
(
"INPUT_TEXT_TOKENIZED"
,
numpy
.
array
(
[[
tokenized_sent
]
for
tokenized_sent
in
tokenized_sents
],
dtype
=
self
.
target_dtype
,
),
)
]
)
for
tokenized_sents
in
(
(
self
.
bpes
[
f"
{
input_language_id
[
0
].
decode
(
'utf-8'
)
}
-
{
output_language_id
[
0
].
decode
(
'utf-8'
)
}
"
]
.
segment
(
self
.
preprocess_text
(
tokenizer
.
tokenize
(
input_text
[
0
].
decode
(
"utf-8"
).
lower
()),
input_language_id
[
0
].
decode
(
"utf-8"
),
output_language_id
[
0
].
decode
(
"utf-8"
),
)
)
.
strip
()
for
input_text
,
input_language_id
,
output_language_id
in
zip
(
input_texts
.
as_numpy
(),
input_language_ids
.
as_numpy
(),
output_language_ids
.
as_numpy
(),
)
)
for
input_texts
,
input_language_ids
,
output_language_ids
in
(
(
pb_utils
.
get_input_tensor_by_name
(
request
,
"INPUT_TEXT"
),
pb_utils
.
get_input_tensor_by_name
(
request
,
"INPUT_LANGUAGE_ID"
),
pb_utils
.
get_input_tensor_by_name
(
request
,
"OUTPUT_LANGUAGE_ID"
),
)
for
request
in
requests
)
)
]
def
finalize
(
self
):
pass
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment