reva-codes / ilmt-api-tel-shallowparser

Commit ac27fbf5 authored May 17, 2022 by priyank
updated tokenizer
parent 6cc343ea
Showing 5 changed files with 118 additions and 19 deletions
modules/ILMT-TEL-HIN-Morph/lib/ILMT/TEL/HIN/Morph.pm                 +3  -0
modules/ILMT-TEL-HIN-PickOneMorph/lib/ILMT/TEL/HIN/PickOneMorph.pm   +2  -2
modules/ILMT-TEL-HIN-Tokenizer/lib/ILMT/TEL/HIN/Tokenizer.pm         +18 -17
modules/ILMT-TEL-HIN-Tokenizer/lib/ILMT/TEL/HIN/Tokenizer.pm-old     +67 -0
modules/ILMT-TEL-HIN-Tokenizer/lib/ILMT/TEL/HIN/tokenize.py          +28 -0
modules/ILMT-TEL-HIN-Morph/lib/ILMT/TEL/HIN/Morph.pm

...
@@ -49,8 +49,11 @@ sub process {
    my %args = @_;
    utf8::encode($args{"data"});
    foreach my $submodule (@dispatch_seq) {
        #print Dumper($submodule);
        $args{'data'} = __PACKAGE__->can($submodule)->(%args);
        #print Dumper($args{'data'});
    }
    utf8::decode($args{"data"});
    return $args{"data"};
}
...
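The hunk above touches the dispatch loop in Morph.pm's process(): each name in @dispatch_seq is resolved with __PACKAGE__->can($submodule) and called with the shared %args, and its return value becomes the new $args{'data'} for the next stage. As a rough illustration only, here is a minimal Python sketch of that dispatch-by-name pipeline; the stage functions and the dispatch_seq list are hypothetical, not taken from this repository.

# Minimal sketch (hypothetical stages, not from this repo) of the dispatch-by-name
# pipeline in Morph.pm: each stage is looked up by name, called with the shared args,
# and its return value replaces args["data"] before the next stage runs.

def strip_stage(args):           # hypothetical stage
    return args["data"].strip()

def lowercase_stage(args):       # hypothetical stage
    return args["data"].lower()

dispatch_seq = ["strip_stage", "lowercase_stage"]   # hypothetical ordering

def process(**args):
    for submodule in dispatch_seq:
        stage = globals()[submodule]    # analogous to __PACKAGE__->can($submodule)
        args["data"] = stage(args)      # result feeds the next stage
    return args["data"]

if __name__ == "__main__":
    print(process(data="  Some Input Text  "))   # -> "some input text"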
modules/ILMT-TEL-HIN-PickOneMorph/lib/ILMT/TEL/HIN/PickOneMorph.pm

package ILMT::TEL::HIN::PickOneMorph;
use strict;
use warnings;
# use strict;
# use warnings;
use Dir::Self;
use Data::Dumper;
use ILMT::TEL::HIN::SSFAPI::feature_filter;
...
modules/ILMT-TEL-HIN-Tokenizer/lib/ILMT/TEL/HIN/Tokenizer.pm

...
@@ -3,7 +3,12 @@ use strict;
use warnings;
use Dir::Self;
use Data::Dumper;
use IPC::Run qw(run);
use List::UtilsBy qw(max_by);
use File::Temp qw/ tempfile /;
use File::Slurp qw( slurp );

my $cwd = __DIR__;

my %daemons = (
    "tokenizer" => {
        "path" => "ind-tokz",
...
@@ -15,22 +20,18 @@ my %daemons = (
sub process {
    my %args = @_;
    utf8::encode($args{data});
    my $sentences = call_daemon("tokenizer", $args{data});
    open INFILE, '<', \$sentences or die $!;
    my $result = "";
    my $ctr = 0;
    while (my $line = <INFILE>) {
        $ctr++;
        $result .= "<Sentence id=\"$ctr\">\n";
        my @words = split ' ', $line;
        foreach my $index (0 .. $#words) {
            $result .= $index + 1 . "\t$words[$index]\tunk\n";
        }
        $result .= "</Sentence>";
    }
    close INFILE;
    utf8::decode($result);
    return $result;
    my ($fh2, $filename2) = tempfile("tokenizer_inputXXXX", DIR => "/tmp", SUFFIX => ".tmp");
    print $fh2 $args{"data"};
    close($fh2);
    my $token_out;
    run ["python", "$cwd/tokenize.py", $filename2], ">", \$token_out;
    unlink $filename2 or die "Couldn't delete temp file! $filename2";
    utf8::decode($token_out);
    return $token_out;
}

sub run_daemons {
...
@@ -62,6 +63,6 @@ sub call_daemon {
    return $result;
}

run_daemons(("tokenizer"));
# run_daemons(("tokenizer"));

1;
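With this change, process() in Tokenizer.pm no longer streams the input to the ind-tokz daemon; it writes $args{"data"} to a temporary file, runs the bundled tokenize.py on it through IPC::Run, captures the script's stdout as the result, and deletes the temp file. Below is a rough Python sketch of that same write-temp-file / run-script / capture-stdout flow, assuming a tokenize.py like the one added in this commit sits in the working directory; the sample input string is a placeholder.

# Sketch of the new flow (placeholder input; assumes tokenize.py is in the working directory).
import os
import subprocess
import tempfile

SCRIPT = "tokenize.py"                 # assumed location of the script added in this commit
data = "idi oka pariksha vakyam"       # placeholder input text

# Write the input to a temp file, as the Perl code does with File::Temp.
with tempfile.NamedTemporaryFile("w", dir="/tmp", suffix=".tmp",
                                 delete=False, encoding="utf-8") as fh:
    fh.write(data)
    filename = fh.name

try:
    # Equivalent of: run ["python", "$cwd/tokenize.py", $filename2], ">", \$token_out;
    token_out = subprocess.run(["python", SCRIPT, filename],
                               capture_output=True, text=True, check=True).stdout
finally:
    os.unlink(filename)                # equivalent of: unlink $filename2 or die ...

print(token_out)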
modules/ILMT-TEL-HIN-Tokenizer/lib/ILMT/TEL/HIN/Tokenizer.pm-old
0 → 100755

package ILMT::TEL::HIN::Tokenizer;

use strict;
use warnings;
use Dir::Self;
use Data::Dumper;

my %daemons = (
    "tokenizer" => {
        "path" => "ind-tokz",
        "args" => "--l tel --s --daemonize --port",
        "port" => "61001"
    }
);

sub process {
    my %args = @_;
    utf8::encode($args{data});
    my $sentences = call_daemon("tokenizer", $args{data});
    open INFILE, '<', \$sentences or die $!;
    my $result = "";
    my $ctr = 0;
    while (my $line = <INFILE>) {
        $ctr++;
        $result .= "<Sentence id=\"$ctr\">\n";
        my @words = split ' ', $line;
        foreach my $index (0 .. $#words) {
            $result .= $index + 1 . "\t$words[$index]\tunk\n";
        }
        $result .= "</Sentence>";
    }
    close INFILE;
    utf8::decode($result);
    return $result;
}

sub run_daemons {
    my @daemon_names = @_;
    foreach my $daemon_name (@daemon_names) {
        my %daemon = %{$daemons{$daemon_name}};
        my $cmd = "$daemon{path} $daemon{args} $daemon{port} &";
        my $runfile = __DIR__ . "/run/${daemon_name}_$daemon{port}";
        system("flock -e -w 0.01 $runfile -c '$cmd'") == 0
            or warn "[" . __PACKAGE__ . "]: Port $daemon{port} maybe unavailable! $?\n";
    }
}

sub call_daemon {
    my ($daemon_name, $input) = @_;
    my $port = $daemons{$daemon_name}{port};
    my ($socket, $client_socket);
    $socket = new IO::Socket::INET (
        PeerHost => '127.0.0.1',
        PeerPort => $port,
        Proto    => 'tcp',
    ) or die "ERROR in Socket Creation : $!\n";
    $socket->send("$input\n");
    my $result = "";
    while (my $line = $socket->getline) {
        $result .= $line;
    }
    $socket->close();
    return $result;
}

run_daemons(("tokenizer"));

1;
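Tokenizer.pm-old preserves the daemon-based design this commit retires: run_daemons() starts ind-tokz in the background under flock, and call_daemon() sends the raw text to it over a local TCP socket on port 61001 and reads the tokenized lines back. The snippet below is an illustrative Python client for that kind of round trip, assuming a daemon is already listening on the Perl module's default host and port; the sample sentence is a placeholder, and the explicit write-side shutdown is an addition not present in the Perl code.

# Illustrative TCP client (assumes a tokenizer daemon is listening on 127.0.0.1:61001,
# the port configured in %daemons above). Sends one newline-terminated block of text
# and reads the reply until the daemon closes the connection, like call_daemon().
import socket

def call_daemon(text, host="127.0.0.1", port=61001):
    with socket.create_connection((host, port)) as sock:
        sock.sendall((text + "\n").encode("utf-8"))
        sock.shutdown(socket.SHUT_WR)        # signal end of input (not done in the Perl version)
        chunks = []
        while True:
            data = sock.recv(4096)
            if not data:                     # daemon closed the connection
                break
            chunks.append(data)
    return b"".join(chunks).decode("utf-8")

if __name__ == "__main__":
    print(call_daemon("idi oka pariksha vakyam"))   # placeholder sentence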
modules/ILMT-TEL-HIN-Tokenizer/lib/ILMT/TEL/HIN/tokenize.py
0 → 100644

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created by
@author: priyank
'''
import os, sys, codecs


def tokenizer(text, ind):
    """Tokenize the text only on space."""
    tokens = text.split()
    tokens_ssf = [str(index + 1) + '\t' + token + '\tunk'
                  for index, token in enumerate(tokens)]
    tokens_ssf_with_sentence = ['<Sentence id="' + str(ind + 1) + '">'] + tokens_ssf + ['</Sentence>']
    return '\n'.join(tokens_ssf_with_sentence)


# Read the input file named on the command line, tokenize each non-empty line
# as one sentence, and print the SSF-style result as UTF-8.
f = codecs.open(sys.argv[1], "rb", "utf-8")
lines = f.readlines()
f.close()

finalOutput = ""
ii = 0
for line in lines:
    line = line.strip()
    if line:
        finalOutput = finalOutput + tokenizer(line, ii) + "\n"
        ii = ii + 1
print(finalOutput.encode('utf-8'))
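For reference, tokenize.py treats each non-empty input line as one sentence and emits it in SSF shape: a <Sentence id="..."> header, one line per token with a 1-based index and the placeholder tag unk, and a closing </Sentence>. The short snippet below only illustrates that output shape for a made-up input line; it is not part of the repository.

# Illustrates the output shape produced by tokenize.py for one placeholder line
# ("idi oka pariksha vakyam" is a made-up example, not project data).
line = "idi oka pariksha vakyam"
print('<Sentence id="1">')
for i, tok in enumerate(line.split()):
    print(str(i + 1) + "\t" + tok + "\tunk")
print('</Sentence>')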