Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
I
ilmt-api-hin-shallowparser
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Operations
Operations
Metrics
Analytics
Analytics
Repository
Value Stream
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Commits
Open sidebar
reva-codes
ilmt-api-hin-shallowparser
Commits
e8aab6b8
Commit
e8aab6b8
authored
May 17, 2022
by
priyank
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
updated tokenizer
parent
ac92f5af
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
115 additions
and
17 deletions
+115
-17
modules/ILMT-HIN-PAN-Tokenizer/lib/ILMT/HIN/PAN/Tokenizer.pm
modules/ILMT-HIN-PAN-Tokenizer/lib/ILMT/HIN/PAN/Tokenizer.pm
+20
-17
modules/ILMT-HIN-PAN-Tokenizer/lib/ILMT/HIN/PAN/Tokenizer.pm-old
.../ILMT-HIN-PAN-Tokenizer/lib/ILMT/HIN/PAN/Tokenizer.pm-old
+67
-0
modules/ILMT-HIN-PAN-Tokenizer/lib/ILMT/HIN/PAN/tokenize.py
modules/ILMT-HIN-PAN-Tokenizer/lib/ILMT/HIN/PAN/tokenize.py
+28
-0
No files found.
modules/ILMT-HIN-PAN-Tokenizer/lib/ILMT/HIN/PAN/Tokenizer.pm
View file @
e8aab6b8
...
...
@@ -4,6 +4,13 @@ use warnings;
use
Dir::
Self
;
use
Data::
Dumper
;
use
IPC::
Run
qw(run)
;
use
List::
UtilsBy
qw(max_by)
;
use
File::
Temp
qw/ tempfile /
;
use
File::
Slurp
qw( slurp )
;
my
$cwd
=
__DIR__
;
my
%
daemons
=
(
"
tokenizer
"
=>
{
"
path
"
=>
"
ind-tokz
",
...
...
@@ -15,22 +22,18 @@ my %daemons = (
sub
process
{
my
%
args
=
@_
;
utf8::
encode
(
$args
{
data
});
my
$sentences
=
call_daemon
("
tokenizer
",
$args
{
data
});
open
INFILE
,
'
<
',
\
$sentences
or
die
$!
;
my
$result
=
"";
my
$ctr
=
0
;
while
(
my
$line
=
<
INFILE
>
)
{
$ctr
++
;
$result
.=
"
<Sentence id=
\"
$ctr
\"
>
\n
";
my
@words
=
split
'
',
$line
;
foreach
my
$index
(
0
..
$#words
)
{
$result
.=
$index
+
1
.
"
\t
$words
[
$index
]
\t
unk
\n
";
}
$result
.=
"
</Sentence>
";
}
close
INFILE
;
utf8::
decode
(
$result
);
return
$result
;
my
(
$fh2
,
$filename2
)
=
tempfile
("
tokenizer_inputXXXX
",
DIR
=>
"
/tmp
",
SUFFIX
=>
"
.tmp
");
print
$fh2
$args
{"
data
"};
close
(
$fh2
);
my
$token_out
;
run
["
python
",
"
$cwd
/tokenize.py
",
$filename2
],
"
>
",
\
$token_out
;
unlink
$filename2
or
die
"
Couldn't delete temp file!
$filename2
";
utf8::
decode
(
$token_out
);
return
$token_out
;
}
sub
run_daemons
{
...
...
@@ -62,6 +65,6 @@ sub call_daemon {
return
$result
;
}
run_daemons
(("
tokenizer
"));
#
run_daemons(("tokenizer"));
1
;
modules/ILMT-HIN-PAN-Tokenizer/lib/ILMT/HIN/PAN/Tokenizer.pm-old
0 → 100644
View file @
e8aab6b8
package
ILMT
::
HIN
::
PAN
::
Tokenizer
;
use
strict
;
use
warnings
;
use
Dir
::
Self
;
use
Data
::
Dumper
;
# Registry of helper daemons, keyed by daemon name. Each entry records the
# executable ("path"), its command-line arguments ("args") and the TCP port
# ("port"); run_daemons() joins all three into the launch command and
# call_daemon() connects to the port.
my %daemons = (
    tokenizer => {
        path => 'ind-tokz',
        args => '--l hin --s --daemonize --port',
        port => '12001',
    },
);
# Convert raw text into sentence blocks using the tokenizer daemon.
#
# Arguments (named, passed as a flat key/value list):
#   data - the text to tokenize.
#
# The text is UTF-8 encoded and sent to the "tokenizer" daemon through
# call_daemon(). Every line of the reply becomes one block of the form
#
#   <Sentence id="N">
#   <position> TAB <word> TAB unk
#   ...
#   </Sentence>
#
# where N counts reply lines from 1 and positions count words from 1.
#
# Returns the concatenated blocks after utf8::decode(). Dies if the reply
# cannot be opened as an in-memory file; failures raised by call_daemon()
# propagate to the caller.
sub process {
    my %args = @_;

    # Encode to UTF-8 bytes before handing the text to the daemon.
    utf8::encode($args{data});
    my $sentences = call_daemon("tokenizer", $args{data});

    # Read the reply line by line through a lexical in-memory handle. A
    # bareword handle would be a package global that any other code using
    # the same name could clobber.
    open my $reply_fh, '<', \$sentences
        or die "Cannot read tokenizer output: $!";

    my $result = "";
    my $ctr    = 0;
    while (my $line = <$reply_fh>) {
        $ctr++;
        $result .= "<Sentence id=\"$ctr\">\n";

        # split ' ' skips leading whitespace and splits on whitespace runs,
        # so the trailing newline never ends up in a word.
        my @words = split ' ', $line;
        foreach my $index (0 .. $#words) {
            $result .= $index + 1 . "\t$words[$index]\tunk\n";
        }

        # The closing tag is deliberately emitted without a newline to keep
        # the established output format byte-for-byte.
        $result .= "</Sentence>";
    }
    close $reply_fh;

    utf8::decode($result);
    return $result;
}
# Launch each named daemon in the background.
#
# Takes a list of daemon names (keys of %daemons). For every name the shell
# command "<path> <args> <port> &" is assembled and run via
# `flock -e -w 0.01 <runfile> -c '<command>'`, where the run file lives
# under __DIR__/run and is named "<daemon>_<port>".
# NOTE(review): the lock presumably prevents starting a second instance on
# the same port - confirm against flock(1) and the deployment setup.
#
# A non-zero status from system() is not fatal; it only produces a warning.
sub run_daemons {
    for my $daemon_name (@_) {
        my %daemon  = %{$daemons{$daemon_name}};
        my $runfile = __DIR__ . "/run/${daemon_name}_$daemon{port}";
        my $cmd     = "$daemon{path} $daemon{args} $daemon{port} &";

        my $status = system("flock -e -w 0.01 $runfile -c '$cmd'");
        next if $status == 0;

        warn "[" . __PACKAGE__ . "]: Port $daemon{port} maybe unavailable! $?\n";
    }
}
# Send text to a running daemon over TCP and return its complete reply.
#
# Parameters:
#   $daemon_name - key into %daemons; its "port" entry selects the port.
#   $input       - text to send; a newline is appended before sending.
#
# Returns everything read from the socket until end-of-file as one string.
# Dies if the connection to 127.0.0.1:<port> cannot be established or the
# request cannot be sent.
sub call_daemon {
    my ($daemon_name, $input) = @_;
    my $port = $daemons{$daemon_name}{port};

    # Load the socket class here: no visible `use` in this file does, so
    # the constructor call would otherwise rely on some other module
    # having loaded it first.
    require IO::Socket::INET;

    # Direct method call instead of the ambiguous indirect-object syntax.
    my $socket = IO::Socket::INET->new(
        PeerHost => '127.0.0.1',
        PeerPort => $port,
        Proto    => 'tcp',
    ) or die "ERROR in Socket Creation : $!\n";

    # send() yields undef on failure; report it instead of silently
    # returning an empty reply.
    defined $socket->send("$input\n")
        or die "ERROR in Socket Send : $!\n";

    my $result = "";

    # Test definedness explicitly: a final unterminated line consisting of
    # "0" is false in Perl and would otherwise end the loop early.
    while (defined(my $line = $socket->getline)) {
        $result .= $line;
    }
    $socket->close();

    return $result;
}
# Loading this module starts the tokenizer daemon as a side effect.
run_daemons('tokenizer');

# A module file must end with a true value.
1;
modules/ILMT-HIN-PAN-Tokenizer/lib/ILMT/HIN/PAN/tokenize.py
0 → 100644
View file @
e8aab6b8
import
os
,
sys
,
codecs
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created by
@author: priyank
'''
def tokenizer(text, ind):
    """Render one sentence as a block of numbered, tab-separated tokens.

    ``text`` is split on whitespace only. Each token becomes a row made of
    its 1-based position, the token itself and the literal tag ``unk``,
    separated by tab characters. The rows are wrapped between an opening
    ``<Sentence id="...">`` line, whose id is ``ind + 1``, and a closing
    ``</Sentence>`` line.

    Returns the rows joined with newlines, without a trailing newline.
    """
    rows = ['<Sentence id="%s">' % (ind + 1)]
    for position, token in enumerate(text.split(), 1):
        rows.append('%s\t%s\tunk' % (position, token))
    rows.append('</Sentence>')
    return '\n'.join(rows)
# Script body: open the file named by sys.argv[1] through codecs with the
# "utf-8" codec, read all of its lines, pass every line that is non-empty
# after strip() to tokenizer() together with the counter `ii`, append a
# newline after each returned block, and finally print the accumulated text
# encoded as UTF-8.
# NOTE(review): the page extraction dropped all indentation, so it cannot be
# determined from here whether `ii = ii + 1` belongs to the `if line:` branch
# (ids consecutive over non-empty lines) or only to the loop body (ids follow
# input line numbers) - confirm against the repository copy.
# NOTE(review): printing the result of .encode() looks Python 2 oriented;
# under Python 3 print() would show a bytes repr - confirm which interpreter
# the `python` command resolves to where this runs.
f
=
codecs
.
open
(
sys
.
argv
[
1
],
"rb"
,
"utf-8"
)
lines
=
f
.
readlines
()
f
.
close
()
finalOutput
=
""
ii
=
0
for
line
in
lines
:
line
=
line
.
strip
()
if
line
:
finalOutput
=
finalOutput
+
tokenizer
(
line
,
(
ii
))
+
"
\n
"
ii
=
ii
+
1
print
(
finalOutput
.
encode
(
'utf-8'
))
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment