Commit ac27fbf5 authored by priyank

updated tokenizer

parent 6cc343ea
...@@ -49,8 +49,11 @@ sub process { ...@@ -49,8 +49,11 @@ sub process {
my %args = @_; my %args = @_;
utf8::encode($args{"data"}); utf8::encode($args{"data"});
foreach my $submodule (@dispatch_seq) { foreach my $submodule (@dispatch_seq) {
#print Dumper($submodule);
$args{'data'} = __PACKAGE__->can($submodule)->(%args); $args{'data'} = __PACKAGE__->can($submodule)->(%args);
#print Dumper($args{'data'});
} }
utf8::decode($args{"data"}); utf8::decode($args{"data"});
return $args{"data"}; return $args{"data"};
} }
......
package ILMT::TEL::HIN::PickOneMorph; package ILMT::TEL::HIN::PickOneMorph;
use strict; #use strict;
use warnings; #use warnings;
use Dir::Self; use Dir::Self;
use Data::Dumper; use Data::Dumper;
use ILMT::TEL::HIN::SSFAPI::feature_filter; use ILMT::TEL::HIN::SSFAPI::feature_filter;
......
...@@ -3,7 +3,12 @@ use strict; ...@@ -3,7 +3,12 @@ use strict;
use warnings; use warnings;
use Dir::Self; use Dir::Self;
use Data::Dumper; use Data::Dumper;
use IPC::Run qw(run);
use List::UtilsBy qw(max_by);
use File::Temp qw/ tempfile /;
use File::Slurp qw( slurp );
my $cwd = __DIR__;
my %daemons = ( my %daemons = (
"tokenizer" => { "tokenizer" => {
"path" => "ind-tokz", "path" => "ind-tokz",
...@@ -15,22 +20,18 @@ my %daemons = ( ...@@ -15,22 +20,18 @@ my %daemons = (
sub process { sub process {
my %args = @_; my %args = @_;
utf8::encode($args{data}); utf8::encode($args{data});
my $sentences = call_daemon("tokenizer", $args{data});
open INFILE, '<', \$sentences or die $!; my ($fh2, $filename2) = tempfile("tokenizer_inputXXXX", DIR => "/tmp", SUFFIX => ".tmp");
my $result = ""; print $fh2 $args{"data"};
my $ctr = 0; close($fh2);
while (my $line = <INFILE>) {
$ctr ++; my $token_out;
$result .= "<Sentence id=\"$ctr\">\n"; run ["python", "$cwd/tokenize.py", $filename2], ">", \$token_out;
my @words = split ' ', $line;
foreach my $index (0..$#words) { unlink $filename2 or die "Couldn't delete temp file! $filename2";
$result .= $index + 1 . "\t$words[$index]\tunk\n";
} utf8::decode($token_out);
$result .= "</Sentence>"; return $token_out;
}
close INFILE;
utf8::decode($result);
return $result;
} }
sub run_daemons { sub run_daemons {
...@@ -62,6 +63,6 @@ sub call_daemon { ...@@ -62,6 +63,6 @@ sub call_daemon {
return $result; return $result;
} }
run_daemons(("tokenizer")); #run_daemons(("tokenizer"));
1; 1;
package ILMT::TEL::HIN::Tokenizer;
use strict;
use warnings;
use Dir::Self;
use Data::Dumper;
# Registry of the external helper daemons used by this module, keyed by
# daemon name.  Each entry holds the executable to start ("path"), the
# argument string that is placed in front of the port number ("args") and
# the TCP port the daemon is reached on ("port").
my %daemons = (
    tokenizer => {
        path => 'ind-tokz',
        args => '--l tel --s --daemonize --port',
        port => '61001',
    },
);
# Tokenize raw text and wrap the result in <Sentence> blocks.
#
# Arguments (named, passed as a flat key/value list):
#   data - the text to tokenize.
#
# The text is UTF-8 encoded and handed to the "tokenizer" daemon through
# call_daemon().  Every line of the daemon's reply becomes one
# <Sentence id="N"> ... </Sentence> block (N counts reply lines from 1) and
# every whitespace-separated word of that line becomes one
# "position<TAB>word<TAB>unk" row.
#
# Returns the assembled text after utf8::decode().  Dies if the in-memory
# handle over the reply cannot be opened; errors from call_daemon()
# propagate to the caller.
sub process {
    my %args = @_;
    utf8::encode($args{data});
    my $sentences = call_daemon("tokenizer", $args{data});

    # Lexical handle instead of the package-global INFILE bareword, so the
    # handle cannot clash with other code and is released on scope exit.
    open my $reply_fh, '<', \$sentences or die $!;

    my @blocks;
    my $ctr = 0;
    while (my $line = <$reply_fh>) {
        $ctr++;
        my @words = split ' ', $line;
        my $block = "<Sentence id=\"$ctr\">\n";
        for my $index (0 .. $#words) {
            $block .= ($index + 1) . "\t$words[$index]\tunk\n";
        }
        push @blocks, $block . "</Sentence>";
    }
    close $reply_fh;

    # Consecutive blocks are separated by a newline.  Previously a closing
    # tag and the following opening tag ended up glued together on one line
    # whenever the reply had more than one line.  Output for a single-line
    # reply is unchanged.
    my $result = join "\n", @blocks;
    utf8::decode($result);
    return $result;
}
# Launch each named daemon in the background, guarded by a lock file.
#
# Arguments: a list of daemon names; each is looked up in %daemons.
#
# For every name the command "<path> <args> <port> &" is run through
# `flock -e -w 0.01 <runfile> -c <command>`, where <runfile> is
# run/<name>_<port> below this module's directory.  A non-zero status from
# system() is reported with warn() and otherwise ignored.
sub run_daemons {
    my @daemon_names = @_;
    foreach my $daemon_name (@daemon_names) {
        my %daemon  = %{ $daemons{$daemon_name} };
        my $cmd     = "$daemon{path} $daemon{args} $daemon{port} &";
        my $runfile = __DIR__ . "/run/${daemon_name}_$daemon{port}";

        # List form: flock is executed directly, so a module directory whose
        # path contains spaces or quote characters can no longer break the
        # command line.  $cmd is still interpreted by the shell that
        # `flock -c` starts, which is what backgrounds the daemon.
        system('flock', '-e', '-w', '0.01', $runfile, '-c', $cmd) == 0
            or warn "[" . __PACKAGE__ . "]: Port $daemon{port} maybe unavailable! $?\n";
    }
}
# Send one request to a daemon over TCP and collect its complete reply.
#
# Arguments:
#   $daemon_name - key into %daemons; its "port" entry is the port dialed
#                  on 127.0.0.1.
#   $input       - request text; a newline is appended before sending.
#
# Returns everything read from the socket until getline() reports end of
# stream, as one string.  Dies with "ERROR in Socket Creation" when the
# connection cannot be established.
sub call_daemon {
    my ($daemon_name, $input) = @_;
    my $port = $daemons{$daemon_name}{port};

    # No `use IO::Socket::INET` is visible in this file, so load the class
    # here rather than rely on another module having pulled it in first.
    require IO::Socket::INET;

    # Direct class-method call instead of the ambiguous indirect-object
    # form `new IO::Socket::INET (...)`.
    my $socket = IO::Socket::INET->new(
        PeerHost => '127.0.0.1',
        PeerPort => $port,
        Proto    => 'tcp',
    ) or die "ERROR in Socket Creation : $!\n";

    $socket->send("$input\n");

    my $result = "";
    # Test definedness, not truth: a final line consisting of "0" without a
    # trailing newline is false in Perl and would otherwise be dropped.
    while (defined(my $line = $socket->getline)) {
        $result .= $line;
    }
    $socket->close();
    return $result;
}
# Start the tokenizer daemon as soon as this module is loaded.
run_daemons('tokenizer');

# A module file has to finish by returning a true value.
1;
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Whitespace tokenizer: reads a UTF-8 text file named on the command line and
prints one <Sentence> block per non-empty line.

Created by
@author: priyank
'''
# The shebang and the coding cookie only take effect on the first two lines
# of a file, so they must come before the import.
import os, sys, codecs
def tokenizer(text, ind):
    """Render one sentence as a <Sentence> block, splitting only on whitespace.

    Args:
        text: the sentence text; tokens are whatever ``text.split()`` yields.
        ind: zero-based sentence index; the block is labelled ``ind + 1``.

    Returns:
        A newline-joined string made of the opening ``<Sentence id="N">``
        tag, one ``position<TAB>token<TAB>unk`` row per token (positions
        start at 1) and the closing ``</Sentence>`` tag.  There is no
        trailing newline.
    """
    rows = ['%d\t%s\tunk' % (position, token)
            for position, token in enumerate(text.split(), 1)]
    return '\n'.join(['<Sentence id="%s">' % (ind + 1)] + rows + ['</Sentence>'])
def main(path):
    """Tokenize the UTF-8 text file at ``path`` and write the result to stdout.

    Each line is stripped; every non-empty line is passed to ``tokenizer``
    and the resulting block is followed by a newline.  The concatenated
    blocks are written to standard output as UTF-8 bytes plus one final
    newline, which is what the original ``print`` produced under Python 2.
    """
    # `with` closes the file even if reading or decoding raises.
    with codecs.open(path, "rb", "utf-8") as f:
        lines = f.readlines()

    blocks = []
    ii = 0
    for line in lines:
        line = line.strip()
        if line:
            blocks.append(tokenizer(line, ii) + "\n")
            # NOTE(review): the original indentation was lost; this assumes
            # the counter advances only for non-empty lines, so sentence ids
            # stay consecutive. Confirm against the upstream file.
            ii = ii + 1
    finalOutput = "".join(blocks)

    # Write bytes directly.  Under Python 3, print() of an encoded string
    # would emit its b'...' repr instead of the text.  Python 2 has no
    # sys.stdout.buffer, so fall back to sys.stdout itself there.
    out = getattr(sys.stdout, "buffer", sys.stdout)
    out.write(finalOutput.encode('utf-8') + b"\n")


if __name__ == "__main__":
    main(sys.argv[1])
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment