Commit 85fac48c authored by priyank

updated tokenizer

parent 801d5361
@@ -10,6 +10,7 @@ use ILMT::URD::HIN::SSFAPI::shakti_tree_api;
sub process {
    my %par = @_;
    utf8::encode($par{'data'});
    my $input = $par{'data'};
    my $db_file = __DIR__ . "/Prune/mapping.dat";
@@ -49,6 +50,7 @@ sub process {
    select(OUTFILE);
    printstory();
    select(STDOUT);
    utf8::decode($result);
    return $result;
}
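The two hunks above wrap the Prune step's I/O in an explicit byte-level round trip: the incoming character string is encoded to UTF-8 bytes before the byte-oriented SSF/printstory code runs, and the collected result is decoded back into a character string before it is returned. A minimal sketch of that pattern, with uc() standing in for the real Prune processing (illustrative only; the SSF API and OUTFILE plumbing are omitted):

use strict;
use warnings;

sub process_roundtrip {
    my %par = @_;
    utf8::encode($par{'data'});    # Perl characters -> UTF-8 octets for the legacy code
    my $result = uc $par{'data'};  # stand-in for the real Prune processing
    utf8::decode($result);         # UTF-8 octets -> Perl characters for the caller
    return $result;
}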
@@ -4,6 +4,13 @@ use warnings;
use Dir::Self;
use Data::Dumper;
use IPC::Run qw(run);
use List::UtilsBy qw(max_by);
use File::Temp qw/ tempfile /;
use File::Slurp qw( slurp );
my $cwd = __DIR__;
my %daemons = (
"tokenizer" => {
"path" => "ind-tokz",
@@ -15,22 +22,18 @@ my %daemons = (
sub process {
    my %args = @_;
    utf8::encode($args{data});
    my $sentences = call_daemon("tokenizer", $args{data});
    open INFILE, '<', \$sentences or die $!;
    my $result = "";
    my $ctr = 0;
    while (my $line = <INFILE>) {
        $ctr ++;
        $result .= "<Sentence id=\"$ctr\">\n";
        my @words = split ' ', $line;
        foreach my $index (0..$#words) {
            $result .= $index + 1 . "\t$words[$index]\tunk\n";
        }
        $result .= "</Sentence>";
    }
    close INFILE;
    utf8::decode($result);
    return $result;
    my ($fh2, $filename2) = tempfile("tokenizer_inputXXXX", DIR => "/tmp", SUFFIX => ".tmp");
    print $fh2 $args{"data"};
    close($fh2);
    my $token_out;
    run ["python", "$cwd/tokenize.py", $filename2], ">", \$token_out;
    unlink $filename2 or die "Couldn't delete temp file! $filename2";
    utf8::decode($token_out);
    return $token_out;
}
sub run_daemons {
......@@ -62,6 +65,6 @@ sub call_daemon {
    return $result;
}
run_daemons(("tokenizer"));
#run_daemons(("tokenizer"));
1;
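With this change, process() no longer talks to the ind-tokz daemon: it writes the input to a temporary file, runs tokenize.py on it through IPC::Run, and returns the script's stdout after decoding it back to a character string. A hedged usage sketch, assuming the file is the ILMT::URD::HIN::Tokenizer package and that callers invoke process() with a data key (the input string is purely illustrative):

use ILMT::URD::HIN::Tokenizer;

my $ssf = ILMT::URD::HIN::Tokenizer::process(data => "yeh ek misaal hai");
print $ssf;

# Expected shape of the result: one <Sentence> block per non-empty input line,
# with tab-separated index / token / "unk" rows, e.g.
# <Sentence id="1">
# 1       yeh     unk
# 2       ek      unk
# 3       misaal  unk
# </Sentence>

The previous, daemon-based version of the module is listed in full below.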
package ILMT::URD::HIN::Tokenizer;
use strict;
use warnings;
use Dir::Self;
use Data::Dumper;
my %daemons = (
"tokenizer" => {
"path" => "ind-tokz",
"args" => "--l urd --s --daemonize --port",
"port" => "31001"
}
);
sub process {
    my %args = @_;
    utf8::encode($args{data});
    my $sentences = call_daemon("tokenizer", $args{data});
    open INFILE, '<', \$sentences or die $!;
    my $result = "";
    my $ctr = 0;
    while (my $line = <INFILE>) {
        $ctr ++;
        $result .= "<Sentence id=\"$ctr\">\n";
        my @words = split ' ', $line;
        foreach my $index (0..$#words) {
            $result .= $index + 1 . "\t$words[$index]\tunk\n";
        }
        $result .= "</Sentence>";
    }
    close INFILE;
    utf8::decode($result);
    return $result;
}
sub run_daemons {
    my @daemon_names = @_;
    foreach my $daemon_name (@daemon_names) {
        my %daemon = %{$daemons{$daemon_name}};
        my $cmd = "$daemon{path} $daemon{args} $daemon{port} &";
        my $runfile = __DIR__ . "/run/${daemon_name}_$daemon{port}";
        system("flock -e -w 0.01 $runfile -c '$cmd'") == 0
            or warn "[" . __PACKAGE__ . "]: Port $daemon{port} maybe unavailable! $?\n";
    }
}
sub call_daemon {
    my ($daemon_name, $input) = @_;
    my $port = $daemons{$daemon_name}{port};
    my ($socket, $client_socket);
    $socket = new IO::Socket::INET (
        PeerHost => '127.0.0.1',
        PeerPort => $port,
        Proto => 'tcp',
    ) or die "ERROR in Socket Creation : $!\n";
    $socket->send("$input\n");
    my $result = "";
    while (my $line = $socket->getline) {
        $result .= $line;
    }
    $socket->close();
    return $result;
}
run_daemons(("tokenizer"));
1;
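Note that call_daemon() above relies on IO::Socket::INET being loaded elsewhere in the process; there is no explicit use line for it in the file as shown. For reference, a self-contained sketch of the same request/response exchange with a running ind-tokz daemon, using only the host, port, and newline-terminated protocol visible in the listing (the input text is illustrative):

use strict;
use warnings;
use IO::Socket::INET;

my $socket = IO::Socket::INET->new(
    PeerHost => '127.0.0.1',
    PeerPort => 31001,          # $daemons{tokenizer}{port} in the listing above
    Proto    => 'tcp',
) or die "ERROR in Socket Creation : $!\n";

$socket->send("yeh ek misaal hai\n");    # one newline-terminated request
my $reply = "";
while (my $line = $socket->getline) {    # read until the daemon closes the connection
    $reply .= $line;
}
$socket->close();
print $reply;

The tokenize.py script that replaces this daemon flow follows.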
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created by
@author: priyank
'''
import os, sys, codecs
def tokenizer(text, ind):
"""Tokenize the text only on space."""
tokens = text.split()
tokens_ssf = [str(index + 1) + '\t' + token + '\tunk' for index, token in enumerate(tokens)]
tokens_ssf_with_sentence = ['<Sentence id="'+str(ind+1)+'">'] + tokens_ssf + ['</Sentence>']
return '\n'.join(tokens_ssf_with_sentence)
# Read the input file named on the command line as UTF-8.
f = codecs.open(sys.argv[1], "rb", "utf-8")
lines = f.readlines()
f.close()

# Build one <Sentence> block per non-empty input line.
finalOutput = ""
ii = 0
for line in lines:
    line = line.strip()
    if line:
        finalOutput = finalOutput + tokenizer(line, ii) + "\n"
        ii = ii + 1
print(finalOutput.encode('utf-8'))
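tokenize.py reads the file named by its first argument, treats every non-empty line as one sentence, splits it on whitespace, and prints a shallow SSF block per sentence with tab-separated index / token / "unk" rows. For an input file containing these two (illustrative) lines:

yeh ek misaal hai
doosri misaal

the script would print:

<Sentence id="1">
1	yeh	unk
2	ek	unk
3	misaal	unk
4	hai	unk
</Sentence>
<Sentence id="2">
1	doosri	unk
2	misaal	unk
</Sentence>

The final print encodes the output to UTF-8 bytes, which matches the Perl side decoding the captured stdout with utf8::decode(); this assumes the script is run under Python 2, where printing a byte string writes the bytes as-is.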