From 85fac48c9b476d8d3c6e1fe4aaeb10a9ce6553c0 Mon Sep 17 00:00:00 2001
From: priyank gupta
Date: Tue, 17 May 2022 23:18:43 +0530
Subject: [PATCH] updated tokenizer

---
Review notes (free text; not part of any hunk):
- tokenize.py: the interpreter and coding lines come first, and the result
  is written to stdout as UTF-8 bytes instead of print()-ing a bytes object.
- Tokenizer.pm: process() dies when tokenize.py exits non-zero, after the
  temporary input file has been removed.
- The blob ids on the "index" lines were not recomputed for these amendments.

 .../lib/ILMT/URD/HIN/Prune.pm         |  2 +
 .../lib/ILMT/URD/HIN/Tokenizer.pm     | 41 +++++++-----
 .../lib/ILMT/URD/HIN/Tokenizer.pm-old | 67 +++++++++++++++++++
 .../lib/ILMT/URD/HIN/tokenize.py      | 43 ++++++++++++
 4 files changed, 136 insertions(+), 17 deletions(-)
 create mode 100644 modules/ILMT-URD-HIN-Tokenizer/lib/ILMT/URD/HIN/Tokenizer.pm-old
 create mode 100644 modules/ILMT-URD-HIN-Tokenizer/lib/ILMT/URD/HIN/tokenize.py

diff --git a/modules/ILMT-URD-HIN-Prune/lib/ILMT/URD/HIN/Prune.pm b/modules/ILMT-URD-HIN-Prune/lib/ILMT/URD/HIN/Prune.pm
index f72b4e0..251090e 100644
--- a/modules/ILMT-URD-HIN-Prune/lib/ILMT/URD/HIN/Prune.pm
+++ b/modules/ILMT-URD-HIN-Prune/lib/ILMT/URD/HIN/Prune.pm
@@ -10,6 +10,7 @@ use ILMT::URD::HIN::SSFAPI::shakti_tree_api;
 
 sub process {
     my %par = @_;
+    utf8::encode($par{'data'});
     my $input = $par{'data'};
 
     my $db_file = __DIR__ . "/Prune/mapping.dat";
@@ -49,6 +50,7 @@ sub process {
     select(OUTFILE);
     printstory();
     select(STDOUT);
+    utf8::decode($result);
     return $result;
 }
 
diff --git a/modules/ILMT-URD-HIN-Tokenizer/lib/ILMT/URD/HIN/Tokenizer.pm b/modules/ILMT-URD-HIN-Tokenizer/lib/ILMT/URD/HIN/Tokenizer.pm
index f2abb57..16609fe 100644
--- a/modules/ILMT-URD-HIN-Tokenizer/lib/ILMT/URD/HIN/Tokenizer.pm
+++ b/modules/ILMT-URD-HIN-Tokenizer/lib/ILMT/URD/HIN/Tokenizer.pm
@@ -4,6 +4,13 @@ use warnings;
 use Dir::Self;
 use Data::Dumper;
 
+use IPC::Run qw(run);
+use List::UtilsBy qw(max_by);
+use File::Temp qw/ tempfile /;
+use File::Slurp qw( slurp );
+
+my $cwd = __DIR__;
+
 my %daemons = (
     "tokenizer" => {
         "path" => "ind-tokz",
@@ -15,22 +22,22 @@ my %daemons = (
 sub process {
     my %args = @_;
     utf8::encode($args{data});
-    my $sentences = call_daemon("tokenizer", $args{data});
-    open INFILE, '<', \$sentences or die $!;
-    my $result = "";
-    my $ctr = 0;
-    while (my $line = <INFILE>) {
-        $ctr ++;
-        $result .= "\n";
-        my @words = split ' ', $line;
-        foreach my $index (0..$#words) {
-            $result .= $index + 1 . "\t$words[$index]\tunk\n";
-        }
-        $result .= "";
-    }
-    close INFILE;
-    utf8::decode($result);
-    return $result;
+
+    # Pass the UTF-8 encoded input to tokenize.py through a temporary file.
+    my ($fh2, $filename2) = tempfile("tokenizer_inputXXXX", DIR => "/tmp", SUFFIX => ".tmp");
+    print $fh2 $args{"data"};
+    close($fh2);
+
+    # run() is false when the child exits non-zero; keep the outcome so the
+    # temporary file is still removed before a failure is reported.
+    my $token_out;
+    my $ok = run ["python", "$cwd/tokenize.py", $filename2], ">", \$token_out;
+
+    unlink $filename2 or die "Couldn't delete temp file! $filename2";
+    $ok or die "[" . __PACKAGE__ . "]: tokenize.py failed! $?\n";
+
+    utf8::decode($token_out);
+    return $token_out;
 }
 
 sub run_daemons {
@@ -62,6 +69,6 @@ sub call_daemon {
     return $result;
 }
 
-run_daemons(("tokenizer"));
+#run_daemons(("tokenizer"));
 
 1;
diff --git a/modules/ILMT-URD-HIN-Tokenizer/lib/ILMT/URD/HIN/Tokenizer.pm-old b/modules/ILMT-URD-HIN-Tokenizer/lib/ILMT/URD/HIN/Tokenizer.pm-old
new file mode 100644
index 0000000..f2abb57
--- /dev/null
+++ b/modules/ILMT-URD-HIN-Tokenizer/lib/ILMT/URD/HIN/Tokenizer.pm-old
@@ -0,0 +1,67 @@
+package ILMT::URD::HIN::Tokenizer;
+use strict;
+use warnings;
+use Dir::Self;
+use Data::Dumper;
+
+my %daemons = (
+    "tokenizer" => {
+        "path" => "ind-tokz",
+        "args" => "--l urd --s --daemonize --port",
+        "port" => "31001"
+    }
+);
+
+sub process {
+    my %args = @_;
+    utf8::encode($args{data});
+    my $sentences = call_daemon("tokenizer", $args{data});
+    open INFILE, '<', \$sentences or die $!;
+    my $result = "";
+    my $ctr = 0;
+    while (my $line = <INFILE>) {
+        $ctr ++;
+        $result .= "\n";
+        my @words = split ' ', $line;
+        foreach my $index (0..$#words) {
+            $result .= $index + 1 . "\t$words[$index]\tunk\n";
+        }
+        $result .= "";
+    }
+    close INFILE;
+    utf8::decode($result);
+    return $result;
+}
+
+sub run_daemons {
+    my @daemon_names = @_;
+    foreach my $daemon_name (@daemon_names) {
+        my %daemon = %{$daemons{$daemon_name}};
+        my $cmd = "$daemon{path} $daemon{args} $daemon{port} &";
+        my $runfile = __DIR__ . "/run/${daemon_name}_$daemon{port}";
+        system("flock -e -w 0.01 $runfile -c '$cmd'") == 0
+            or warn "[" . __PACKAGE__ . "]: Port $daemon{port} maybe unavailable! $?\n";
+    }
+}
+
+sub call_daemon {
+    my ($daemon_name, $input) = @_;
+    my $port = $daemons{$daemon_name}{port};
+    my ($socket, $client_socket);
+    $socket = new IO::Socket::INET (
+        PeerHost => '127.0.0.1',
+        PeerPort => $port,
+        Proto => 'tcp',
+    ) or die "ERROR in Socket Creation : $!\n";
+    $socket->send("$input\n");
+    my $result = "";
+    while (my $line = $socket->getline) {
+        $result .= $line;
+    }
+    $socket->close();
+    return $result;
+}
+
+run_daemons(("tokenizer"));
+
+1;
diff --git a/modules/ILMT-URD-HIN-Tokenizer/lib/ILMT/URD/HIN/tokenize.py b/modules/ILMT-URD-HIN-Tokenizer/lib/ILMT/URD/HIN/tokenize.py
new file mode 100644
index 0000000..6b86a1b
--- /dev/null
+++ b/modules/ILMT-URD-HIN-Tokenizer/lib/ILMT/URD/HIN/tokenize.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+'''
+Created by
+
+@author: priyank
+'''
+import os, sys, codecs
+
+
+def tokenizer(text, ind):
+    """Tokenize the text only on space.
+
+    Returns one "index<TAB>token<TAB>unk" line per token, numbered from 1,
+    joined with an empty string before and after them.  `ind` is accepted
+    but not used.
+    """
+    tokens = text.split()
+    tokens_ssf = [str(index + 1) + '\t' + token + '\tunk' for index, token in enumerate(tokens)]
+    # NOTE(review): both boundary entries are empty strings -- confirm that
+    # no sentence open/close text is expected here.
+    tokens_ssf_with_sentence = [''] + tokens_ssf + ['']
+    return '\n'.join(tokens_ssf_with_sentence)
+
+
+# Read the UTF-8 input file named by the first command-line argument.
+with codecs.open(sys.argv[1], "rb", "utf-8") as f:
+    lines = f.readlines()
+
+# Tokenize every non-blank line; ii counts the lines tokenized so far.
+finalOutput = ""
+ii = 0
+for line in lines:
+    line = line.strip()
+    if line:
+        finalOutput = finalOutput + tokenizer(line, (ii)) + "\n"
+        ii = ii + 1
+
+# Write raw UTF-8 bytes: on Python 3, print() of a bytes object emits its
+# b'...' repr rather than the text.  sys.stdout.buffer exists only there;
+# the extra "\n" is the line terminator print() would have added.
+output = getattr(sys.stdout, 'buffer', sys.stdout)
+output.write((finalOutput + "\n").encode('utf-8'))
-- 
GitLab