tokenizer_indic.pl 2.71 KB
Newer Older
priyank's avatar
priyank committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
#!/usr/bin/perl
use Getopt::Long;
use File::Basename;

# Parse the command line.  The option variables are deliberately left as
# package globals: the tokenizer_indic sub below reads $jflag directly, and
# code loaded later with require shares the same package.
GetOptions(
    'help!'       => \$help,
    'lang=s'      => \$lang,
    'str_input=s' => \$str_input,
    'input=s'     => \$input,
    'output:s'    => \$output,
    'version'     => \$ver,
    'jflag:s'     => \$jflag,
);

# Anything still in @ARGV was not recognised as an option.  Report every
# leftover argument (not just the first one) and stop.  Testing the array
# itself, rather than $ARGV[0], also covers a leftover argument of "0".
if (@ARGV) {
    print "Unprocessed by Getopt::Long\n";
    foreach my $leftover (@ARGV) {
        print "$leftover\n";
    }
    exit(0);
}

# --help: print the usage summary and stop.
if ($help) {
    print "usage :\nperl  tokenizer_indic.pl -l=hin -i=input-file -o=output \n\n";
    print " -l, --lang=[hin|tel|...]\t: select the language 3 letter code (ISO-639)\n";
    print " -s, --str_input=<input-string>\t: give input string\n";
    print " -i, --input=<input-file>\t: give input file\n";
    print " -o, --output=<output-file>\t: give output file\n";
    print " -j, --jflag=[yes|no]\t\t: give input to print -JOIN in between multiwords. default is yes\n";
    print "Report bugs to <rashid101b\@gmail.com>\n";
    exit(0);
}

# --version: print the version banner and stop.
if ($ver) {
    print "tokenizer-indic version 1.8 \n";
    exit(0);
}

# Expand a leading "~" in the input/output file names to the user's home
# directory (the shell does not do this for values written as -i=~/file).
# Only a bare "~" or a "~/" prefix is expanded; "~user/..." is left alone
# because replacing just the "~" there would produce a wrong path.
# $home, $path and $acr_file stay package globals on purpose: the sub below
# reads $acr_file, and the required library shares this package.
$home = $ENV{"HOME"};
if (defined $home) {
    foreach my $file_arg ($input, $output) {
        next unless defined $file_arg;
        $file_arg =~ s{^~(?=/|\z)}{$home};
    }
}

# Directory containing this script, made absolute.  A relative value such
# as "bin" would make the require below search @INC, which no longer
# contains "." on Perl 5.26 and later.
require File::Spec;
$path = File::Spec->rel2abs(dirname($0));

# Acronym file path based on language
$acr_file = $path."/data/".$lang.".acr";

#print "Path: $path\n";
#print "Input String: $str_input\n";
#print "Input File: $input\n";
#print "Output: $output\n";
#print "Acronym File : $acr_file\n";
#print "JOIN FLAG : $jflag\n";

# tokenizer-indic lib
require "$path/lib/tokenizer.pl";

#binmode (STDOUT, ":utf8");
#binmode (STDIN, ":utf8");

# tokenizer_indic($lang, $str_input, $input, $output)
#
# Tokenizes either a literal string or the contents of a file and writes
# the result to a file or to STDOUT.
#
#   $lang      - language code; required.
#   $str_input - text to tokenize; takes precedence over $input when set.
#   $input     - path of a file to tokenize, read line by line; blank
#                lines are skipped.
#   $output    - path of the file to write; when empty the result is
#                printed to STDOUT.
#
# Also reads the package globals $acr_file and $jflag and passes them to
# sentence_mark().  sentence_mark() and token_split() are not defined in
# this file (presumably they come from the required lib/tokenizer.pl).
#
# Prints a usage message and returns when $lang is missing or when neither
# $str_input nor $input is given.  Dies if a file cannot be opened or the
# output file cannot be closed.
sub tokenizer_indic {
    my ($lang, $str_input, $input, $output) = @_;

    # Treat omitted arguments as empty strings.
    foreach my $arg ($lang, $str_input, $input, $output) {
        $arg = "" unless defined $arg;
    }

    # Validate first, so that a bad invocation does not create or truncate
    # the output file.
    if ($lang eq "" or ($str_input eq "" and $input eq "")) {
        print "\nOptions Missing\n";
        print "usage :\nperl  $0 -l=hin -i=input-file -o=output \n\n";
        return;
    }

    # Choose the destination.  Both destinations get the :utf8 layer, so
    # file input and string input are written the same way.
    my $out_fh;
    if ($output ne "") {
        open($out_fh, ">:utf8", $output)
            or die "Cannot open output file '$output': $!";
    }
    else {
        $out_fh = \*STDOUT;
        binmode($out_fh, ":utf8");
    }

    # Build the sentence-marked text from the string or from the file.
    my $marked_text = "";
    if ($str_input ne "") {
        $marked_text = sentence_mark($lang, $str_input, $acr_file, $jflag);
    }
    else {
        open(my $in_fh, "<:utf8", $input)
            or die "Cannot open input file '$input': $!";
        while (my $line = <$in_fh>) {
            next if $line =~ m/^$/;    # skip blank lines
            $marked_text .= sentence_mark($lang, $line, $acr_file, $jflag);
        }
        close($in_fh);
    }

    print {$out_fh} token_split($marked_text);

    # Buffered write errors only show up at close time.
    if ($output ne "") {
        close($out_fh) or die "Cannot close output file '$output': $!";
    }
    return;
}
tokenizer_indic($lang, $str_input, $input, $output);