tokenizer.pl 6.97 KB
Newer Older
priyank's avatar
priyank committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241
# sentence_mark($lang, $string, $prefix_file, $joinflag)
#
# Normalises punctuation spacing in $string and wraps each detected sentence
# in <S> ... </S> markers.
#
# Parameters:
#   $lang        - language code; accepted for interface compatibility but
#                  not consulted by this routine.
#   $string      - the raw text to mark.
#   $prefix_file - path of a UTF-8 file listing one acronym/prefix per line
#                  (e.g. "Dr."); listed tokens are protected so their dots
#                  and apostrophes do not trigger sentence splitting.
#   $joinflag    - "" (or undef) or "yes" (any case) marks an intra-word
#                  hyphen as " -JOIN "; any other value emits " - ".
#
# Returns the marked string: whitespace-separated tokens, each followed by a
# single space (so the result ends with one trailing space).
#
# Side effects: switches STDOUT to the :utf8 layer; dies if $prefix_file
# cannot be opened.
sub sentence_mark {
    my ($lang, $string, $prefix_file, $joinflag) = @_;

    binmode(STDOUT, ":utf8");

    open my $ACR_FILE, "<:utf8", $prefix_file or
        die "Error - could not open file '$prefix_file': $!";

    # Crux of the acronym handler: map each listed acronym to a masked form
    # in which every "." and "'" is replaced by "__".
    my %acr_hash;
    while (my $acr = <$ACR_FILE>) {
        chomp $acr;
        (my $masked = $acr) =~ s{\.|'}{__}g;
        $acr_hash{$acr} = $masked;
    }
    close $ACR_FILE;

    # Replace every e-mail address by a numbered placeholder.  The captured
    # address is quoted with \Q..\E so that its dots match literally; without
    # that, look-alike strings would be folded into the same placeholder and
    # restored as the wrong address.
    my $email_c = 1;
    while ($string =~ m/\b([a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4})\b/g) {
        my $em = $1;
        my $vl = "__email__" . $email_c++;
        $string =~ s/\b\Q$em\E\b/$vl/g;
        $acr_hash{$em} = $vl;
    }

    # Same treatment for web links (http/https/ftp/file schemes or www...).
    my $web_c = 1;
    while ($string =~ m/\b((https?:\/\/|ftp:\/\/|file:\/\/|www)+[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}\/?[a-zA-Z0-9_.\-]*)\b/g) {
        my $wl = $1;
        my $vl = "__weblink__" . $web_c++;
        $string =~ s/\b\Q$wl\E\b/$vl/g;
        $acr_hash{$wl} = $vl;
    }

    # Reverse map (placeholder/masked form -> original text), used at the
    # very end to put the protected tokens back.
    my %rev_acr_hash = reverse %acr_hash;

    # Hide ellipsis runs behind placeholders so their dots are not taken for
    # sentence ends.  Runs of three or more dots are queued individually so
    # each one is restored with its original length.
    my @ellip3_runs;
    $string =~ s/([\.]{3,})/push @ellip3_runs, $1; " __ELLIP3__ "/ge;
    $string =~ s/([\.]{2,2})/ __ELLIP2__ /g;

    # Same for hyphen runs: --- (or longer) and --.
    my @hyphen3_runs;
    $string =~ s/([\-]{3,})/push @hyphen3_runs, $1; " __HYPHEN3__ "/ge;
    $string =~ s/([\-]{2,2})/ __HYPHEN2__ /g;

    # Insert a space after a dot between two non-numeric characters so that
    # acronyms become separate tokens (decimal numbers are left alone).
    $string =~ s/([^\p{IsN}])[\.]([^\p{IsN}])/$1\. $2/g;

    # Detach a dot that ends the string (e.g. text ending in an e-mail
    # address followed by a full stop).
    $string =~ s/(.*)([\.])$/$1 $2/g;

    # Separate out "," except if within numbers (5,300)
    $string =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;

    # Separate out "/" except if within numbers (1/2)
    $string =~ s/([^\p{IsN}])[\/]([^\p{IsN}])/$1 \/ $2/g;

    # Protect a trailing ":-" ("as follows" sign) behind a placeholder.
    $string =~ s/([^\p{IsN}])[:][-]$/$1 __SIGNF__/g;

    # A hyphen with spaces on one side is a free-standing hyphen, not a
    # multi-word joiner; protect it behind a placeholder.
    $string =~ s/([^\p{IsN}])( +)[-]([^\p{IsN}])/$1 __HYPHEN__ $3/g;
    $string =~ s/([^\p{IsN}])[-]( +)([^\p{IsN}])/$1 __HYPHEN__ $3/g;

    # Remaining single hyphens between non-numeric characters: mark as
    # -JOIN (multi-word expression) or split as a plain "-" per $joinflag.
    if (!defined $joinflag || $joinflag eq "" || lc($joinflag) eq "yes") {
        $string =~ s/([^\p{IsN}])([-]{1,1})([^\"\'\(\[])([^\p{IsN}])/$1 -JOIN $3$4/g;
    }
    else {
        $string =~ s/([^\p{IsN}])([-]{1,1})([^\"\'\(\[])([^\p{IsN}])/$1 - $3$4/g;
    }

    # Insert visarga (U+0903) inside a word if a colon (U+003A) sits within it
    $string =~ s/([^\p{IsN}])[:]([^ \)\/])([^\p{IsN}])/$1\x{0903}$2$3/g;

    # Separate out ":" except if within numbers (Mumbai: gateway of india.)
    $string =~ s/([^\p{IsN}])[:]([^\p{IsN}])/$1 : $2/g;

    # Detach a comma that is followed by a space (e.g. after a number).
    $string =~ s/[,]( )/ ,$1/g;

    # Separate out ";"
    $string =~ s/[;]( *)/ ;$1 /g;

    # turn ` into '
    $string =~ s/\`/ \' /g;
    # turn '' into "
    $string =~ s/\'\'/ \" /g;

    # put space around brackets
    $string =~ s/([\(\)\[\]\{\}])/ $1 /g;

    # clean up extraneous spaces
    $string =~ s/ +/ /g;
    $string =~ s/^ //g;
    $string =~ s/ $//g;

    # Mask every token listed in the acronym hash.  A direct lookup is used
    # (rather than a regex built from the token) so tokens containing regex
    # metacharacters are handled correctly.
    $string = join "",
        map { (exists $acr_hash{$_} ? $acr_hash{$_} : $_) . " " }
        split(" ", $string);

    # put space around symbols except hyphen (-) for multiword
    $string =~ s/([\#\$\%\^\&\*\+\=\<\>\'\"\!])/ $1 /g;
    # put space around Urdu punctuation U+2018, U+2019, U+060C
    $string =~ s/([\x{2018}\x{2019}\x{060C}])/ $1 /g;

    $string = "<S> " . $string;
    # Sentence boundary marking: a run of . ? | U+0964 U+06D4 U+061F after a
    # non-digit, optionally followed by a spaced quote, ends the sentence.
    $string =~ s/([^\p{IsDigit}])([\.?\x{06D4}\x{061F}\x{0964}|]+)( [\"\'])*([^\p{IsN}])/$1 $2$3<\/S><S>$4/g;

    # Restore the protected ellipsis / hyphen runs in their original order;
    # the literal fallbacks cover a placeholder with no queued run.
    $string =~ s/__ELLIP3__/@ellip3_runs ? shift(@ellip3_runs) : "..."/ge;
    $string =~ s/__ELLIP2__/../g;
    $string =~ s/__HYPHEN3__/@hyphen3_runs ? shift(@hyphen3_runs) : "---"/ge;
    $string =~ s/__HYPHEN2__/--/g;

    $string =~ s/__HYPHEN__/-/g;

    $string =~ s/__SIGNF__/:-/g;

    # Drop a dangling "<S>" opened after the final sentence end, then make
    # sure the text is closed by exactly one </S>.  The closing tag is added
    # whenever the string does not already end in </S>, whatever its last
    # character is.
    $string =~ s/<\/S><S>\s*$/<\/S>/g;
    $string =~ s/\s*$/ <\/S>/ unless $string =~ m/<\/S>\s*$/;
    $string =~ s/<\/S><\/S>$/<\/S>/g;

    # Put the original acronyms, e-mail addresses and web links back.
    $string = join "",
        map { (exists $rev_acr_hash{$_} ? $rev_acr_hash{$_} : $_) . " " }
        split(" ", $string);

    return $string;
}

# Split Token based on blank space
# token_split($string)
#
# Converts a sentence-marked string (sentences delimited by </S>, each
# optionally opened by <S>) into SSF-like text: every non-empty sentence
# becomes a <Sentence id="N"> ... </Sentence> block holding one
# "index<TAB>token<TAB>unk" line per whitespace-separated token.
#
# Sentence ids and token indices are positional (1-based position in the
# split result), so skipped empty sentences or stray "<S>" tokens leave
# gaps in the numbering.
sub token_split {
    my ($string) = @_;

    my $ssf_text = "";
    my $sent_pos = 0;

    for my $sentence (split("<\/S>", $string)) {
        $sent_pos++;

        # Strip the opening marker and surrounding whitespace.
        $sentence =~ s/^\s*<S>\s*//g;
        $sentence =~ s/\s*$//g;
        next if $sentence eq "";

        my $rows    = "";
        my $tkn_pos = 0;
        for my $token (split(" ", $sentence)) {
            $tkn_pos++;
            next if $token eq "" or $token eq "<S>";
            $rows .= "$tkn_pos\t$token\tunk\n";
        }

        $ssf_text .= "<Sentence\ id=\"$sent_pos\">\n" . $rows . "<\/Sentence>\n\n";
    }

    return $ssf_text;
}

1;