sub sentence_mark { my $lang = $_[0]; my $string = $_[1]; my $prefix_file = $_[2]; my $joinflag = $_[3]; #print "joinflag:$joinflag\n"; binmode(STDOUT, ":utf8"); # Print the raw input string #print "Input String to sentence_mark function : ", $string, "\n"; #open my $ACR_FILE, $prefix_file or open my $ACR_FILE, "; # Crux of Acronym Handler my %acr_hash = map { chomp $_; my $acr = $_; $acr => sub { $_ =~ s{\.|'}{__}g;return $_; }->() } <$ACR_FILE>; #my $acr_h_size = scalar keys %acr_hash; #print "ACR-Hash-count: $acr_h_size\n"; # Reverse Hash to Substitute value to key my %rev_acr_hash = reverse %acr_hash; # Handle emails in text # $_ =~ s/\b([a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4})\b/__email__/g; my $email_c = 1; while($string =~ m/\b([a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4})\b/g) { my $em = $1; my $vl = "__email__".$email_c++; $string =~ s/\b$1\b/$vl/g; $acr_hash{$em} = $vl; } # Update reverse hash %rev_acr_hash = reverse %acr_hash; #print %acr_hash, "\n"; #print %rev_acr_hash, "\n"; # Handle ellipsis case .. w1....w2 .. etc. #$string =~ s/([^\p{IsN}])([\.]{3,})([^\p{IsN}])/$1 $2 $3/g; # Handle website or web page in text #$_ =~ s/\b((https?:\/\/|ftp:\/\/|file:\/\/)*[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}\/?[a-zA-Z0-9_.\-]*)\b/__weblink__/; my $web_c = 1; while($string =~ m/\b((https?:\/\/|ftp:\/\/|file:\/\/|www)+[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}\/?[a-zA-Z0-9_.\-]*)\b/g) { my $wl = $1; my $vl = "__weblink__".$web_c++; $string =~ s/\b$1\b/$vl/g; $acr_hash{$wl} = $vl; } # Update reverse hash %rev_acr_hash = reverse %acr_hash; # Handle ellipsis ... and .. $string =~ s/([\.]{3,})/ __ELLIP3__ /g; my $ellip3 = $1; $string =~ s/([\.]{2,2})/ __ELLIP2__ /g; my $ellip2 = $1; # Handle hyphens --- and -- $string =~ s/([\-]{3,})/ __HYPHEN3__ /g; my $hyphen3 = $1; $string =~ s/([\-]{2,2})/ __HYPHEN2__ /g; my $hyphen2 = $1; # Insert a space in text after . (dot) to handle acronym properly. # (don't put space in decimal number) $string =~ s/([^\p{IsN}])[\.]([^\p{IsN}])/$1\. $2/g; # to handle nA email: rashid101b@gmail.com. $string =~ s/(.*)([\.])$/$1 $2/g; # Seperate out "," except if within numbers (5,300) $string =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; # Seperate out "/" except if within numbers (1/2) $string =~ s/([^\p{IsN}])[\/]([^\p{IsN}])/$1 \/ $2/g; # Handle Sign of following $string =~ s/([^\p{IsN}])[:][-]$/$1 __SIGNF__/g; # Seperate out "-" except if within numbers (1-1-2013) [temporary to handle mwe ideal is w1 -JOIN w2 etc.] $string =~ s/([^\p{IsN}])( +)[-]([^\p{IsN}])/$1 __HYPHEN__ $3/g; $string =~ s/([^\p{IsN}])[-]( +)([^\p{IsN}])/$1 __HYPHEN__ $3/g; # Seperate out "-" except if within numbers (1-1-2013) [To handle mwe e.g. w1 -JOIN w2 etc.] #$string =~ s/([^\p{IsN}])([-]{1,1})([^\p{IsN}])/$1 -JOIN $3/g; if ($joinflag eq "" || lc($joinflag) eq "yes"){ $string =~ s/([^\p{IsN}])([-]{1,1})([^\"\'\(\[])([^\p{IsN}])/$1 -JOIN $3$4/g; } else { $string =~ s/([^\p{IsN}])([-]{1,1})([^\"\'\(\[])([^\p{IsN}])/$1 - $3$4/g; } #$string =~ s/([^\p{IsN}])([^ ])[-]([^ \(\[\"\'-])([^\p{IsN}])/$1$2 -JOIN $3$4/g; # Insert visarga(\u0903) inside a word if colon(\u003A or :) exist within words $string =~ s/([^\p{IsN}])[:]([^ \)\/])([^\p{IsN}])/$1\x{0903}$2$3/g; # Seperate out ":" except if within numbers (Mumbai: gateway of india.) $string =~ s/([^\p{IsN}])[:]([^\p{IsN}])/$1 : $2/g; # Separate multi comma in number 3,53,222 5,999.33 etc. $string =~ s/[,]( )/ ,$1/g; # Seperate out ";" $string =~ s/[;]( *)/ ;$1 /g; # turn `into ' $string =~ s/\`/ \' /g; # turn '' into " $string =~ s/\'\'/ \" /g; # put space around brackets $string =~ s/([\(\)\[\]\{\}])/ $1 /g; # put space around question word, end of the sentences ?, . | \u0964, # except | (vertical bar) to handle || end of the sentence #$_ =~ s/([\?\x{0964}])/ $1 /g; # clean up extraneous spaces $string =~ s/ +/ /g; $string =~ s/^ //g; $string =~ s/ $//g; # Apply acr_hash to handle acronym occur in text my ($key, $value, $line, $rev_line); for(split (" ", $string)) { if(exists $acr_hash{$_}) { s/$_/$acr_hash{$_}/; } $line = $line."$_ "; } $string = $line; # Print Acronym Marked String # print "Acronym Marked Output String: ", $string, "\n\n"; # put space around symbols except hyphen (-) for multiword and ! for exclemation $string =~ s/([\#\$\%\^\&\*\+\=\<\>\'\"\!])/ $1 /g; # pu space for urdu punctuation ‘ (\u2018) ’ (\u2019) ، (\u060C) $string =~ s/([\x{2018}\x{2019}\x{060C}])/ $1 /g; $string = " ".$string; #$string =~ s/([^\p{IsN}])([\.?\x{0964}|]+)([^\p{IsN}])/$1 $2<\/S>$3/g; # Sentence boundry marking improve by incorporating $3 to handle w1[.|?]" next sentence # urdu sentence end boundry added ۔ (\u06D4) ؟ (\u061F) $string =~ s/([^\p{IsDigit}])([\.?\x{06D4}\x{061F}\x{0964}|]+)( [\"\'])*([^\p{IsN}])/$1 $2$3<\/S>$4/g; if ($ellip3 ne ""){ $string =~ s/__ELLIP3__/$ellip3/g; } else { $string =~ s/__ELLIP3__/.../g; } if ($ellip2 ne ""){ $string =~ s/__ELLIP2__/$ellip2/g; } else { $string =~ s/__ELLIP2__/../g; } if ($hyphen3 ne ""){ $string =~ s/__HYPHEN3__/$hyphen3/g; } else{ $string =~ s/__HYPHEN3__/---/g; } if ($hyphen2 ne ""){ $string =~ s/__HYPHEN2__/$hyphen2/g; } else{ $string =~ s/__HYPHEN2__/--/g; } #print $string; #exit; $string =~ s/__HYPHEN__/-/g; $string =~ s/__SIGNF__/:-/g; $string =~ s/<\/S>\s*$/<\/S>/g; $string =~ s/(.*)([^(<\/S>)]\s)$/$1$2 <\/S>/g; $string =~ s/<\/S><\/S>$/<\/S>/g; # Apply reverse hash to store actual key occur in text for (split (" ", $string)) { if(exists $rev_acr_hash{$_}) { s/$_/$rev_acr_hash{$_}/; } $rev_line = $rev_line."$_ "; } $string = $rev_line; # Print Sentence Marked String #print "Sentence Marked String : ", $string, "\n\n"; return $string; } # Split Token based on blank space sub token_split { my $string = $_[0]; #print "Input to token_split function: $string\n"; my $sent_head = ""; my $sent_body = ""; my $sent_tail = ""; my $final_sent=""; my @sentarr = split ("<\/S>",$string); # sentences split foreach my $sent (0..$#sentarr){ $sentarr[$sent] =~ s/^\s*\s*//g; $sentarr[$sent] =~ s/\s*$//g; #print "mysent",$sentarr[$sent],"\n"; if ($sentarr[$sent] ne ""){ my $sentnewno = $sent+1; #print "\n"; $sent_head = "\n"; my @tknarr = split(" ", $sentarr[$sent]); foreach my $tkn (0..$#tknarr){ my $tknno = $tkn+1; if ($tknarr[$tkn] ne "" and $tknarr[$tkn] ne ""){ #print "$tkn\t$tknarr[$tkn]\tunk\n"; $sent_body = $sent_body."$tknno\t$tknarr[$tkn]\tunk\n"; } } #print "<\/Sentence>\n"; $sent_tail = "<\/Sentence>\n\n"; $final_sent = $final_sent.$sent_head.$sent_body.$sent_tail; # print "Final SSF : $final_sent"; $sent_body = ""; } } return $final_sent; } 1;