# VibhaktiSplitter.pm (8.96 KB) -- committed by priyank
# NOTE: the text originally captured here ("Newer Older", the avatar caption
# and the 1..237 line-number gutter) was repository-viewer residue, not Perl.
package ILMT::PAN::HIN::VibhaktiSplitter;
#use strict;
#use warnings;
use Dir::Self;
use Data::Dumper;
use ILMT::PAN::HIN::SSFAPI::feature_filter;
use ILMT::PAN::HIN::SSFAPI::shakti_tree_api;

# Path of the function-word table, built from this module's directory
# (__DIR__ comes from Dir::Self, imported above).
my $functionfile = __DIR__ . "/VibhaktiSplitter/data/functionwords_hin.txt";
# Lookup table filled by preprocess(): first whitespace-separated field of each
# table line => second field.  process() matches the value against /PSP/ and
# /NST/ to choose the POS tag of a newly added leaf.
my %rule_hash = ();

# Load the function-word table into %rule_hash.
#
# Reads $functionfile one line at a time.  Blank / whitespace-only lines are
# skipped; every other line is chomped and split on runs of whitespace, and
# its first field becomes a key of %rule_hash with the second field as value
# (process() later matches that value against /PSP/ and /NST/).
#
# Takes no arguments and returns nothing meaningful.
# Dies if $functionfile cannot be opened.
sub preprocess {
    # Three-argument open with a lexical handle: the path is never parsed for
    # a mode prefix, and the handle cannot clash with another package's RULE.
    open(my $rule_fh, '<', $functionfile)
        or die "Couldn't open $functionfile for reading: $!\n";

    while (my $rule = <$rule_fh>) {
        next if $rule =~ /^\s*$/;

        chomp($rule);
        # Only the first two fields are used; anything after them is ignored.
        my ($fea, $array_in) = split(/\s+/, $rule);
        $rule_hash{$fea} = $array_in;
    }

    close($rule_fh);
    return;
}

# Return every node of $sent that get_nodes() reports for the chunk tags
# NP, JJP, RBP, VGF, VNN, VGINF, VGNF and VGNN, grouped by tag in that order.
# (Field 3 is presumably the chunk-tag column of the SSF tree -- confirm
# against the SSF API.)
sub _collect_chunk_nodes {
    my ($sent) = @_;

    my @nodes;
    for my $tag (qw(NP JJP RBP VGF VNN VGINF VGNF VGNN)) {
        push @nodes, &get_nodes(3, $tag, $sent);
    }
    return @nodes;
}

# Given the list of position tokens taken from a chunk's "vpos" value, keep
# the first token that contains "vib" or "tam", drop every later token that
# contains either, and return the remaining tokens in their original order.
#
# The previous in-place implementation used splice(@new_posis, $p1, $p1),
# which removed $p1 elements instead of one and so also discarded the
# positions that followed a duplicate marker at index 2 or higher.
sub _drop_extra_markers {
    my @positions = @_;

    my $seen_marker = 0;
    my @kept;
    for my $position (@positions) {
        if ($position =~ /vib|tam/) {
            next if $seen_marker;
            $seen_marker = 1;
        }
        push @kept, $position;
    }
    return @kept;
}

# Split the "vib" value of one chunk ($node in $sent) into its "_"-separated
# words and distribute them according to the chunk's "vpos" value:
#   * a word whose position token contains "vib"/"tam" (or the first word when
#     no token contains either) replaces the "vib" attribute of the node at
#     $node + <digits of the token>;
#   * every other word is added as a new leaf at $node + <digits of the
#     token>, or right after the chunk's current children when the word has
#     no position token.
# Returns nothing; the tree behind $sent is modified through the SSF API.
sub _split_chunk_vibhakti {
    my ($node, $sent) = @_;

    # Field 4 is read as the feature-structure string of the node.
    my $chunk_fs     = &get_field($node, 4, $sent);
    my $chunk_fs_ref = &read_FS($chunk_fs, $sent);
    my @vibs         = &get_values("vib",  $chunk_fs_ref, $sent);
    my @vposs        = &get_values("vpos", $chunk_fs_ref, $sent);

    my $vpos = defined $vposs[0] ? $vposs[0] : '';
    $vpos =~ s/"//g;
    $vpos =~ s/RP_//g;
    $vpos =~ s/RP$//g;

    my @words = defined $vibs[0] ? split(/_/, $vibs[0]) : ();

    # Remove surplus vib/tam markers, e.g. for a vpos such as "vib_4_tam".
    # NOTE(review): the original loop was additionally guarded by
    # `$num_posis != $new_words`.  $new_words is never assigned anywhere in
    # this file, so the guard compared against undef (0) and never blocked
    # anything.  It may have been meant as $num_words (dedupe only when the
    # counts differ); the long-standing always-dedupe behaviour is kept here
    # -- confirm the intent before changing it.
    my @positions  = _drop_extra_markers(split(/_/, $vpos));
    my $has_marker = scalar(grep { /vib|tam/ } @positions);

    for my $word_idx (0 .. $#words) {
        my $word     = $words[$word_idx];
        # There may be fewer position tokens than words.
        my $position = defined $positions[$word_idx] ? $positions[$word_idx] : '';

        if ($position =~ /vib|tam/ or ($word_idx == 0 and not $has_marker)) {
            # The word belongs to an existing node: overwrite its "vib".
            (my $offset = $position) =~ s/[A-Za-z]//g;
            my $target = $offset + $node;

            my @vib_value     = ($word);
            my $target_fs     = &get_field($target, 4, $sent);
            my $target_fs_ref = &read_FS($target_fs, $sent);
            &update_attr_val("vib", \@vib_value, $target_fs_ref, $sent);
            my $fs_string = &make_string($target_fs_ref, $sent);
            &modify_field($target, 4, $fs_string, $sent);
            next;
        }

        # The word becomes a new leaf.  Without a position token it goes
        # right after the chunk's current children; the child count is taken
        # now because earlier words may already have added leaves.
        if ($position eq "") {
            my @children = &get_children($node, $sent);
            $position = @children + 1;
        }
        my $is_negation = ($position =~ /NEG/);
        (my $offset = $position) =~ s/[A-Za-z]//g;
        my $target = $offset + $node;

        # A word has the form "root+tam"; "hE" and "WA" are their own tam.
        my ($root, $tam) = split(/\+/, $word);
        $root = '' unless defined $root;
        $tam  = '' unless defined $tam;
        $tam  = $root if $root eq "hE" or $root eq "WA";

        my $rule = defined $rule_hash{$root} ? $rule_hash{$root} : '';
        my ($pos_tag, $category);
        if ($is_negation) {
            ($pos_tag, $category) = ("NEG", "adv");
        }
        elsif ($rule =~ /PSP/) {
            ($pos_tag, $category) = ("PSP", "psp");
        }
        elsif ($rule =~ /NST/) {
            ($pos_tag, $category) = ("NST", "psp");
        }
        else {
            ($pos_tag, $category) = ("VAUX", "v");
        }

        my $leaf_fs = "<fs af='$root,$category,,,,,$tam,'>";
        &add_leaf($target, 0, $word, $pos_tag, $leaf_fs, $sent);
    }
    return;
}

# Split computed vibhakti values back into separate nodes.
#
# Arguments: a key/value list; only the value of the key 'data' (the input
# text handed to read_story()) is used.
#
# The text is UTF-8 encoded and loaded with read_story().  For every sentence
# of every paragraph of every body, each NP/JJP/RBP/VGF/VNN/VGINF/VGNF/VGNN
# chunk is processed by _split_chunk_vibhakti().  The story is then printed
# with printstory() into an in-memory buffer.
#
# Returns the printed story, UTF-8 decoded.
# Dies if the in-memory output handle cannot be opened.
#
# The `&name(...)` call form of the SSF API functions is kept as it was in
# the original code; whether those functions declare prototypes is not
# visible from this file.
sub process {
    my %par   = @_;
    my $input = $par{'data'};
    utf8::encode($input);
    read_story(\$input);

    my $body;
    my $numBody = &get_bodycount();
    for my $bodyNum (1 .. $numBody) {
        $body = &get_body($bodyNum, $body);

        my ($numPara) = &get_paracount($body);
        for my $paraNum (1 .. $numPara) {
            my $para = &get_para($paraNum);

            my ($numSent) = &get_sentcount($para);
            for my $sentNum (1 .. $numSent) {
                my ($sent) = &get_sent($para, $sentNum);

                my @chunk_nodes = _collect_chunk_nodes($sent);
                my $num_chunks  = @chunk_nodes;
                for (my $chunk_idx = 0; $chunk_idx < $num_chunks; $chunk_idx++) {
                    # Re-query on every pass: leaves added for the previous
                    # chunk change the tree, so the node values fetched
                    # earlier cannot be reused.
                    @chunk_nodes = _collect_chunk_nodes($sent);
                    $num_chunks  = @chunk_nodes;

                    _split_chunk_vibhakti($chunk_nodes[$chunk_idx], $sent);
                }
            }
        }
    }

    # printstory() writes to the currently selected handle, so point that at
    # an in-memory buffer, then restore whatever was selected before (which
    # is not necessarily STDOUT) and close the buffer.
    my $result = '';
    open(my $out_fh, '>', \$result) or die $!;
    my $previous_fh = select($out_fh);
    printstory();
    select($previous_fh);
    close($out_fh);

    utf8::decode($result);
    return $result;
}

# Fill %rule_hash from the function-word table once, when the module is loaded.
preprocess();

# A module file must end with a true value for use/require to succeed.
1;