package ILMT::HIN::PAN::SimpleParser;

# Rule-driven dependency-relation marker for SSF sentences.
#
# At load time preprocess() reads, for every language in @src_langs, the rule
# file and the common feature table into %parser_data.  process() then feeds a
# story through the SSF tree API, runs parser() on each sentence and returns
# whatever printstory() prints, as a string.
#
# The helpers used below (read_story, get_body, get_children, get_field,
# read_FS, get_values, add_attr_val, del_attr_val, make_string, modify_field,
# printstory, ...) are not defined in this file; presumably they are exported
# by the SSFAPI modules imported here -- confirm.

#use strict;
#use warnings;
use Dir::Self;
use Data::Dumper;
use Log::Log4perl;
use ILMT::HIN::PAN::SSFAPI::feature_filter;
use ILMT::HIN::PAN::SSFAPI::shakti_tree_api;

# Log4perl configuration; one property per line.
my $log_conf = "log4perl.rootLogger=OFF,LOGFILE
log4perl.appender.LOGFILE=Log::Log4perl::Appender::File
log4perl.appender.LOGFILE.filename= " . __DIR__ . "/SimpleParser/logs/parser.log
log4perl.appender.LOGFILE.mode=append
log4perl.appender.LOGFILE.layout=PatternLayout
log4perl.appender.LOGFILE.layout.ConversionPattern=%d{dd:MM:yy:hh:mm:ss}[%-5p] %F{1}-%-5L : %m%n
log4perl.appender.Screen.layout = Log::Log4perl::Layout::SimpleLayout
";

my $feature_file = __DIR__ . "/SimpleParser/data/common/features";

# $parser_data{$lang}{rules}      -> array of raw rule lines
# $parser_data{common_features}   -> feature name => second column of the
#                                    features file (compared against "fs" /
#                                    "nonfs" by the matchers below)
my %parser_data;
my @src_langs = ("hin");

# Load the rule file of every source language and the common feature table
# into %parser_data.  Dies if any of the files cannot be opened.
sub preprocess {
    foreach my $src_lang (@src_langs) {
        my $rule_file = __DIR__ . "/SimpleParser/data/$src_lang/rules_new_tagset.txt";
        open(my $rules_fh, '<', $rule_file) or die("can't open $src_lang rulefile");
        my @rules = <$rules_fh>;
        @{$parser_data{$src_lang}{"rules"}} = @rules;
        close($rules_fh);
    }

    open(my $features_fh, '<', $feature_file) or die("can't open $feature_file");
    while (my $ii = <$features_fh>) {
        chomp($ii);
        # Tab-separated: feature name, feature kind.
        my @fs_value = split(/\t/, $ii);
        $parser_data{"common_features"}{$fs_value[0]} = $fs_value[1];
    }
    close($features_fh);
}

# Run the parser over a whole story.
#
# Arguments (key/value list):
#   data     - the story text handed to read_story()
#   src_lang - language key used to look up the rules in %parser_data
#
# Returns the output of printstory() captured into a string.
sub process {
    my %par      = @_;
    my $data     = $par{'data'};
    my $src_lang = $par{'src_lang'};
    my $result;

    read_story(\$data);
    my $numBody = get_bodycount();
    my $body;

    my @rules = @{$parser_data{$src_lang}{"rules"}};
    init_parser($src_lang, \@rules, $parser_data{"common_features"});

    for (my ($bodyNum) = 1; $bodyNum <= $numBody; $bodyNum++) {
        $body = get_body($bodyNum, $body);

        # Count the number of paragraphs in the story
        my ($numPara) = get_paracount($body);

        # Iterate through paragraphs in the story
        for (my ($i) = 1; $i <= $numPara; $i++) {
            my ($para);

            # Read paragraph
            $para = get_para($i);

            # Count the number of sentences in this paragraph
            my ($numSent) = get_sentcount($para);

            for (my ($j) = 1; $j <= $numSent; $j++) {
                # Read the sentence, which is in SSF format
                my ($sent) = get_sent($para, $j);
                parser($sent);
            }
        }
    }

    # printstory() writes to the currently selected handle, so temporarily
    # select an in-memory handle, then restore whatever was selected before.
    open(my $out_fh, '>', \$result) or die $!;
    my $previous_fh = select($out_fh);
    printstory();
    select($previous_fh);
    close($out_fh);

    return $result;
}

# File-scoped state shared by the subs defined below.  Several of these are
# not referenced anywhere in this file; they are kept so the set of declared
# names is unchanged.
my ($i, $vi, @split_rule3, $size, $line6, %hash_tam, @lines6, $lines2, $x,
    @leaves, $flag4, $string15, $new_fs, $l, $flag3, $lines1, $file1, $ch,
    $f3, $k, $f4, @tree, @split_rule, $string4, @array2, $start_bound,
    $end_bound, $f5, $vibakti, $line, $karaka, %hash, $string, $filename,
    $pos, $file, $file_output, @lines, @string2, @get_vib, @array4, $string3,
    $name, @split_arr1, $string_ref1, @array5, $karaka1, $vibakti1, $string1,
    @array7, %fs_hash, @verb_k7, @split_tam, @split_dep, @split_drel,
    @split_mult, %hash6, $temp11, @verb_name, %mult_hash, %cost_hash,
    @split_cost, @split_vfn, @name_vfn, $string_name, $string_name_ref,
    @temp_name_vfn, %hash_k7, $lines_k7, $line_k7, %hash_k2, @rules, $log,
    $sent, $lang_data, $common_data);

# Set up the shared parser state.
#
#   $_[0] - source language key (selects the per-language data directory)
#   $_[1] - array ref of rule lines
#   $_[2] - hash ref: feature name => feature kind
#
# Also (re)initialises Log4perl from $log_conf and fetches the logger.
sub init_parser {
    my $slang = $_[0];
    @rules   = @{$_[1]};
    %fs_hash = %{$_[2]};
    $lang_data   = __DIR__ . "/SimpleParser/data/$slang";
    $common_data = __DIR__ . "/SimpleParser/data/common";
    Log::Log4perl->init(\$log_conf);
    $log = Log::Log4perl->get_logger;
}

# Check the modifier (child) constraints of a rule against a feature structure.
#
#   $_[0] - constraint string: "name=value" items joined by "&&"; a value may
#           hold "|"-separated alternatives
#   $_[1] - hash ref: feature name => "fs" or "nonfs"
#   $_[2] - feature structure reference of the candidate node
#   $_[3] - sentence handle (stored in the shared $sent)
#
# Returns 1 when the constraints are satisfied, 0 as soon as the flag is still
# 0 after evaluating a constraint.
sub match_child {
    my $temp2     = $_[0];
    my %temp_hash = %{$_[1]};
    my $temp_ref  = $_[2];
    my @hist      = split(/&&/, $temp2);
    $sent = $_[3];
    my $y;
    my $y1;
    my $size_mdfr = @hist;
    my $m1;
    my $vib_flag   = 0;
    my $count_hist = 0;

    foreach $m1 (@hist) {
        $count_hist++;
        my @mdfr_const = split(/=/, $m1);

        if ($temp_hash{$mdfr_const[0]} eq "fs") {
            # Feature constraint: any of the "|"-separated alternatives may match.
            my @hist1 = split(/\|/, $mdfr_const[1]);
            my $ite;
            foreach $ite (@hist1) {
                my @get_vib1 = get_values($mdfr_const[0], $temp_ref, $sent);
                my $vibhakti = $get_vib1[0];
                if ( ( ($get_vib1[0] =~ /.*$ite$/) && ($mdfr_const[0] eq "vib") )
                    || ( ($ite eq "NULL") && ($get_vib1[0] eq "") ) )
                {
                    $vib_flag = 1;
                }
                elsif ( ($get_vib1[0] =~/.*$ite.*/) && ($mdfr_const[0] eq "tam") ) {
                    $vib_flag = 1;
                }
                elsif ( ( ($get_vib1[0] =~/$ite/) && ($mdfr_const[0] ne "vib") && ($mdfr_const[0] ne "tam") )
                    || ( ($ite eq "NULL") && ($get_vib1[0] eq "") ) )
                {
                    $vib_flag = 1;
                }
            }
        }
        elsif ($temp_hash{$mdfr_const[0]} eq "nonfs") {
            # List constraint: the value names a file under the language data
            # directory; match when a line equals the node's "lex" value or is
            # matched (as a pattern) by its "vib" value.
            $vib_flag = 0;
            my @list     = get_values("lex", $temp_ref, $sent);
            my @list_vib = get_values("vib", $temp_ref, $sent);
            # NOTE(review): this overwrites the shared @leaves, which parser()
            # is iterating over when it calls mark_reln(); kept as-is.
            @leaves = get_children($i, $sent);
            my $flag = 1;
            $mdfr_const[1] = "$lang_data/" . "$mdfr_const[1]";
            open(my $list_fh, '<', $mdfr_const[1]) or die("can't open the file $mdfr_const[1]");
            # The path already carries the $lang_data prefix.
            $log->info("Reading $mdfr_const[1]");
            while ($lines2 = <$list_fh>) {
                chomp($lines2);
                if ( ($lines2 eq $list[0]) || ($list_vib[0] =~/.*$lines2/) ) {
                    #print "*******\n";
                    #print $lines2,"\n";
                    $vib_flag = 1;
                }
            }
            close($list_fh);
        }

        if ($vib_flag == 0) {
            return ($vib_flag);
        }
    }
    return ($vib_flag);
}

# Apply the current rule (shared @split_rule and the @split_* arrays filled in
# by parser()) to every modifier/head pair of the sentence and write the
# resulting relation attribute into the modifier's feature structure.
#
#   $_[0] - array ref of the sentence's top-level nodes
#   $_[1] - array ref of the rule's split cost field; element 1 is the cost
#   $_[2] - sentence handle (stored in the shared $sent)
#
# Reads $start_bound / $end_bound and updates %mult_hash / %cost_hash.
sub mark_reln {
    my @temp_drel1;
    my @mod_tree  = @{$_[0]};
    my @temp_cost = @{$_[1]};
    $sent = $_[2];
    my @cost = $temp_cost[1];
    my $s    = -1;
    $flag4 = 0;
    my $var = 0;
    my $i;

    foreach $i (@mod_tree) {
        my $flag = 1;
        $s++;
        my $string   = get_field($i, 4, $sent);
        my $string5  = get_field($i, 3, $sent);
        my @modifier = split(/\|/, $split_rule[2]);
        my $i1;
        my $flag_mod = 0;

        # Does this node's category match one of the rule's modifier categories?
        foreach $i1 (@modifier) {
            if ($string5 =~ /$i1/) {
                $flag_mod = 1;
            }
        }

        if ($flag_mod == 1) {
            if ( ( ($s>=$start_bound) && ($s<$end_bound) )
                || ( ($split_rule[0] eq "NP") || ($split_rule[2] =~ /\bVG/) ) )
            {
                my $string_fs1 = read_FS($string);
                my $ret_flag = match_child($split_rule[3], \%fs_hash, $string_fs1, $sent);
                @string2 = get_values("name", $string_fs1, $sent);
                if ($string2[0]=~/\"/) {
                    $string2[0]=~s/\"//g;
                    #print"hello\n";
                }
                @array4 = split(/:/, $string2[0]);
                chomp(@split_rule);
                my $temp = 1;

                if ($ret_flag == 1) {
                    my $p = 0;

                    # Look for a head candidate after the modifier.
                    foreach $k (@mod_tree) {
                        $p++;
                        $string4 = get_field($k, 3, $sent);
                        if ( ($string4 =~ /$split_rule[0]/)
                            && ( ( ($end_bound>=$p) && ($p>$start_bound) )
                                || ( ($split_rule[0] eq "NP") || ($split_rule[2] =~ /\bVG/) ) )
                            && ($s<$p) )
                        {
                            my $string5     = get_field($k, 4, $sent);
                            my $string_ref5 = read_FS($string5, $sent);
                            my @drel_verb;

                            # $flag is cleared once a relation has been
                            # written for this modifier.
                            if ($flag == 1) {
                                #elsif($split_vfn[1] ne "vfn")
                                # {
                                @array5 = get_values("name", $string_ref5, $sent);
                                if ($array5[0]=~/\"/) {
                                    $array5[0]=~s/\"//g;
                                    #print"name--$array5[0]\n";
                                }
                                @drel_verb = "$split_drel[1]_$array5[0]";
                                # }

                                if ($split_rule[0] =~ /VG/ && $split_vfn[1] eq "vfn") {
                                    my $string_cat = get_field($k, 3, $sent);
                                    if ($string_cat =~ /VG/) {
                                        $string_name     = get_field($k, 4, $sent);
                                        $string_name_ref = read_FS($string_name, $sent);
                                        @temp_name_vfn   = get_values("name", $string_name_ref, $sent);
                                        if ($temp_name_vfn[0]=~/\"/) {
                                            $temp_name_vfn[0]=~s/\"//g;
                                            #print"name---$temp_name_vfn[0]\n";
                                        }
                                        $name_vfn[0] = $temp_name_vfn[0];
                                        @drel_verb = "$split_drel[1]_$end_bound";
                                    }
                                }

                                my $flag_parent = match_parent($split_rule[1], $string_ref5, \%fs_hash, $sent);
                                if ($flag_parent == 1) {
                                    my @var1;
                                    my $myflag   = 0;
                                    my $flag_dep = 0;

                                    if ($split_dep[1] eq "X") {
                                        $flag_dep = 1;
                                    }
                                    else {
                                        if ($string =~ /$split_drel[0]/) {
                                            my @temp_drel = get_values($split_drel[0], $string_fs1, $sent);
                                            #print"drel==$temp_drel[0]\n";
                                            if ($temp_drel[0]=~/\"/) {
                                                $temp_drel[0]=~s/\"//g;
                                                #print"drel==$temp_drel[0]\n";
                                            }
                                            @temp_drel1 = split(/:/, $temp_drel[0]);
                                            if ($temp_drel1[0] eq "") {
                                                $myflag = 1;
                                            }
                                        }
                                        else {
                                            $myflag = 1;
                                        }

                                        if ($myflag == 1) {
                                            my @dep_verb;
                                            if ($drel_verb[0] =~ /$split_drel[1]/) {
                                                my @temp_dep_name = split(/__/, $drel_verb[0]);
                                                my @temp_dep_reln = "$split_dep[1]_$end_bound";
                                                if ( ($mult_hash{$temp_dep_reln[0]} eq "1")
                                                    || ($mult_hash{$temp_dep_reln[0]} eq ">1") )
                                                {
                                                    $flag_dep = 1;
                                                }
                                            }
                                        }
                                    }

                                    if ( ($array5[0] ne $string2[0] && $end_bound ne $string2[0] ) && $flag_dep==1) {
                                        if (($mult_hash{$drel_verb[0]} eq ">1") || ($mult_hash{$drel_verb[0]} eq "")) {
                                            $flag_dep = 0;
                                            $flag     = 0;

                                            if ($cost[0]==0) {
                                                my @temp_drel2 = get_values($split_drel[0], $string_fs1, $sent);
                                                if ($temp_drel2[0]=~/\"/) {
                                                    $temp_drel2[0]=~s/\"//g;
                                                    #print"drel--$temp_drel2[0]\n";
                                                }
                                                if ($temp_drel2[0] ne "") {
                                                    # Drop the previously assigned relation.
                                                    my @temp_drel3 = split(/:/, $temp_drel2[0]);
                                                    my @temp_verb_drel3;
                                                    @temp_verb_drel3 = "$temp_drel3[0]_$temp_drel3[1]";
                                                    if ($split_vfn[1] eq "vfn") {
                                                        @temp_verb_drel3 = "$temp_drel3[0]_$temp_drel3[1]";
                                                    }
                                                    del_attr_val($split_drel[0], $string_fs1, $sent);
                                                    $mult_hash{$temp_verb_drel3[0]} = "";
                                                }

                                                my @var1;
                                                $var1[0] = "\"$split_drel[1]:$array5[0]\"";
                                                # $var1[0] = "$split_drel[1]:$array5[0]";
                                                if ($split_vfn[1] eq "vfn") {
                                                    $var1[0] = "\"$split_drel[1]:$end_bound\"";
                                                    #$var1[0] = "$split_drel[1]:$end_bound";
                                                }
                                                add_attr_val($split_drel[0], \@var1, $string_fs1, $sent);
                                                $new_fs = make_string($string_fs1, $sent);
                                                modify_field($i, 4, $new_fs, $sent);
                                                #print"drel--$var1[0]\n";
                                                $mult_hash{$drel_verb[0]} = $split_mult[1];
                                            }
                                            elsif ($cost[0]!=0) {
                                                my @cost_string = get_values("name", $string_fs1, $sent);
                                                if ($cost_string[0]=~/\"/) {
                                                    $cost_string[0]=~s/\"//g;
                                                }
                                                if ($cost_hash{$split_drel[1]} eq "") {
                                                    $cost_hash{$split_drel[1]} = "$cost_string[0]:$cost[0]";
                                                }
                                                if ($cost_hash{$split_drel[1]} ne "") {
                                                    my @val_cst = split(/:/, $cost_hash{$split_drel[1]});
                                                    if ($cost[0] >= $val_cst[1]) {
                                                        $cost_hash{$split_drel[1]} = "$cost_string[0]:$cost[0]";
                                                        my @drel1 = get_values($split_drel[0], $string_fs1, $sent);
                                                        if ($drel1[0]=~/"/) {
                                                            $drel1[0]=~s/"//g;
                                                        }
                                                        #my @drel1;
                                                        # $drel1[0] = $split_drel[1];
                                                        if ($drel1[0] ne "") {
                                                            my @drel2 = split(/:/, $drel1[0]);
                                                            my @temp_drel_verb1;
                                                            @temp_drel_verb1 = "$drel2[0]_$drel2[1]";
                                                            if ($split_vfn[1] eq "vfn") {
                                                                @temp_drel_verb1 = "$drel2[0]_$end_bound";
                                                            }
                                                            # NOTE(review): every other call passes three
                                                            # arguments; the repeated $sent looks accidental.
                                                            del_attr_val($split_drel[0], $string_fs1,$sent,$sent);
                                                            $mult_hash{$temp_drel_verb1[0]} = "";
                                                        }

                                                        my @var2;
                                                        $var2[0] = "\"$split_drel[1]:$array5[0]\"";
                                                        if ($split_vfn[1] eq "vfn") {
                                                            $var2[0] = "\"$split_drel[1]:$end_bound\"";
                                                        }
                                                        add_attr_val($split_drel[0], \@var2, $string_fs1, $sent);
                                                        $new_fs = make_string($string_fs1, $sent);
                                                        modify_field($i, 4, $new_fs, $sent);
                                                        $mult_hash{$drel_verb[0]} = $split_mult[1];
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}

# Check the head (parent) constraints of a rule against a feature structure.
#
#   $_[0] - constraint string: "name=value" items joined by "&&"
#   $_[1] - feature structure reference of the candidate head
#   $_[2] - hash ref: feature name => "fs" or "nonfs"
#   $_[3] - sentence handle (stored in the shared $sent)
#
# Returns 1 when the constraints are satisfied, 0 as soon as the flag is still
# 0 after evaluating a constraint.
sub match_parent {
    my $par_flag      = 0;
    my $par_hist      = $_[0];
    my $string_refnce = $_[1];
    my %temp_hash     = %{$_[2]};
    $sent = $_[3];
    my @par_hist1   = split(/&&/, $par_hist);
    my %hash_karaka = ();
    my $ite1;

    foreach $ite1 (@par_hist1) {
        my @mdfd_const = split(/=/, $ite1);
        my $tama;

        if ($temp_hash{$mdfd_const[0]} eq "fs") {
            my @mdfd_const_values = split(/\|/, $mdfd_const[1]);
            my $ite_pipe;
            for $ite_pipe (@mdfd_const_values) {
                #print $ite_pipe,"\n";
                my @array6 = get_values($mdfd_const[0], $string_refnce, $sent);
                #$array6[0]=~s/"//g;
                my $c = $array6[0];
                my @fs_list;
                my $fs_list1;

                if ($ite_pipe ne "X") {
                    # "file__class": when the first part names an existing file
                    # under the language data directory, it is read as a
                    # tab-separated value => class table.
                    @fs_list  = split(/__/, $ite_pipe);
                    $fs_list1 = $fs_list[0];
                    $fs_list[0] = "$lang_data/" . $fs_list[0];
                    if (-e $fs_list[0]) {
                        open(my $tam_fh, '<', $fs_list[0]) or die("can't open the $fs_list[0]");
                        $log->info("Reading $fs_list[0]");
                        @lines6 = <$tam_fh>;
                        close($tam_fh);
                        chomp(@lines6);
                        foreach $line6 (@lines6) {
                            chomp($line6);
                            my @tam = split(/\t/, $line6);
                            $hash_tam{$tam[0]} = $tam[1];
                        }
                        $tama = $hash_tam{$c};
                    }
                }

                if (($tama eq $fs_list[1]) || ($ite_pipe eq "X") || ($fs_list1 eq $c)) {
                    $par_flag = 1;
                }
            }
        }
        elsif ($temp_hash{$mdfd_const[0]} eq "nonfs") {
            $par_flag = 0;
            if ($mdfd_const[0] eq "list") {
                my @par_list = split(/__/, $mdfd_const[1]);
                my @lex = get_values("lex", $string_refnce, $sent);
                $par_list[0] = "$lang_data/" . $par_list[0];

                open(my $k2_fh, '<', $par_list[0]) or die("can't open the $par_list[0]");
                # The path already carries the $lang_data prefix.
                $log->info("Reading $par_list[0]");
                my @lines_k2 = <$k2_fh>;
                close($k2_fh);

                my $line_k2;
                my @verb_k2;
                chomp(@lines_k2);
                foreach $line_k2 (@lines_k2) {
                    chomp($line_k2);
                    @verb_k2 = split(/\t/, $line_k2);
                    $hash_k2{$verb_k2[0]} = $verb_k2[1];
                }
                %hash_karaka = %hash_k2;

                # When the last line read has no second column, treat the file
                # as a plain word list and look for the head's "lex" value.
                if ($verb_k2[1] eq "") {
                    open(my $word_fh, '<', $par_list[0]) or die("can't open the $par_list[0]");
                    $log->info("Reading $par_list[0]");
                    my $ll;
                    while ($ll = <$word_fh>) {
                        chomp($ll);
                        if ( ($ll eq $lex[0]) ) {
                            $par_flag = 1;
                        }
                    }
                    close($word_fh);
                }

                my $lex_value = $hash_karaka{$lex[0]};
                # NOTE(review): the left operand is itself a match against $_,
                # not against $lex_value; this looks unintended but is kept
                # unchanged -- confirm before altering.
                if ((/.*$lex_value/ =~ /.*$par_list[1]/) || ($lex_value eq "trans")) {
                    $par_flag = 1;
                }
            }
        }

        if ($par_flag == 0) {
            return ($par_flag);
        }
    }
    return ($par_flag);
}

# Process one sentence.
#
#   $_[0] - sentence handle (stored in the shared $sent)
#
# Steps: strip name/drel/srel/role/trel from every top-level node, renumber
# the nodes with a quoted 1-based "name", then for every VG/VGF node run all
# rules through mark_reln() with the span bounded by the previous and the
# current verb-group name.
sub parser {
    $temp11 = 0;
    my $j = 0;
    $sent = $_[0];
    @tree = get_children(0, $sent);
    my $k;
    $vi = 0;
    my $var = 0;
    $size = @tree;

    # Pass 1: remove attributes left over from earlier processing.
    foreach $i (@tree) {
        $string = get_field($i, 4, $sent);
        @leaves = get_children($i, $sent);
        my $string_fs = read_FS($string, $sent);
        del_attr_val("name", $string_fs);
        #my @drel_ccof = get_values("drel", $string_fs);
        #my @drel_ccof_core = split(/:/, $drel_ccof[0]);
        #if($drel_ccof_core[0] ne "ccof")
        #{
        del_attr_val("drel", $string_fs, $sent);
        del_attr_val("srel", $string_fs, $sent);
        del_attr_val("role", $string_fs, $sent);
        del_attr_val("trel", $string_fs, $sent);
        $new_fs = make_string($string_fs, $sent);
        modify_field($i, 4, $new_fs, $sent);
        #}
    }

    # Pass 2: name the nodes "1", "2", ... in order.
    foreach $i (@tree) {
        $j = $j + 1;
        my $string    = get_field($i, 4, $sent);
        my $string_fs = read_FS($string, $sent);
        my @var1;
        $var1[0] = "\"$j\"";
        add_attr_val("name", \@var1, $string_fs, $sent);
        $new_fs = make_string($string_fs, $sent);
        modify_field($i, 4, $new_fs, $sent);
    }

    # Pass 3: apply the rules around every verb group.
    my $flag2 = 0;
    foreach $l (@tree) {
        my $string_node = get_field($l, 3, $sent);
        $var++;
        if ($string_node eq "VG" || $string_node =~ /VGF/) {
            my $string14    = get_field($l, 4, $sent);
            my $string_ref3 = read_FS($string14, $sent);
            $flag3  = 1;
            @leaves = get_children($l, $sent);

            foreach $x (@leaves) {
                $string15 = get_field($x, 3, $sent);
                # $flag3 is cleared after the first leaf, so only the first
                # leaf of the group can trigger the rules.
                if ( ( ($string15 eq "VM" || $string15 eq "VFM") || ($var == $size) ) && ($flag3==1) ) {
                    %mult_hash = ();
                    %cost_hash = ();
                    @verb_name = get_values("name", $string_ref3, $sent);
                    $verb_name[0]=~s/\"//g;
                    $start_bound = $temp11;
                    $end_bound   = $verb_name[0];

                    foreach my $r (@rules) {
                        chomp($r);
                        # Tab-separated rule; fields 1 and 4-8 are further
                        # split on "=".
                        @split_rule = split(/\t/, $r);
                        my $flag = 1;
                        @split_tam  = split(/=/, $split_rule[1]);
                        @split_drel = split(/=/, $split_rule[4]);
                        @split_dep  = split(/=/, $split_rule[5]);
                        @split_mult = split(/=/, $split_rule[6]);
                        @split_cost = split(/=/, $split_rule[7]);
                        @split_vfn  = split(/=/, $split_rule[8]);
                        mark_reln(\@tree, \@split_cost, $sent);
                        %hash_k2  = ();
                        %hash_tam = ();
                    }
                    $temp11 = $end_bound;
                }
                $flag3 = 0;
            }
        }
    }

    #foreach my $hash_key(keys %mult_hash)
    # {
    #     print $hash_key, " == ", $mult_hash{$hash_key}, "\n";
    # }
    #print_tree_file($file_output);
    %mult_hash = ();
    %cost_hash = ();
}

# Load rules and features once, when the module is loaded.
preprocess();

1;