#!/usr/bin/perl
#
# simple_parser.pl - rule-based relation marker for sentences in SSF format.
#
# For every sentence of the input story the script
#   1. removes the name/drel/srel/role/trel attributes of each top-level node,
#   2. names each top-level node with its 1-based position, and
#   3. for each verb group (category "VG" or matching /VGF/) applies every rule
#      of the rule file, writing "<relation>:<parent name>" into the attribute
#      named by the rule on each node the rule attaches.
#
# Rule file: one rule per line, tab-separated columns, used below as follows:
#   0  parent category pattern
#   1  parent constraints   ("key=value" joined by "&&", alternatives by "|")
#   2  child category patterns (alternatives joined by "|")
#   3  child constraints    (same syntax as column 1)
#   4  "<attribute>=<relation>"  attribute to write and relation label
#   5  "<key>=<relation|X>"      relation that must already be filled, X = none
#   6  "<key>=<multiplicity>"    value stored for a filled slot ("1" or ">1")
#   7  "<key>=<cost>"            numeric cost, 0 = attach unconditionally
#   8  "<key>=vfn"               if "vfn", attach to the current verb group
#
# Features file ($common_data/features): "key<TAB>type" lines, where type is
# "fs" (constraint on a feature-structure attribute) or "nonfs" (constraint
# checked against a word-list file under $lang_data).

use strict;
use warnings;

# Rule and feature data are loosely structured: absent fields are deliberately
# treated as the empty string / zero throughout this script.
no warnings 'uninitialized';

use Getopt::Long;
use Log::Log4perl;

# Command-line options. Kept as package variables, as in the original script,
# so that anything outside this file that refers to them keeps working.
our ($help, $parser_home, $rulefile, $input, $output, $version, $logFile, $slang);

GetOptions(
    "help!"      => \$help,
    "path=s"     => \$parser_home,
    "rulefile=s" => \$rulefile,
    "input=s"    => \$input,
    "output:s"   => \$output,
    "version"    => \$version,
    "logFile=s"  => \$logFile,
    "sLang=s"    => \$slang,
);

# Report every argument Getopt::Long left over (the original loop exited after
# printing only the first one).
if (@ARGV) {
    print "Unprocessed by Getopt::Long\n";
    print "$_\n" for @ARGV;
    exit(0);
}

my $BANNER = "SIMPLE PARSER - Parser Version 1.1.1\n (14th July 2009 last modified on 21st July 2009)\n\n";

if ($help) {
    print $BANNER;
    print "usage : perl \$PWD/simple_parser.pl --path=\"\$PWD\" --rulefile=\"rules_file\" --logFile=\"log file\" --input=\"input_file\" --sLang=\"Source Language\"\n";
    print "\tOutput is printed to STDOUT, it can be redirected to an output file\n";
    exit(0);
}

if ($version) {
    print $BANNER;
    exit(0);
}

if ($parser_home eq "") {
    print "Please Specify the Paramaters and Paths as defined in --help\n";
    exit(0);
}

# The rule file is optional: without it nodes are only renamed.

if ($slang eq "") {
    print "Please Specify the Source Language\n";
    exit(0);
}

our ($api, $data_src, $lang_data, $common_data);
$api         = "$parser_home/common/API";
$data_src    = "$parser_home/data_src";    # not used in this file
$lang_data   = "$parser_home/../../../data_bin/sl/simple_parser/$slang";
$common_data = "$parser_home/../../../data_bin/sl/simple_parser/common";

# These files provide the tree and feature-structure functions called below
# (get_children, get_field, read_FS, get_values, add_attr_val, ...).
require "$api/shakti_tree_api.pl";
require "$api/feature_filter.pl";

Log::Log4perl->init("$parser_home/log.conf");
our $log = Log::Log4perl->get_logger;

# Returns the log file name: --logFile if given, else "simple_parser.log".
# Not called in this file; presumably referenced from log.conf - confirm.
sub logfile {
    my $myLog = $logFile || "simple_parser.log";
    return $myLog;
}

if ($input eq "") {
    $input = "/dev/stdin";
}

# ---------------------------------------------------------------------------
# State shared between parser(), mark_reln(), match_child() and match_parent()
# ---------------------------------------------------------------------------
my %fs_hash;                      # constraint key => "fs" | "nonfs"
my @rules;                        # chomped lines of the rule file
my (@split_rule,                  # columns of the rule being applied
    @split_drel, @split_dep,      # columns 4, 5, 6 and 8 split on "="
    @split_mult, @split_vfn);
my ($start_bound, $end_bound);    # names of the previous / current verb group
my %mult_hash;                    # "<relation>_<parent>" => multiplicity
my %cost_hash;                    # relation => "<child name>:<cost>"
my %hash_tam;                     # entries of class files read for one rule
my %hash_k2;                      # entries of list files read for one rule
my %lines_of_file;                # path => chomped lines (read cache)

# Fail early if the input cannot be read; read_story() opens it itself.
open(my $probe_fh, '<', $input) or die("can't open $input: $!");
close($probe_fh);
$log->info("Reading Input $input");

# A missing or unreadable rule file is not fatal (as in the original script).
if (defined $rulefile && open(my $rule_fh, '<', $rulefile)) {
    $log->info("Reading Rule File $rulefile");
    @rules = <$rule_fh>;
    close($rule_fh);
    chomp(@rules);
}
else {
    $log->warn("Rule file '$rulefile' could not be opened; no relations will be marked");
}

{
    my $features_file = "$common_data/features";
    open(my $feat_fh, '<', $features_file)
        or die("can't open the features file $features_file: $!");
    $log->info("Reading Features $features_file");
    while (my $feature_line = <$feat_fh>) {
        chomp($feature_line);
        my @fs_value = split(/\t/, $feature_line);
        $fs_hash{ $fs_value[0] } = $fs_value[1];
    }
    close($feat_fh);
}

# Returns the chomped lines of $path, reading the file on first use only.
# Dies if the file cannot be opened.
sub _read_lines {
    my ($path) = @_;
    if (!exists $lines_of_file{$path}) {
        open(my $fh, '<', $path) or die("can't open the file $path: $!");
        $log->info("Reading $path");
        my @lines = <$fh>;
        close($fh);
        chomp(@lines);
        $lines_of_file{$path} = \@lines;
    }
    return @{ $lines_of_file{$path} };
}

# match_child($constraints, \%type_of, $fs_ref, $sent)
#
# Checks the child constraints of a rule against a node's feature structure.
#   $constraints  "key=value" items joined by "&&"; alternatives joined by "|"
#   \%type_of     constraint key => "fs" | "nonfs"
#   $fs_ref       feature structure as returned by read_FS()
#   $sent         sentence tree handle passed through to the API
# Returns 1 if the constraints are accepted, 0 otherwise.
#
# Alternative values are interpolated into regular expressions unescaped, as
# the rule files may rely on that.
sub match_child {
    my ($constraints, $type_of_ref, $fs_ref, $sent) = @_;
    my %type_of = %{$type_of_ref};
    my $matched = 0;

    foreach my $constraint (split(/&&/, $constraints)) {
        my ($attr, $wanted) = split(/=/, $constraint);

        if ($type_of{$attr} eq "fs") {
            # NOTE(review): $matched is not reset here, so once one constraint
            # has matched, later "fs" constraints can no longer fail. Kept as
            # in the original; confirm against the rule set before changing.
            my ($value) = get_values($attr, $fs_ref, $sent);
            foreach my $alt (split(/\|/, $wanted)) {
                my $hit;
                if ($alt eq "NULL" && $value eq "") {
                    $hit = 1;                          # attribute absent/empty
                }
                elsif ($attr eq "vib") {
                    $hit = ($value =~ /.*$alt$/);      # must end with $alt
                }
                elsif ($attr eq "tam") {
                    $hit = ($value =~ /.*$alt.*/);
                }
                else {
                    $hit = ($value =~ /$alt/);
                }
                $matched = 1 if $hit;
            }
        }
        elsif ($type_of{$attr} eq "nonfs") {
            # $wanted names a word list under $lang_data; the node matches if
            # its "lex" equals an entry or its "vib" matches /.*entry/.
            $matched = 0;
            my ($lex) = get_values("lex", $fs_ref, $sent);
            my ($vib) = get_values("vib", $fs_ref, $sent);
            foreach my $entry (_read_lines("$lang_data/$wanted")) {
                if ($entry eq $lex || $vib =~ /.*$entry/) {
                    $matched = 1;
                }
            }
        }

        return 0 if $matched == 0;
    }
    return $matched;
}

# match_parent($constraints, $fs_ref, \%type_of, $sent)
#
# Checks the parent constraints of a rule against a candidate parent's feature
# structure. Same constraint syntax as match_child(); note the different
# argument order. Returns 1 if accepted, 0 otherwise.
#
# For "fs" constraints each alternative is "X" (always accepted), a plain
# value, or "<file>__<class>" where <file> under $lang_data maps attribute
# values to classes ("value<TAB>class" lines).
# For the "nonfs" constraint "list" the value is "<file>" or "<file>__<class>".
sub match_parent {
    my ($constraints, $fs_ref, $type_of_ref, $sent) = @_;
    my %type_of  = %{$type_of_ref};
    my $par_flag = 0;

    foreach my $constraint (split(/&&/, $constraints)) {
        my ($attr, $wanted) = split(/=/, $constraint);

        if ($type_of{$attr} eq "fs") {
            # NOTE(review): as in match_child(), $par_flag is not reset for
            # "fs" constraints. Kept as in the original.
            my ($value) = get_values($attr, $fs_ref, $sent);
            my $mapped;    # class of $value; survives across alternatives
            foreach my $alt (split(/\|/, $wanted)) {
                if ($alt eq "X") {
                    $par_flag = 1;
                    next;
                }
                my ($file_or_value, $wanted_class) = split(/__/, $alt);
                my $path = "$lang_data/$file_or_value";
                if (-e $path) {
                    foreach my $entry (_read_lines($path)) {
                        my @tam = split(/\t/, $entry);
                        $hash_tam{ $tam[0] } = $tam[1];
                    }
                    $mapped = $hash_tam{$value};
                }
                # NOTE(review): for a plain value (no "__class", no such file)
                # both $mapped and $wanted_class are undefined and compare
                # equal, so this test always succeeds. Kept as in the
                # original; the plain equality below looks like the intent.
                if ($mapped eq $wanted_class || $file_or_value eq $value) {
                    $par_flag = 1;
                }
            }
        }
        elsif ($type_of{$attr} eq "nonfs") {
            $par_flag = 0;
            if ($attr eq "list") {
                my ($list_file, $wanted_class) = split(/__/, $wanted);
                my ($lex)   = get_values("lex", $fs_ref, $sent);
                my @entries = _read_lines("$lang_data/$list_file");

                my @last_fields;
                foreach my $entry (@entries) {
                    @last_fields = split(/\t/, $entry);
                    $hash_k2{ $last_fields[0] } = $last_fields[1];
                }

                # A file whose last line has no second column is treated as a
                # plain word list: accept if the lexical item is listed.
                if ($last_fields[1] eq "") {
                    foreach my $entry (@entries) {
                        $par_flag = 1 if $entry eq $lex;
                    }
                }

                # The original test here was
                #   (/.*$lex_value/ =~ /.*$par_list[1]/)
                # which matched the ambient $_ and, for ordinary class names,
                # is true exactly when no class was requested. That effective
                # result is kept, without the dependency on $_.
                # NOTE(review): the intent was probably to compare the class
                # of the lexical item with the requested class; as written, a
                # rule without a class accepts every parent.
                my $lex_class = $hash_k2{$lex};
                if ($wanted_class eq "" || $lex_class eq "trans") {
                    $par_flag = 1;
                }
            }
        }

        return 0 if $par_flag == 0;
    }
    return $par_flag;
}

# Writes the relation of the current rule onto $child and records the slot.
#   $child, $child_fs   node and its feature structure (from read_FS)
#   $parent_name        name of the chosen parent
#   $slot               %mult_hash key to mark as filled
#   $free_by_verb       in "vfn" mode, free the old slot keyed by the current
#                       verb group instead of by the old relation's target
#                       (the two call sites differ in this, as they did in
#                       the original code)
sub _attach_child {
    my ($child, $child_fs, $parent_name, $slot, $free_by_verb, $sent) = @_;
    my ($attr, $reln) = @split_drel[0, 1];
    my $is_vfn = ($split_vfn[1] eq "vfn");

    # Drop an existing value of the attribute and free the slot it occupied.
    my ($old) = get_values($attr, $child_fs, $sent);
    $old =~ s/"//g;
    if ($old ne "") {
        my @old_parts = split(/:/, $old);
        my $old_slot =
            ($free_by_verb && $is_vfn)
            ? "$old_parts[0]_$end_bound"
            : "$old_parts[0]_$old_parts[1]";
        del_attr_val($attr, $child_fs, $sent);
        $mult_hash{$old_slot} = "";
    }

    my $target = $is_vfn ? $end_bound : $parent_name;
    my @value  = ("\"${reln}:$target\"");
    add_attr_val($attr, \@value, $child_fs, $sent);
    my $new_fs = make_string($child_fs, $sent);
    modify_field($child, 4, $new_fs, $sent);
    $mult_hash{$slot} = $split_mult[1];
    return;
}

# mark_reln(\@tree, \@split_cost, $sent)
#
# Applies the current rule (file-level @split_rule, @split_drel, @split_dep,
# @split_mult, @split_vfn, $start_bound, $end_bound) to the nodes in @tree.
#   \@tree        top-level nodes of the sentence
#   \@split_cost  column 7 of the rule split on "="; element 1 is the cost
#   $sent         sentence tree handle passed through to the API
#
# For each node whose category matches the rule's child patterns and whose
# features pass match_child(), the following nodes are scanned for a parent
# whose category matches and whose features pass match_parent(). The first
# parent passing the dependency and multiplicity checks ends the search.
sub mark_reln {
    my ($tree_ref, $cost_ref, $sent) = @_;
    my @mod_tree = @{$tree_ref};
    my $cost     = $cost_ref->[1];

    my $parent_cat = $split_rule[0];
    my @child_cats = split(/\|/, $split_rule[2]);

    # NP parents and VG children are matched over the whole sentence instead
    # of the window between the previous and the current verb group.
    my $ignore_bounds = ($parent_cat eq "NP") || ($split_rule[2] =~ /\bVG/);
    my ($attr, $reln) = @split_drel[0, 1];
    my $is_vfn        = ($split_vfn[1] eq "vfn");
    my $needed_slot   = "$split_dep[1]_$end_bound";

    my $child_index = -1;    # 0-based position of the child
  CHILD:
    foreach my $child (@mod_tree) {
        $child_index++;
        my $child_fs_string = get_field($child, 4, $sent);
        my $child_cat       = get_field($child, 3, $sent);

        my $cat_ok = 0;
        foreach my $pattern (@child_cats) {
            $cat_ok = 1 if $child_cat =~ /$pattern/;
        }
        next CHILD unless $cat_ok;
        next CHILD
            unless $ignore_bounds
            || ($child_index >= $start_bound && $child_index < $end_bound);

        my $child_fs = read_FS($child_fs_string, $sent);
        next CHILD
            unless match_child($split_rule[3], \%fs_hash, $child_fs, $sent) == 1;

        my ($child_name) = get_values("name", $child_fs, $sent);
        $child_name =~ s/"//g;

        my $parent_pos = 0;    # 1-based position of the candidate parent
      PARENT:
        foreach my $parent (@mod_tree) {
            $parent_pos++;
            my $cat = get_field($parent, 3, $sent);
            next PARENT unless $cat =~ /$parent_cat/;
            next PARENT
                unless $ignore_bounds
                || ($end_bound >= $parent_pos && $parent_pos > $start_bound);
            next PARENT unless $child_index < $parent_pos;

            my $parent_fs_string = get_field($parent, 4, $sent);
            my $parent_fs        = read_FS($parent_fs_string, $sent);
            my ($parent_name)    = get_values("name", $parent_fs, $sent);
            $parent_name =~ s/"//g;

            # Slot occupied by this relation: keyed by the parent, or by the
            # current verb group in "vfn" mode with a VG parent.
            my $slot = "${reln}_$parent_name";
            if ($is_vfn && $parent_cat =~ /VG/ && $cat =~ /VG/) {
                $slot = "${reln}_$end_bound";
            }

            next PARENT
                unless match_parent($split_rule[1], $parent_fs, \%fs_hash, $sent) == 1;

            # Dependency check: either the rule needs none ("X"), or the child
            # has no value for the attribute yet and the required relation is
            # already filled for the current verb group.
            my $dep_ok = 0;
            if ($split_dep[1] eq "X") {
                $dep_ok = 1;
            }
            else {
                my $child_is_free = 1;
                if ($child_fs_string =~ /$attr/) {
                    my ($current) = get_values($attr, $child_fs, $sent);
                    $current =~ s/"//g;
                    my @parts = split(/:/, $current);
                    $child_is_free = ($parts[0] eq "");
                }
                if ($child_is_free && $slot =~ /$reln/) {
                    if (   $mult_hash{$needed_slot} eq "1"
                        || $mult_hash{$needed_slot} eq ">1")
                    {
                        $dep_ok = 1;
                    }
                }
            }
            next PARENT unless $dep_ok;

            # A node is attached neither to itself nor, being the current
            # verb group, at all.
            next PARENT
                unless $parent_name ne $child_name && $end_bound ne $child_name;

            # The slot must be free or allow several fillers.
            next PARENT
                unless $mult_hash{$slot} eq ">1" || $mult_hash{$slot} eq "";

            if ($cost == 0) {
                _attach_child($child, $child_fs, $parent_name, $slot, 0, $sent);
            }
            else {
                # Remember the best cost seen per relation; attach only if
                # this rule's cost is at least that.
                if ($cost_hash{$reln} eq "") {
                    $cost_hash{$reln} = "${child_name}:$cost";
                }
                if ($cost_hash{$reln} ne "") {
                    my @best = split(/:/, $cost_hash{$reln});
                    if ($cost >= $best[1]) {
                        $cost_hash{$reln} = "${child_name}:$cost";
                        _attach_child($child, $child_fs, $parent_name, $slot, 1, $sent);
                    }
                }
            }

            # The first eligible parent ends the search for this child, even
            # when the cost test rejected the attachment (as in the original).
            last PARENT;
        }
    }
    return;
}

# parser($sent)
#
# Processes one sentence tree in place: clears old relation attributes, names
# the top-level nodes "1", "2", ... and applies all rules for each verb group.
sub parser {
    my ($sent) = @_;
    my @tree = get_children(0, $sent);
    my $size = @tree;

    # Pass 1: remove names and relations left from earlier processing.
    foreach my $node (@tree) {
        my $fs_string = get_field($node, 4, $sent);
        my $fs_ref    = read_FS($fs_string, $sent);
        foreach my $attr (qw(name drel srel role trel)) {
            del_attr_val($attr, $fs_ref, $sent);
        }
        my $new_fs = make_string($fs_ref, $sent);
        modify_field($node, 4, $new_fs, $sent);
    }

    # Pass 2: name every top-level node with its 1-based position.
    my $position = 0;
    foreach my $node (@tree) {
        $position++;
        my $fs_string = get_field($node, 4, $sent);
        my $fs_ref    = read_FS($fs_string, $sent);
        my @name      = ("\"$position\"");
        add_attr_val("name", \@name, $fs_ref, $sent);
        my $new_fs = make_string($fs_ref, $sent);
        modify_field($node, 4, $new_fs, $sent);
    }

    # Pass 3: apply the rules once per verb group.
    my $prev_verb_bound = 0;
    my $seen            = 0;
    foreach my $node (@tree) {
        my $category = get_field($node, 3, $sent);
        $seen++;
        next unless $category eq "VG" || $category =~ /VGF/;

        my $fs_string = get_field($node, 4, $sent);
        my $verb_fs   = read_FS($fs_string, $sent);

        # NOTE(review): as in the original, only the first leaf of the verb
        # group decides whether the rules are applied (the original loop
        # cleared its flag after the first iteration).
        my @leaves = get_children($node, $sent);
        next unless @leaves;
        my $first_leaf_cat = get_field($leaves[0], 3, $sent);
        next
            unless $first_leaf_cat eq "VM"
            || $first_leaf_cat eq "VFM"
            || $seen == $size;

        %mult_hash = ();
        %cost_hash = ();
        my ($verb_name) = get_values("name", $verb_fs, $sent);
        $verb_name =~ s/"//g;
        $start_bound = $prev_verb_bound;
        $end_bound   = $verb_name;

        foreach my $rule (@rules) {
            @split_rule = split(/\t/, $rule);
            @split_drel = split(/=/,  $split_rule[4]);
            @split_dep  = split(/=/,  $split_rule[5]);
            @split_mult = split(/=/,  $split_rule[6]);
            @split_vfn  = split(/=/,  $split_rule[8]);
            my @split_cost = split(/=/, $split_rule[7]);

            mark_reln(\@tree, \@split_cost, $sent);

            # List and class tables are accumulated per rule only.
            %hash_k2  = ();
            %hash_tam = ();
        }
        $prev_verb_bound = $end_bound;
    }

    %mult_hash = ();
    %cost_hash = ();
    return;
}

# ---------------------------------------------------------------------------
# Main: read the story, parse every sentence, print the result.
# ---------------------------------------------------------------------------
read_story($input);

my $numBody = get_bodycount();
my $body;
for my $bodyNum (1 .. $numBody) {
    $body = get_body($bodyNum, $body);

    # Count the number of paragraphs in the story
    my ($numPara) = get_paracount($body);

    for my $paraNum (1 .. $numPara) {
        my $para = get_para($paraNum);

        # Count the number of sentences in this paragraph
        my ($numSent) = get_sentcount($para);

        for my $sentNum (1 .. $numSent) {
            # Read the sentence, which is in SSF format
            my ($sent) = get_sent($para, $sentNum);
            parser($sent);
        }
    }
}

if ($output ne "") {
    printstory_file("$output");
}
else {
    printstory();
}