ssf2bio.pl 2.43 KB
Newer Older
priyank's avatar
priyank committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
#! /usr/bin/perl


# This program takes as its argument the name of a file containing data in SSF format and prints that data in BIO format.

<< '#';
 Example of SSF and BIO format:

 SSF FORMAT:

<Sentence id="1">
0       ((      SSF
1       ((      NP
2       ikanuMci        NN
3       ))
4       ((      NP
5       nenu    PRP
6       ))
7       ((      NP
8       saBalU  NN
9       ,       SYM
10      ))
11      ((      NP
12      saMxarBalU      NN
13      ))
14      ((      VG
15      mAnukoVni       VRB
16      ))
17      ((      NP
18      iMtipattuna     NN
19      ))
20      ((      VG
21      uMdamanI        VRB
22      ))
23      ((      NP
24      xIni    PRP
25      BAvaM   NN
26      .       SYM
27      ))
</Sentence>

 BIO FORMAT:
 
ikanuMci_NN      B-NP
nenu_PRP     B-NP
saBalU_NN      B-NP
,_SYM     I-NP
saMxarBalU_NN      B-NP
mAnukoVni_VRB     B-VG
iMtipattuna_NN      B-NP
uMdamanI_VRB     B-VG
xIni_PRP     B-NP
BAvaM_NN      I-NP
._SYM     I-NP
  
 
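Given above are a sentence in SSF format and its corresponding BIO format,
i.e., each BIO line contains two fields: the first is the word joined to its POS tag by "_", and the second is the chunk tag prefixed with B- (first word of a chunk) or I- (any later word of the same chunk).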

 pseudo code:
Read each line. If the line consists only of whitespace characters, print "\n" and continue.
If the line starts with < (in SSF format the sentence ID is written as a tag such as <Sentence id="1">), ignore it.
Otherwise split the line on tabs. If the second field is (( then remember the third field as the current chunk tag and note that the next word begins a chunk; no output is produced for this line.
If the second field is )) the line carries no information, so ignore it.
Any other line is a word: print the 2nd field and the 3rd field joined by "_", then a tab, then B-<chunk tag> if this is the first word of the chunk or I-<chunk tag> otherwise.

#

# Entry point: convert the SSF input (files named on the command line, or
# STDIN when none are given) to BIO format on standard output.
ssftobio();
# ssftobio - convert SSF-formatted input to BIO format.
#
# Reads lines via <> (the files named in @ARGV, or STDIN when @ARGV is
# empty) and prints to the currently selected output handle:
#   * a whitespace-only line is echoed as an empty line;
#   * a line starting with '<' (e.g. <Sentence id="1">) is skipped;
#   * "N<TAB>((<TAB>TAG" opens a chunk: TAG is remembered and the next
#     word is marked as the beginning of that chunk; nothing is printed;
#   * "N<TAB>))" closes a chunk and is skipped;
#   * any other line is a word and is printed as
#       WORD_POS<TAB>B-TAG   for the first word after a chunk opener, or
#       WORD_POS<TAB>I-TAG   for every other word.
# Bracket lines whose first field contains "digits.digits" (e.g. 1.2) are
# not treated as chunk boundaries; they fall through to the word branch.
#
# A trailing carriage return is removed from each line so that files with
# CRLF line endings are handled the same way as files with LF endings.
#
# Takes no arguments and returns nothing.
sub ssftobio
{
	use strict;
	use warnings;

	my $at_chunk_start = 0;   # true until the first word after "((" is printed
	my $chunk = '';           # tag of the most recently opened chunk

	while (my $line = <>)
	{
		chomp($line);
		$line =~ s/\r\z//;   # tolerate CRLF line endings

		if ($line =~ /^\s*$/)   # the line has only whitespace characters
		{
			$at_chunk_start = 0;
			print "\n";
			next;
		}

		next if $line =~ /^</;   # sentence tags carry no token information

		# Fields: address, word (or bracket), POS/chunk tag. Any further
		# fields do not influence the output and are ignored.
		my ($addr, $token, $tag) = split(/\t+/, $line);
		for my $field ($addr, $token, $tag)
		{
			# Short lines yield missing fields; treat them as empty strings.
			$field = '' unless defined $field;
		}

		my $plain_address = ($addr !~ /[0-9]+\.[0-9]+/);

		if ($token eq "((" and $plain_address)   # chunk opener
		{
			$at_chunk_start = 1;
			$chunk = $tag;
			next;
		}

		if ($token eq "))" and $plain_address)   # chunk closer
		{
			$at_chunk_start = 0;
			next;
		}

		# A word line: B- for the first word of the chunk, I- afterwards.
		my $position = $at_chunk_start ? "B" : "I";
		print $token, "_", $tag, "\t", $position, "-", $chunk, "\n";
		$at_chunk_start = 0;
	}

	return;
}