convert_biotossf.pl 1.3 KB
Newer Older
priyank's avatar
priyank committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
#! /usr/bin/perl

use strict;
use warnings;

#	Report Bugs to prashanth@research.iiit.ac.in
#
#	Usage : perl convert-BIOtoSSF.pl < bio.txt > ssf.txt
#
#	Reads whitespace-separated lines of at least four columns from standard
#	input; the fourth column is a BIO chunk tag (B-XX, I-XX or O).  A blank
#	line ends a sentence.  Writes <Sentence> blocks to standard output in
#	which every chunk is bracketed by "N ((  XX" ... "))" lines and every
#	word is printed as "chunk.word <col1> <col2> <col3>".  Chunk-tag
#	inconsistencies are reported on standard error.
#

# convert_bio_to_ssf($in, $out, $err)
#
#   $in   - filehandle to read BIO lines from   (defaults to STDIN)
#   $out  - filehandle to write SSF output to   (defaults to STDOUT)
#   $err  - filehandle for diagnostic messages  (defaults to STDERR)
#
# Returns the number of diagnostics written to $err.
sub convert_bio_to_ssf {
    my ($in, $out, $err) = @_;
    $in  = \*STDIN  unless defined $in;
    $out = \*STDOUT unless defined $out;
    $err = \*STDERR unless defined $err;

    my $lno        = 0;     # input line number, used in diagnostics
    my $sno        = 1;     # id of the next sentence to be opened
    # NOTE(review): chunk numbers keep increasing across sentences, as in the
    # original script -- confirm whether they should restart per sentence.
    my $cno        = 0;
    my $wno        = 1;     # word index within the current chunk
    my $prevCTag   = "";    # chunk type of the last B- tag seen, or "O"
    my $inSentence = 0;     # true once <Sentence> has been printed
    my $inChunk    = 0;     # true while a "((" is waiting for its "))"
    my $problems   = 0;

    while (my $line = <$in>) {
        $lno++;

        if ($line =~ /^\s*$/) {
            # A blank line ends the current sentence.  Leading or repeated
            # blank lines have nothing to close and are ignored, so no stray
            # "))" / </Sentence> is written and no sentence id is skipped.
            next unless $inSentence;

            print {$out} "\t))\t\t\n" if $inChunk;
            print {$out} "</Sentence>\n\n";
            $inSentence = 0;
            $inChunk    = 0;
            $wno        = 1;
            $prevCTag   = "";
            $sno++;
            next;
        }

        if (!$inSentence) {
            print {$out} "<Sentence id=\"$sno\">\n";
            $inSentence = 1;
        }

        chomp($line);

        # split ' ' skips leading whitespace, so an indented line does not
        # shift every column one place to the right.
        my @cols = split ' ', $line;
        if (@cols < 4) {
            print {$err} "Line no:$lno has fewer than 4 columns; missing fields are left empty\n";
            $problems++;
        }
        my ($word, $pos, $feat, $tag) =
            map { defined $_ ? $_ : '' } @cols[0 .. 3];

        if ($tag =~ /^B-(\w+)/) {
            # A new chunk begins: close the one still open, then open this one.
            my $ctag = $1;
            print {$out} "\t))\t\t\n" if $inChunk;
            $cno++;
            print {$out} "$cno\t((\t$ctag\t\n";
            $wno      = 1;
            $inChunk  = 1;
            $prevCTag = $ctag;
        }
        elsif ($tag =~ /^O/) {
            # A word outside any chunk: close the chunk that is still open.
            if ($inChunk) {
                print {$out} "\t))\t\t\n";
                # Kept from the original numbering: closing a chunk in front
                # of an outside word skips one word index.
                $wno++;
                $inChunk = 0;
            }
            $prevCTag = "O";
        }
        elsif ($tag =~ /^I-(\w+)/) {
            # An I- tag must continue the chunk type of the preceding word;
            # the word is still printed, but the mismatch is reported.
            my $ctag = $1;
            if ($ctag ne $prevCTag) {
                print {$err} "Inconsistency of Chunk tag in I-$ctag at Line no:$lno : There is no B-$ctag to the prev. word\n";
                $problems++;
            }
        }

        $feat =~ s/___/ /g;     # "___" in the third column stands for a space
        print {$out} "$cno.$wno\t$word\t$pos\t$feat\n";
        $wno++;
    }

    # Input that does not end with a blank line still has an open sentence.
    if ($inSentence) {
        print {$out} "\t))\t\t\n" if $inChunk;
        print {$out} "</Sentence>\n\n";
    }

    return $problems;
}

# Run as a filter only when executed directly, not when loaded by other code.
convert_bio_to_ssf(\*STDIN, \*STDOUT, \*STDERR) unless caller;

1;