#! /usr/bin/perl # This program takes as its argument the name of a file containing data in SSF format and outputs the data in BIO format. << '#'; Example of SSF and BIO format: SSF FORMAT: 0 (( SSF 1 (( NP 2 ikanuMci NN 3 )) 4 (( NP 5 nenu PRP 6 )) 7 (( NP 8 saBalU NN 9 , SYM 10 )) 11 (( NP 12 saMxarBalU NN 13 )) 14 (( VG 15 mAnukoVni VRB 16 )) 17 (( NP 18 iMtipattuna NN 19 )) 20 (( VG 21 uMdamanI VRB 22 )) 23 (( NP 24 xIni PRP 25 BAvaM NN 26 . SYM 27 )) BIO FORMAT: ikanuMci_NN B-NP nenu_PRP B-NP saBalU_NN B-NP ,_SYM I-NP saMxarBalU_NN B-NP mAnukoVni_VRB B-VG iMtipattuna_NN B-NP uMdamanI_VRB B-VG xIni_PRP B-NP BAvaM_NN I-NP ._SYM I-NP The SSF format and its corresponding BIO format are given above; the TNT format contains two fields: the first is the word and the second is the POS tag. Pseudo code: Read each line; if the line consists only of space characters, then print OUT \n and continue; else split the line on space (or tab), and if the first word starts with < (sometimes in SSF format the sentence ID is given on a line starting with <, so we have to ignore it) or the second word is (( or )) (as seen above, (( and )) carry no information, so we can ignore them), then it is an unwanted line, so ignore it; otherwise print OUT the 2nd word and the 3rd word. # &ssftobio(); sub ssftobio { my $line; $flag=0; while ($line = <>) { chomp($line); if($line =~ /^\s*$/) # if the line has all space charcters { $flag=0; print "\n"; next; } if($line =~/^