Commit 336b31bb authored by pruthwik mishra

Conversion of pos tags from BIS to ILMT

parent 6ae59504
from re import search
from sys import argv
# BIS tags with one fixed ILMT equivalent. They are looked up before the
# pattern rules so that special cases such as 'N_NNV' or 'CC_CCS_UT' win over
# the broader family patterns ('N_(N.*)', 'CC.*') that would also match them.
_EXACT_BIS_TO_ILMT = {
    'N_NNV': 'VM',
    'PR_PRQ': 'WQ',
    'JJ': 'JJ',
    'RB': 'RB',
    'PSP': 'PSP',
    'CC_CCS_UT': 'UT',
    'RP_RPD': 'RP',
    'RD_PUNC': 'SYM',
    'RD_SYM': 'SYM',
    'RD_ECH': 'ECH',
    'RD_RDF': 'FW',
    'RD_UNK': 'UNK',
}

# Last character of a 'QT...' tag -> ILMT quantifier tag.
_QUANTIFIER_BY_SUFFIX = {'F': 'QF', 'C': 'QC', 'O': 'QO'}


def _lookup_ilmt_tag(tag):
    """Return the ILMT tag for an underscore-normalised BIS tag, or '' if no rule applies."""
    if tag in _EXACT_BIS_TO_ILMT:
        return _EXACT_BIS_TO_ILMT[tag]
    # The pattern rules are tried in a fixed order; each pattern is evaluated
    # only once and the match object is reused to extract the captured group.
    noun_match = search('N_(N.*)', tag)
    if noun_match:
        # The ILMT tag is the part after 'N_', e.g. 'N_NNP' -> 'NNP'.
        return noun_match.group(1)
    if search('PR_PR.*', tag):
        return 'PRP'
    if search('DM_.*', tag):
        return 'DEM'
    if search('V_VM.*', tag):
        return 'VM'
    if search('V_VAUX', tag):
        return 'VAUX'
    if search('CC.*', tag):
        return 'CC'
    particle_match = search('RP_(.+)', tag)
    if particle_match:
        # The ILMT tag is the part after 'RP_', e.g. 'RP_NEG' -> 'NEG'.
        return particle_match.group(1)
    if search('QT.*', tag):
        # An unrecognised final character yields '', which the caller treats
        # as "no tag found".
        return _QUANTIFIER_BY_SUFFIX.get(tag[-1], '')
    return ''


def bis_to_ilmt_conversion(bis_pos_tag):
    """Convert a BIS pos tag to ILMT pos tag.

    Double underscores in the input are collapsed to a single underscore
    before the tag is looked up.

    :param bis_pos_tag: BIS pos tag as a string, e.g. 'N_NN' or 'V_VM_VF'.
    :return: the corresponding ILMT tag. When no rule applies, 'BLANK' is
        printed to standard output and 'NN' is returned.
    """
    ilmt_tag = _lookup_ilmt_tag(bis_pos_tag.replace('__', '_'))
    if not ilmt_tag:
        # Keep the existing diagnostic and default for unknown tags.
        print('BLANK')
        ilmt_tag = 'NN'
    return ilmt_tag
def read_lines_from_file(file_path):
    """Return all lines of the UTF-8 text file at file_path, line endings included."""
    with open(file_path, encoding='utf-8') as source:
        # Iterating a text file yields the same lines that readlines() returns.
        return list(source)
def write_lines_into_file(lines, file_path):
    """Write lines to file_path as UTF-8 text, joined by newlines.

    No newline is appended after the last line.
    """
    with open(file_path, mode='w', encoding='utf-8') as sink:
        joined_text = '\n'.join(lines)
        sink.write(joined_text)
def convert_pos_tags_from_bis_to_ilmt(lines):
    """Convert BIS pos tags into ILMT tags.

    Each line is stripped of surrounding whitespace and split on tabs. The
    second column is taken to be the BIS tag and is replaced by the value
    returned from bis_to_ilmt_conversion; every other column is kept as is.

    :param lines: iterable of tab-separated text lines.
    :return: list of converted lines without trailing line endings. Blank
        lines become '' and lines without a tab are passed through stripped.
    """
    converted_lines = []
    for line in lines:
        stripped_line = line.strip()
        columns = stripped_line.split('\t')
        if len(columns) < 2:
            # Blank line, or a line with no tag column: there is nothing to
            # convert, so keep it instead of failing on a missing column.
            converted_lines.append(stripped_line)
            continue
        # Replace only the tag column so that any columns after it survive.
        columns[1] = bis_to_ilmt_conversion(columns[1])
        converted_lines.append('\t'.join(columns))
    return converted_lines
def main():
    """Pass arguments and call functions here.

    Expects two command-line arguments: the path of the input file to read
    and the path of the output file to write. Exits with a usage message when
    either one is missing.
    """
    if len(argv) < 3:
        # Fail with a readable message rather than an IndexError traceback.
        program = argv[0] if argv else 'script'
        raise SystemExit('Usage: {} <input_file> <output_file>'.format(program))
    input_file = argv[1]
    output_file = argv[2]
    input_lines = read_lines_from_file(input_file)
    converted_lines = convert_pos_tags_from_bis_to_ilmt(input_lines)
    write_lines_into_file(converted_lines, output_file)
# Run the conversion only when this file is executed as a script, so that
# importing it does not read the command-line arguments.
if __name__ == '__main__':
    main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment