"""Convert POS tags in a tab-separated file from BIS tags to ILMT tags.

Usage:
    python convert_bis_pos_tag_to_ilmt.py <input_file> <output_file>

Each non-blank input line is split on tabs and its second column is treated
as the BIS tag to convert; all other columns are copied through unchanged.
"""
from re import search
from sys import argv


# BIS tags that map to a fixed ILMT tag when the *whole* tag matches.
# None of these is matched by an earlier substring/regex rule below, so
# looking them up first gives the same result as the original if/elif order.
_EXACT_TAGS = {
    'N_NNV': 'VM',
    'PR_PRQ': 'WQ',
    'JJ': 'JJ',
    'RB': 'RB',
    'PSP': 'PSP',
    'CC_CCS_UT': 'UT',
    'RP_RPD': 'RP',
    'RD_PUNC': 'SYM',
    'RD_SYM': 'SYM',
    'RD_ECH': 'ECH',
    'RD_RDF': 'FW',
    'RD_UNK': 'UNK',
}

# (substring, ILMT tag) pairs; the first substring found anywhere in the tag
# wins, so the order of this tuple is significant.
_SUBSTRING_TAGS = (
    ('PR_PR', 'PRP'),
    ('DM_', 'DEM'),
    ('V_VM', 'VM'),
    ('V_VAUX', 'VAUX'),
    ('CC', 'CC'),
)

# For tags containing 'QT', the final character selects the ILMT tag.
_QT_LAST_CHAR_TAGS = {'F': 'QF', 'C': 'QC', 'O': 'QO'}


def bis_to_ilmt_conversion(bis_pos_tag):
    """Convert a BIS pos tag to ILMT pos tag.

    Double underscores in the input are collapsed to a single underscore
    before matching. Rules are tried in this order: exact matches, tags
    containing ``N_N...`` (the part after ``N_`` is returned), the ordered
    substring rules, tags containing ``RP_x...`` (the part after ``RP_`` is
    returned), and finally tags containing ``QT`` (decided by last character).

    :param bis_pos_tag: the BIS tag as a string.
    :return: the ILMT tag; ``'NN'`` when no rule applies (in which case
        ``BLANK`` is also printed to stdout).
    """
    tag = bis_pos_tag.replace('__', '_')
    if tag in _EXACT_TAGS:
        return _EXACT_TAGS[tag]
    n_match = search(r'N_(N.*)', tag)
    if n_match:
        return n_match.group(1)
    for marker, ilmt_tag in _SUBSTRING_TAGS:
        if marker in tag:
            return ilmt_tag
    rp_match = search(r'RP_(.+)', tag)
    if rp_match:
        return rp_match.group(1)
    if 'QT' in tag:
        qt_tag = _QT_LAST_CHAR_TAGS.get(tag[-1])
        if qt_tag:
            return qt_tag
    # No rule produced a tag: report it and fall back to 'NN'.
    print('BLANK')
    return 'NN'


def read_lines_from_file(file_path):
    """Read lines from a file.

    :param file_path: path of a UTF-8 encoded text file.
    :return: list of lines, each still carrying its line terminator.
    """
    with open(file_path, 'r', encoding='utf-8') as file_read:
        return file_read.readlines()


def write_lines_into_file(lines, file_path):
    """Write lines into a file.

    Every line is terminated with a newline, including the last one, so a
    trailing blank line in ``lines`` is preserved in the written file.

    :param lines: iterable of strings without line terminators.
    :param file_path: path of the UTF-8 file to create or overwrite.
    """
    with open(file_path, 'w', encoding='utf-8') as file_write:
        file_write.writelines(line + '\n' for line in lines)


def convert_pos_tags_from_bis_to_ilmt(lines):
    """Convert BIS pos tags into ILMT tags.

    :param lines: iterable of tab-separated lines whose second column is a
        BIS tag.
    :return: list of converted lines without line terminators. Blank lines
        and lines with fewer than two columns are passed through (stripped)
        unchanged; any columns after the tag are kept as they are.
    """
    converted_lines = []
    for line in lines:
        stripped = line.strip()
        columns = stripped.split('\t')
        if len(columns) < 2:
            # Blank line or a line without a tag column: nothing to convert.
            converted_lines.append(stripped)
            continue
        columns[1] = bis_to_ilmt_conversion(columns[1])
        converted_lines.append('\t'.join(columns))
    return converted_lines


def main():
    """Pass arguments and call functions here.

    Expects the input file path and the output file path as the first two
    command-line arguments; exits with a usage message if they are missing.
    """
    if len(argv) < 3:
        raise SystemExit(
            'Usage: convert_bis_pos_tag_to_ilmt.py <input_file> <output_file>'
        )
    input_file = argv[1]
    output_file = argv[2]
    input_lines = read_lines_from_file(input_file)
    converted_lines = convert_pos_tags_from_bis_to_ilmt(input_lines)
    write_lines_into_file(converted_lines, output_file)


if __name__ == '__main__':
    main()