diff --git a/Odia-Evaluation-Code/create_features_for_crf_from_conll_pos_data_with_simple_splitting.py b/Odia-Evaluation-Code/create_features_for_crf_from_conll_pos_data_with_simple_splitting.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac3ee202343fdfdd62791e40c023a1cb665513a6
--- /dev/null
+++ b/Odia-Evaluation-Code/create_features_for_crf_from_conll_pos_data_with_simple_splitting.py
@@ -0,0 +1,93 @@
+"""Tokenize the sentences and create features for testing with CRF."""
+import argparse
+
+
+def read_lines_from_file(file_path):
+    """
+    Read lines from file.
+
+    :param file_path: Path of the input file
+    :return: Stripped, non-empty lines from the file
+    """
+    with open(file_path, 'r', encoding='utf-8') as file_read:
+        return [line.strip() for line in file_read if line.strip()]
+
+
+def read_file_and_find_features_from_sentences(file_path):
+    """
+    Read sentences from a file and extract CRF features for them.
+
+    :param file_path: Path of the input file, one sentence per line
+    :return: Tab-separated feature string covering all sentences
+    """
+    lines = read_lines_from_file(file_path)
+    return find_features_from_sentences(lines)
+
+
+def find_features_from_sentences(sentences):
+    """
+    Find features for all the sentences.
+
+    :param sentences: Sentences read from file
+    :return: Features of all tokens for each sentence combined for all the sentences
+    """
+    prefix_len = 4
+    suffix_len = 7
+    features = ''
+    for sentence in sentences:
+        sentence_features = ''
+        for token in sentence.split():
+            # One row per token: the token, its prefixes of length 1..4,
+            # then its suffixes of length 1..7, all tab-separated.
+            columns = [token]
+            columns += [affix_feats(token, i, 0) for i in range(1, prefix_len + 1)]
+            columns += [affix_feats(token, i, 1) for i in range(1, suffix_len + 1)]
+            # Final column marks whether the token is short (<= 4 chars) or long.
+            length_marker = 'LESS' if len(token) <= 4 else 'MORE'
+            sentence_features += '\t'.join(columns) + '\t' + length_marker + '\n'
+        if sentence_features.strip():
+            # Sentences are separated by a blank line, as CRF++ expects.
+            features += sentence_features + '\n'
+    return features
+
+
+def affix_feats(token, length, type_aff):
+    """
+    Find features related to affixes.
+
+    :param token: the token whose affix is extracted
+    :param length: length of the affix
+    :param type_aff: 0 for prefix and 1 for suffix
+    :return: the affix, or 'NULL' when the token is shorter than the affix length
+    """
+    if len(token) < length:
+        return 'NULL'
+    if type_aff == 0:
+        return token[:length]
+    return token[len(token) - length:]
+
+
+def write_file(out_path, data):
+    """
+    Write text to file.
+
+    :param out_path: Path of the output file
+    :param data: Token features of sentences separated by a blank line
+    :return: None
+    """
+    with open(out_path, 'w+', encoding='utf-8') as fout:
+        fout.write(data)
+
+
+def main():
+    """
+    Parse command line arguments and run the feature extraction.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input', dest='inp', help="Add the input path from where tokens and its features will be extracted")
+    parser.add_argument('--output', dest='out', help="Add the output file where the features will be saved")
+    args = parser.parse_args()
+    features_extracted = read_file_and_find_features_from_sentences(args.inp)
+    write_file(args.out, features_extracted)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/Odia-Evaluation-Code/odia_pos_4k_model b/Odia-Evaluation-Code/odia_pos_4k_model
new file mode 100644
index 0000000000000000000000000000000000000000..db88b799e1e5db2de00ac93edb3097394aaac2e3
Binary files /dev/null and b/Odia-Evaluation-Code/odia_pos_4k_model differ
diff --git a/Odia-Evaluation-Code/run_odia_pos_tagger_with_simple_splitting.sh b/Odia-Evaluation-Code/run_odia_pos_tagger_with_simple_splitting.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4d67b96be82e93f83606274e5614f14c381bf03c
--- /dev/null
+++ b/Odia-Evaluation-Code/run_odia_pos_tagger_with_simple_splitting.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+# how to run this
+# sh run_odia_pos_tagger.sh input_file_path output_file_path
+# for output_file_path, give just a name
+input_file="$1"
+output_file="$2"
+python create_features_for_crf_from_conll_pos_data_with_simple_splitting.py --input "$input_file" --output input-conll-pos-features.txt
+crf_test -m odia_pos_4k_model input-conll-pos-features.txt > input_features_pos_predicted.txt
+cut -f1,14 input_features_pos_predicted.txt > "$output_file"