Commit 25dc105b authored by Pruthwik

Odia Eval Code

parent ac5982d3
"""Tokenize the sentences and create features for testing with CRF."""
import argparse


def read_lines_from_file(file_path):
    """
    Read non-empty lines from a file.

    Args:
        file_path: Path of the input file
    Returns:
        lines: Stripped, non-empty lines from the file
    """
    with open(file_path, 'r', encoding='utf-8') as file_read:
        return [line.strip() for line in file_read if line.strip()]


def read_file_and_find_features_from_sentences(file_path):
    """
    Read lines from a file and find the features.

    Args:
        file_path: Path of the input file
    Returns:
        features_string: Features of all tokens for all the sentences
    """
    lines = read_lines_from_file(file_path)
    features_string = find_features_from_sentences(lines)
    return features_string


def find_features_from_sentences(sentences):
    """
    Find features for all the sentences.

    :param sentences: Sentences read from file
    :return features: Features of all tokens, one token per line, with a
        blank line separating sentences
    """
    prefix_len = 4
    suffix_len = 7
    features = ''
    for sentence in sentences:
        sentence_features = ''
        tokens = sentence.split()
        for token in tokens:
            # the token itself is the first column
            sentence_features += token + '\t'
            # prefixes of length 1 to prefix_len
            for i in range(1, prefix_len + 1):
                sentence_features += affix_feats(token, i, 0) + '\t'
            # suffixes of length 1 to suffix_len
            for i in range(1, suffix_len + 1):
                sentence_features += affix_feats(token, i, 1) + '\t'
            # the last column marks short tokens (<= 4 characters) vs long ones
            sentence_features += 'LESS\n' if len(token) <= 4 else 'MORE\n'
        if sentence_features.strip():
            features += sentence_features + '\n'
    return features
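
# A sketch of one feature row produced above, for the illustrative token
# 'hello' (the real input is Odia text; prefix_len=4, suffix_len=7):
#   hello  h  he  hel  hell  o  lo  llo  ello  hello  NULL  NULL  MORE
# i.e. 13 tab-separated columns: the token, 4 prefixes, 7 suffixes, and the
# LESS/MORE length marker.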


def affix_feats(token, length, type_aff):
    """
    Find features related to affixes.

    :param token: the token whose affix is extracted
    :param length: length of the affix
    :param type_aff: 0 for prefix and 1 for suffix
    :return: the prefix or suffix of the given length, or 'NULL' when the
        token is shorter than the affix
    """
    if len(token) < length:
        return 'NULL'
    if type_aff == 0:
        return token[:length]
    return token[-length:]
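
# Doctest-style sketch of affix_feats (values follow from the code above):
#   affix_feats('school', 3, 0) -> 'sch'   (prefix of length 3)
#   affix_feats('school', 3, 1) -> 'ool'   (suffix of length 3)
#   affix_feats('go', 3, 0)     -> 'NULL'  (token shorter than the affix)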


def write_file(out_path, data):
    """
    Write text to file.

    :param out_path: Path of the output file
    :param data: Token features, one token per line, sentences separated by
        a blank line
    :return: None
    """
    # the with block closes the file on exit; no explicit close is needed
    with open(out_path, 'w', encoding='utf-8') as fout:
        fout.write(data)


def main():
    """
    Parse arguments and call functions here.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='inp', help='Path of the input file from which tokens and their features will be extracted')
    parser.add_argument('--output', dest='out', help='Path of the output file where the features will be saved')
    args = parser.parse_args()
    features_extracted = read_file_and_find_features_from_sentences(args.inp)
    write_file(args.out, features_extracted)


if __name__ == '__main__':
    main()
# how to run this
# sh run_odia_pos_tagger.sh input_file_path output_file_path
# for output_file_path, just give a file name; the final token/tag pairs are written there
input_file=$1
output_file=$2
python create_features_for_crf_from_conll_pos_data_with_simple_splitting.py --input "$input_file" --output input-conll-pos-features.txt
crf_test -m odia_pos_4k_model input-conll-pos-features.txt > input_features_pos_predicted.txt
# field 1 is the token; field 14 is the POS tag that crf_test appends after the 13 feature columns
cut -f1,14 input_features_pos_predicted.txt > "$output_file"
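
# Example invocation (file names here are placeholders, not files from the repo):
#   sh run_odia_pos_tagger.sh my_odia_sentences.txt my_pos_predictions.txt
# my_pos_predictions.txt then holds two tab-separated columns per token:
# the token and its predicted POS tag.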