updated files

98807fc8 · Pruthwik · f864fdd5 · 98807fc8 · 98807fc8 · 98807fc8
Commit 98807fc8 authored May 04, 2022 by Pruthwik
4 changed files
--- a/Code/extract_data_from_ssf_in_conll_format_for_file.py
+++ b/Code/extract_data_from_ssf_in_conll_format_for_file.py
+# how to run the code
+# python3 extract_data_from_ssf_in_conll_format_for_file.py --input InputFilePath --output OutputFilePath --level 0/1/2/3
+# level argument: 0 for token, 1 for token+pos, 2 for token+pos+morph, 3 for token+pos+chunk
+# no need to create an output file, only give a name
+# author : Pruthwik Mishra, LTRC, IIIT-H
+# also download ssfAPI.py, which this program imports.
+import ssfAPI as ssf
+import argparse
+import re
+
+
+def readFileAndExtractSentencesInConLL(inputFilePath, outputFilePath, level=0):
+    """Read a file and extract sentences in conll format."""
+    d = ssf.Document(inputFilePath)
+    sentencesList = list()
+    print(inputFilePath)
+    for tree in d.nodeList:
+        print(tree.sentenceID)
+        if level == 0:
+            sentencesList.append('\n'.join([token for token in tree.generateSentence(
+            ).split() if not re.search('^NUL', token)]) + '\n')
+        elif level == 1:
+            tokensWithPOS = [node.lex + '\t' + node.type.replace(
+                '__', '_') for chunkNode in tree.nodeList for node in chunkNode.nodeList if not re.search('^NUL', node.lex)]
+            sentencesList.append('\n'.join(tokensWithPOS) + '\n')
+        elif level == 2:
+            tokensWithPOSMorph = [node.lex + '\t' + node.type.replace('__', '_') + '\t' + node.getAttribute(
+                'af') for chunkNode in tree.nodeList for node in chunkNode.nodeList if not re.search('^NUL', node.lex)]
+            sentencesList.append('\n'.join(tokensWithPOSMorph) + '\n')
+        else:
+            tokenPOSAndChunk = list()
+            for chunkNode in tree.nodeList:
+                for indexNode, node in enumerate(chunkNode.nodeList):
+                    if indexNode == 0:
+                        if not re.search('^NUL', node.lex):
+                            tokenPOSAndChunk.append(
+                                node.lex + '\t' + node.type.replace('__', '_') + '\tB-' + chunkNode.type)
+                    else:
+                        if not re.search('^NUL', node.lex):
+                            tokenPOSAndChunk.append(
+                                node.lex + '\t' + node.type.replace('__', '_') + '\tI-' + chunkNode.type)
+            sentencesList.append('\n'.join(tokenPOSAndChunk) + '\n')
+    writeListToFile(sentencesList, outputFilePath)
+
+
+def writeListToFile(dataList, outFilePath):
+    with open(outFilePath, 'w', encoding='utf-8') as fileWrite:
+        fileWrite.write('\n'.join(dataList) + '\n')
+        fileWrite.close()
+
+
+def main():
+    """Pass arguments and call functions here."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input', dest='inp',
+                        help="Add the input file path")
+    parser.add_argument('--output', dest='out',
+                        help="Add the output file path")
+    parser.add_argument('--level', dest='level',
+                        help="Add the level 0: token, 1: token + pos, 2: token + pos + morph, 3 for token + pos + chunk", type=int, default=0)
+    args = parser.parse_args()
+    readFileAndExtractSentencesInConLL(args.inp, args.out, args.level)
+
+
+# Run the command-line entry point only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
--- a/Code/precision_recall_f1_score_chunking.py
+++ b/Code/precision_recall_f1_score_chunking.py
+"""Evaluate chunk metrics."""
+# the input file has this structure
+# token\tgold-pos\tgold-chunk\tpred-chunk
+# cut the predicted chunk output from the shallow parse output
+# paste it with the gold-pos-chunk file
+# if seqeval is not installed,
+# install it using: pip install seqeval
+from seqeval.metrics import classification_report
+from seqeval.metrics import accuracy_score
+from seqeval.metrics import f1_score
+from seqeval.scheme import IOB2
+from sys import argv
+
+
+def read_lines_from_file(file_path):
+    """Read lines from a file."""
+    with open(file_path, 'r', encoding='utf-8') as file_read:
+        return file_read.readlines()
+
+
+def process_lines_prepare_gold_and_system_outputs(lines):
+    """Process input lines and prepare gold and system outputs."""
+    gold_all, pred_all, temp_gold, temp_pred = list(), list(), list(), list()
+    for line in lines:
+        line = line.strip()
+        if line:
+            gold, pred = line.split()[-2:]
+            temp_gold.append(gold)
+            temp_pred.append(pred)
+        else:
+            assert len(temp_gold) == len(temp_pred)
+            gold_all.append(temp_gold)
+            pred_all.append(temp_pred)
+            temp_gold, temp_pred = list(), list()
+    if temp_gold and temp_pred:
+        assert len(temp_gold) == len(temp_pred)
+        gold_all.append(temp_gold)
+        pred_all.append(temp_pred)
+    return gold_all, pred_all
+
+
+def generate_classification_metrics(gold, pred):
+    """Generate classification metrics using seqeval package."""
+    class_report = ''
+    class_report += classification_report(gold, pred, mode='strict', scheme=IOB2) + '\n'
+    class_report += 'Accuracy = ' + str(accuracy_score(gold, pred)) + '\n'
+    class_report += 'Micro_F1 = ' + str(f1_score(gold, pred))
+    return class_report
+
+
+def write_data_into_file(data, file_path):
+    """Write data into a file."""
+    with open(file_path, 'w', encoding='utf-8') as file_write:
+        file_write.write(data + '\n')
+
+
+def main():
+    """Pass arguments and call functions here."""
+    input_file = argv[1]
+    output_file = argv[2]
+    input_lines = read_lines_from_file(input_file)
+    gold_all, pred_all = process_lines_prepare_gold_and_system_outputs(input_lines)
+    class_report = generate_classification_metrics(gold_all, pred_all)
+    write_data_into_file(class_report, output_file)
+
+
+# Run the evaluation only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
--- a/Code/precision_recall_f1_score_pos.py
+++ b/Code/precision_recall_f1_score_pos.py
+"""Precision, recall, F1 score for POS."""
+# the inputs to this program are:
+# gold pos outputs, pred pos outputs and a file name
+# where the classification results will be written.
+# if you do not have sklearn (scikit-learn),
+# install it using: pip install scikit-learn
+from sys import argv
+from sklearn.metrics import classification_report
+from sklearn.metrics import f1_score
+from sklearn.metrics import precision_score
+from sklearn.metrics import recall_score
+from sklearn.metrics import accuracy_score
+
+
+def readLinesFromFile(filePath):
+    """Read lines from a file."""
+    with open(filePath, 'r', encoding='utf-8') as fileRead:
+        return [line.strip() for line in fileRead.readlines() if line.strip()]
+
+
+def findPrecisionRecallF1score(goldLabels, predictedLabels, trueLabels=None):
+    """Find Precision, Recall and F1 scores."""
+    return classification_report(goldLabels,
+                                 predictedLabels, target_names=trueLabels)
+
+
+def main():
+    """Pass arguments and call functions here."""
+    goldPath = argv[1]
+    predPath = argv[2]
+    outPath = argv[3]
+    gold = readLinesFromFile(goldPath)
+    predicted = readLinesFromFile(predPath)
+    allLabels = set(predicted).union(set(gold))
+    dictLabelToIndices = {label: index for index,
+                          label in enumerate(allLabels)}
+    predictedIntoIndexes = [dictLabelToIndices[item] for item in predicted]
+    goldIntoIndexes = [dictLabelToIndices[item] for item in gold]
+    outDesc = open(outPath, 'w')
+    classReport = ''
+    classReport += findPrecisionRecallF1score(gold, predicted)
+    if len(set(predictedIntoIndexes)) == 2:
+        print('Micro Precision =', precision_score(goldIntoIndexes, predictedIntoIndexes, average='binary'))
+        print('Micro Recall =', recall_score(goldIntoIndexes, predictedIntoIndexes, average='binary'))
+        print('Micro F1 =', f1_score(goldIntoIndexes, predictedIntoIndexes, average='binary'))
+        print('Micro Accuracy =', accuracy_score(goldIntoIndexes, predictedIntoIndexes))
+    else:
+        classReport += '\n'
+        classReport += 'Micro_Precision = ' + str(precision_score(goldIntoIndexes, predictedIntoIndexes, average='micro')) + '\n'
+        print('Micro Precision =', precision_score(goldIntoIndexes, predictedIntoIndexes, average='micro'))
+        classReport += 'Micro_Recall = ' + str(recall_score(goldIntoIndexes, predictedIntoIndexes, average='micro')) + '\n'
+        print('Micro Recall =', recall_score(goldIntoIndexes, predictedIntoIndexes, average='micro'))
+        classReport += 'Micro_F1 = ' + str(f1_score(goldIntoIndexes, predictedIntoIndexes, average='micro')) + '\n'
+        print('Micro F1 =', f1_score(goldIntoIndexes, predictedIntoIndexes, average='micro'))
+        classReport += 'Micro_Accuracy = ' + str(accuracy_score(goldIntoIndexes, predictedIntoIndexes)) + '\n'
+        print('Micro Accuracy =', accuracy_score(goldIntoIndexes, predictedIntoIndexes))
+    outDesc.write(classReport + '\n')
+    outDesc.close()
+
+
+# Run the scoring only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
--- a/Code/ssfAPI.py
+++ b/Code/ssfAPI.py