Commit 98807fc8 authored by Pruthwik's avatar Pruthwik

updated files

parent f864fdd5
# how to run the code
# python3 extract_data_from_ssf_in_conll_format.py --input InputFilePath --output OutputFilePath --level 0/1/2/3
# level argument: 0 for token, 1 for token+pos, 2 for token+pos+morph, 3 for token+pos+chunk
# no need to create an output file, only give a name
# author : Pruthwik Mishra, LTRC, IIIT-H
# also download the ssfAPI.py program.
import ssfAPI as ssf
import argparse
import re
def readFileAndExtractSentencesInConLL(inputFilePath, outputFilePath, level=0):
    """Read an SSF file and write its sentences in CoNLL format.

    :param inputFilePath: path of the input SSF file
    :param outputFilePath: path of the output file to be created
    :param level: 0 for token, 1 for token+pos, 2 for token+pos+morph,
                  3 (or anything else) for token+pos+chunk in IOB2 tags
    """
    # compiled once here instead of re.search('^NUL', ...) in every loop;
    # NULL tokens are dropped from all output levels
    nullPattern = re.compile('^NUL')
    d = ssf.Document(inputFilePath)
    sentencesList = list()
    print(inputFilePath)
    for tree in d.nodeList:
        print(tree.sentenceID)
        if level == 0:
            sentencesList.append('\n'.join(
                [token for token in tree.generateSentence().split()
                 if not nullPattern.search(token)]) + '\n')
        elif level == 1:
            tokensWithPOS = [
                node.lex + '\t' + node.type.replace('__', '_')
                for chunkNode in tree.nodeList
                for node in chunkNode.nodeList
                if not nullPattern.search(node.lex)]
            sentencesList.append('\n'.join(tokensWithPOS) + '\n')
        elif level == 2:
            tokensWithPOSMorph = [
                node.lex + '\t' + node.type.replace('__', '_') + '\t' +
                node.getAttribute('af')
                for chunkNode in tree.nodeList
                for node in chunkNode.nodeList
                if not nullPattern.search(node.lex)]
            sentencesList.append('\n'.join(tokensWithPOSMorph) + '\n')
        else:
            tokenPOSAndChunk = list()
            for chunkNode in tree.nodeList:
                for indexNode, node in enumerate(chunkNode.nodeList):
                    if nullPattern.search(node.lex):
                        continue
                    # first node of a chunk gets B-, the rest I- (IOB2);
                    # the original duplicated the whole append for B/I
                    chunkPrefix = '\tB-' if indexNode == 0 else '\tI-'
                    tokenPOSAndChunk.append(
                        node.lex + '\t' + node.type.replace('__', '_') +
                        chunkPrefix + chunkNode.type)
            sentencesList.append('\n'.join(tokenPOSAndChunk) + '\n')
    writeListToFile(sentencesList, outputFilePath)
def writeListToFile(dataList, outFilePath):
    """Write each item of dataList to outFilePath, one item per line.

    A trailing newline is always emitted.  The explicit close() from the
    original was redundant: the with-block already closes the file.
    """
    with open(outFilePath, 'w', encoding='utf-8') as fileWrite:
        fileWrite.write('\n'.join(dataList) + '\n')
def main():
    """Parse the command-line arguments and trigger the extraction."""
    argParser = argparse.ArgumentParser()
    argParser.add_argument('--input', dest='inp',
                           help="Add the input file path")
    argParser.add_argument('--output', dest='out',
                           help="Add the output file path")
    argParser.add_argument('--level', dest='level',
                           help="Add the level 0: token, 1: token + pos, 2: token + pos + morph, 3 for token + pos + chunk", type=int, default=0)
    parsedArgs = argParser.parse_args()
    readFileAndExtractSentencesInConLL(
        parsedArgs.inp, parsedArgs.out, parsedArgs.level)


if __name__ == '__main__':
    main()
"""Evaluate chunk metrics."""
# the input file has this structure
# token\tgold-pos\tgold-chunk\tpred-chunk
# cut the predicted chunk output from the shallow parse output
# paste it with the gold-pos-chunk file
# if seqeval not installed
# install using pip install seqeval
from seqeval.metrics import classification_report
from seqeval.metrics import accuracy_score
from seqeval.metrics import f1_score
from seqeval.scheme import IOB2
from sys import argv
def read_lines_from_file(file_path):
    """Return all lines of *file_path* as a list, newlines preserved."""
    with open(file_path, 'r', encoding='utf-8') as file_read:
        all_lines = file_read.readlines()
    return all_lines
def process_lines_prepare_gold_and_system_outputs(lines):
    """Split input lines into per-sentence gold and predicted tag lists.

    Each non-empty line is expected to carry the gold tag and the
    predicted tag as its last two whitespace-separated columns; a blank
    line marks a sentence boundary.  Returns (gold_all, pred_all), two
    parallel lists of per-sentence tag lists.
    """
    gold_all = []
    pred_all = []
    sent_gold = []
    sent_pred = []
    for raw_line in lines:
        stripped = raw_line.strip()
        if not stripped:
            # sentence boundary: flush the accumulated tags
            assert len(sent_gold) == len(sent_pred)
            gold_all.append(sent_gold)
            pred_all.append(sent_pred)
            sent_gold = []
            sent_pred = []
            continue
        gold_tag, pred_tag = stripped.split()[-2:]
        sent_gold.append(gold_tag)
        sent_pred.append(pred_tag)
    if sent_gold and sent_pred:
        # the final sentence may not be followed by a blank line
        assert len(sent_gold) == len(sent_pred)
        gold_all.append(sent_gold)
        pred_all.append(sent_pred)
    return gold_all, pred_all
def generate_classification_metrics(gold, pred):
    """Build a text report: per-class metrics, accuracy and micro F1.

    gold and pred are parallel lists of per-sentence IOB2 tag lists, as
    produced by process_lines_prepare_gold_and_system_outputs.
    """
    report_parts = [
        classification_report(gold, pred, mode='strict', scheme=IOB2) + '\n',
        'Accuracy = ' + str(accuracy_score(gold, pred)) + '\n',
        'Micro_F1 = ' + str(f1_score(gold, pred)),
    ]
    return ''.join(report_parts)
def write_data_into_file(data, file_path):
    """Write *data* to *file_path*, followed by a trailing newline."""
    with open(file_path, 'w', encoding='utf-8') as file_write:
        # print appends exactly one '\n', matching data + '\n'
        print(data, file=file_write)
def main():
    """Read the combined gold/pred file and write the evaluation report."""
    input_lines = read_lines_from_file(argv[1])
    gold_all, pred_all = process_lines_prepare_gold_and_system_outputs(
        input_lines)
    report = generate_classification_metrics(gold_all, pred_all)
    write_data_into_file(report, argv[2])


if __name__ == '__main__':
    main()
"""Precision, recall, F1 score for POS."""
# the inputs to these program are:
# gold pos outputs, pred pos outputs and a file name
# where the classification results will be written.
# if you do not have sklearn
# install using pip install sklearn
from sys import argv
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
def readLinesFromFile(filePath):
    """Return the stripped, non-empty lines of filePath."""
    with open(filePath, 'r', encoding='utf-8') as fileRead:
        strippedLines = (line.strip() for line in fileRead)
        return [line for line in strippedLines if line]
def findPrecisionRecallF1score(goldLabels, predictedLabels, trueLabels=None):
    """Return sklearn's per-class precision/recall/F1 report as a string.

    trueLabels, when given, supplies display names for the classes.
    """
    report = classification_report(
        goldLabels, predictedLabels, target_names=trueLabels)
    return report
def main():
    """Compute POS classification metrics and write them to a report file.

    argv[1]: file with gold labels, one per line
    argv[2]: file with predicted labels, one per line
    argv[3]: output path for the classification report
    """
    goldPath = argv[1]
    predPath = argv[2]
    outPath = argv[3]
    gold = readLinesFromFile(goldPath)
    predicted = readLinesFromFile(predPath)
    # map every label seen in either file to an integer index so the
    # score functions receive numeric classes
    allLabels = set(predicted).union(set(gold))
    dictLabelToIndices = {label: index for index,
                          label in enumerate(allLabels)}
    predictedIntoIndexes = [dictLabelToIndices[item] for item in predicted]
    goldIntoIndexes = [dictLabelToIndices[item] for item in gold]
    classReport = ''
    classReport += findPrecisionRecallF1score(gold, predicted)
    # NOTE(review): binary vs multi-class is decided from the predicted
    # labels only; a degenerate two-label prediction over a multi-class
    # gold set would take the binary path -- confirm this is intended.
    if len(set(predictedIntoIndexes)) == 2:
        average = 'binary'
    else:
        average = 'micro'
    # Fix: the original printed the binary-case metrics but never wrote
    # them into the report file; both cases are now persisted.
    precision = precision_score(goldIntoIndexes, predictedIntoIndexes,
                                average=average)
    recall = recall_score(goldIntoIndexes, predictedIntoIndexes,
                          average=average)
    f1 = f1_score(goldIntoIndexes, predictedIntoIndexes, average=average)
    accuracy = accuracy_score(goldIntoIndexes, predictedIntoIndexes)
    classReport += '\n'
    classReport += 'Micro_Precision = ' + str(precision) + '\n'
    print('Micro Precision =', precision)
    classReport += 'Micro_Recall = ' + str(recall) + '\n'
    print('Micro Recall =', recall)
    classReport += 'Micro_F1 = ' + str(f1) + '\n'
    print('Micro F1 =', f1)
    classReport += 'Micro_Accuracy = ' + str(accuracy) + '\n'
    print('Micro Accuracy =', accuracy)
    # Fix: the report file is now opened via a context manager with an
    # explicit encoding (the original used a bare open() without one).
    with open(outPath, 'w', encoding='utf-8') as outDesc:
        outDesc.write(classReport + '\n')


if __name__ == '__main__':
    main()
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment