Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
S
Shallow-Parser-Evaluation
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Environments
Packages & Registries
Packages & Registries
Package Registry
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
pruthwik mishra
Shallow-Parser-Evaluation
Commits
98807fc8
Commit
98807fc8
authored
May 04, 2022
by
Pruthwik
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
updated files
parent
f864fdd5
Changes
4
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
639 additions
and
0 deletions
+639
-0
Code/extract_data_from_ssf_in_conll_format_for_file.py
Code/extract_data_from_ssf_in_conll_format_for_file.py
+66
-0
Code/precision_recall_f1_score_chunking.py
Code/precision_recall_f1_score_chunking.py
+68
-0
Code/precision_recall_f1_score_pos.py
Code/precision_recall_f1_score_pos.py
+62
-0
Code/ssfAPI.py
Code/ssfAPI.py
+443
-0
No files found.
Code/extract_data_from_ssf_in_conll_format_for_file.py
0 → 100644
View file @
98807fc8
# how to run the code
# python3 extract_data_from_ssf_in_conll_format_for_file.py --input InputFilePath --output OutputFilePath --level 0/1/2/3
# level argument: 0 for token, 1 for token+pos, 2 for token+pos+morph, 3 for token+pos+chunk
# no need to create an output file, only give a name
# author : Pruthwik Mishra, LTRC, IIIT-H
# also download the ssfAPI.py program.
import
ssfAPI
as
ssf
import
argparse
import
re
def readFileAndExtractSentencesInConLL(inputFilePath, outputFilePath, level=0):
    """Read an SSF file and write its sentences to a file in CoNLL format.

    The input is loaded with ssf.Document; every tree in its nodeList is
    turned into one block of newline-separated rows, and all blocks are
    handed to writeListToFile.  Tokens whose text starts with 'NUL' are
    left out of the output at every level.

    Args:
        inputFilePath: path of the SSF file to read.
        outputFilePath: path of the file to create.
        level: what to emit per token.
            0 -> token
            1 -> token, POS
            2 -> token, POS, value of the 'af' attribute
            any other value -> token, POS, chunk tag ('B-'/'I-' + chunk type)

    The input path and each tree's sentenceID are printed as progress output.
    """
    def isNullToken(text):
        # Same test the original applied inline to tokens and to node.lex.
        return re.search('^NUL', text) is not None

    d = ssf.Document(inputFilePath)
    sentencesList = list()
    print(inputFilePath)
    for tree in d.nodeList:
        print(tree.sentenceID)
        if level == 0:
            rows = [
                token
                for token in tree.generateSentence().split()
                if not isNullToken(token)
            ]
        elif level == 1:
            rows = [
                node.lex + '\t' + node.type.replace('__', '_')
                for chunkNode in tree.nodeList
                for node in chunkNode.nodeList
                if not isNullToken(node.lex)
            ]
        elif level == 2:
            rows = [
                node.lex + '\t' + node.type.replace('__', '_') + '\t' + node.getAttribute('af')
                for chunkNode in tree.nodeList
                for node in chunkNode.nodeList
                if not isNullToken(node.lex)
            ]
        else:
            rows = list()
            for chunkNode in tree.nodeList:
                # The first token that is actually written for a chunk must
                # carry 'B-'.  Keying this on the node index (as before) gave
                # the chunk an 'I-' start whenever its first node was a
                # skipped NUL token, which is not valid IOB2.
                chunkPrefix = 'B-'
                for node in chunkNode.nodeList:
                    if isNullToken(node.lex):
                        continue
                    rows.append(
                        node.lex + '\t' + node.type.replace('__', '_')
                        + '\t' + chunkPrefix + chunkNode.type
                    )
                    chunkPrefix = 'I-'
        sentencesList.append('\n'.join(rows) + '\n')
    writeListToFile(sentencesList, outputFilePath)
def writeListToFile(dataList, outFilePath):
    """Write the strings of dataList to a UTF-8 file.

    The items are joined with a newline and one more newline is appended at
    the end, so an empty list produces a file holding a single newline.

    Args:
        dataList: list of strings to write.
        outFilePath: path of the file to create or overwrite.
    """
    # The with-statement closes the file; no explicit close() is needed.
    with open(outFilePath, 'w', encoding='utf-8') as fileWrite:
        fileWrite.write('\n'.join(dataList) + '\n')
def main():
    """Parse the command-line arguments and run the extraction.

    Options:
        --input:  path of the input file (required).
        --output: path of the output file (required).
        --level:  integer output level, 0 by default.
    """
    parser = argparse.ArgumentParser()
    # Both paths are mandatory: without them None would be passed on and the
    # run would fail later with an unrelated-looking error.
    parser.add_argument('--input', dest='inp', required=True,
                        help="Add the input file path")
    parser.add_argument('--output', dest='out', required=True,
                        help="Add the output file path")
    parser.add_argument('--level', dest='level',
                        help="Add the level 0: token, 1: token + pos, 2: token + pos + morph, 3 for token + pos + chunk",
                        type=int, default=0)
    args = parser.parse_args()
    readFileAndExtractSentencesInConLL(args.inp, args.out, args.level)


if __name__ == '__main__':
    main()
Code/precision_recall_f1_score_chunking.py
0 → 100644
View file @
98807fc8
"""Evaluate chunk metrics."""
# the input file has this structure
# token\tgold-pos\tgold-chunk\tpred-chunk
# cut the predicted chunk output from the shallow parse output
# paste it with the gold-pos-chunk file
# if seqeval not installed
# install using pip install seqeval
from
seqeval.metrics
import
classification_report
from
seqeval.metrics
import
accuracy_score
from
seqeval.metrics
import
f1_score
from
seqeval.scheme
import
IOB2
from
sys
import
argv
def read_lines_from_file(file_path):
    """Return all lines of a UTF-8 text file, line endings included."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        all_lines = list(handle)
    return all_lines
def process_lines_prepare_gold_and_system_outputs(lines):
    """Split the input lines into per-sentence gold and predicted tag lists.

    Every non-blank line is split on whitespace; its second-to-last field is
    taken as the gold tag and its last field as the predicted tag.  A blank
    line ends the current sentence.  Leading, trailing or repeated blank
    lines do not create empty sentences.

    Args:
        lines: iterable of text lines.

    Returns:
        A tuple (gold_all, pred_all) of two lists of equal length, each
        holding one list of tags per sentence.

    Raises:
        ValueError: if a non-blank line has fewer than two fields.
    """
    gold_all, pred_all = [], []
    temp_gold, temp_pred = [], []
    for line_number, line in enumerate(lines, start=1):
        fields = line.split()
        if fields:
            if len(fields) < 2:
                raise ValueError(
                    'line %d: expected at least a gold and a predicted tag, got %r'
                    % (line_number, line.strip())
                )
            temp_gold.append(fields[-2])
            temp_pred.append(fields[-1])
        elif temp_gold:
            # Blank line after some tokens: close the sentence.  Both lists
            # grow together, so checking one of them is enough.
            gold_all.append(temp_gold)
            pred_all.append(temp_pred)
            temp_gold, temp_pred = [], []
    if temp_gold:
        # Input did not end with a blank line; keep the last sentence.
        gold_all.append(temp_gold)
        pred_all.append(temp_pred)
    return gold_all, pred_all
def generate_classification_metrics(gold, pred):
    """Build a text report of chunking metrics with the seqeval package.

    The report consists of seqeval's classification report followed by an
    'Accuracy = ...' line and a 'Micro_F1 = ...' line.

    Args:
        gold: list of gold tag sequences, one list per sentence.
        pred: list of predicted tag sequences, aligned with gold.

    Returns:
        The report as a single string without a trailing newline.
    """
    report_parts = [
        classification_report(gold, pred, mode='strict', scheme=IOB2),
        'Accuracy = ' + str(accuracy_score(gold, pred)),
        # Use the same strict IOB2 evaluation as the report above; with the
        # default mode the F1 printed here could disagree with the report's
        # own micro average on ill-formed tag sequences.
        'Micro_F1 = ' + str(f1_score(gold, pred, mode='strict', scheme=IOB2)),
    ]
    return '\n'.join(report_parts)
def write_data_into_file(data, file_path):
    """Store data, followed by one newline, in a UTF-8 file at file_path."""
    with open(file_path, mode='w', encoding='utf-8') as sink:
        sink.write(''.join((data, '\n')))
def main():
    """Read the command-line arguments and write the chunking report.

    argv[1] is the input file (token, gold POS, gold chunk and predicted
    chunk per line) and argv[2] is the file the report is written to.
    Exits with a usage message when either argument is missing.
    """
    if len(argv) < 3:
        raise SystemExit(
            'Usage: python3 precision_recall_f1_score_chunking.py input_file output_file'
        )
    input_file, output_file = argv[1], argv[2]
    input_lines = read_lines_from_file(input_file)
    gold_all, pred_all = process_lines_prepare_gold_and_system_outputs(input_lines)
    class_report = generate_classification_metrics(gold_all, pred_all)
    write_data_into_file(class_report, output_file)


if __name__ == '__main__':
    main()
Code/precision_recall_f1_score_pos.py
0 → 100644
View file @
98807fc8
"""Precision, recall, F1 score for POS."""
# the inputs to this program are:
# gold pos outputs, pred pos outputs and a file name
# where the classification results will be written.
# if you do not have sklearn
# install using pip install scikit-learn
from
sys
import
argv
from
sklearn.metrics
import
classification_report
from
sklearn.metrics
import
f1_score
from
sklearn.metrics
import
precision_score
from
sklearn.metrics
import
recall_score
from
sklearn.metrics
import
accuracy_score
def readLinesFromFile(filePath):
    """Return the stripped, non-empty lines of a UTF-8 text file."""
    with open(filePath, mode='r', encoding='utf-8') as fileRead:
        strippedLines = (rawLine.strip() for rawLine in fileRead)
        # Consume the generator while the file is still open.
        return [text for text in strippedLines if text]
def findPrecisionRecallF1score(goldLabels, predictedLabels, trueLabels=None):
    """Return sklearn's classification report for the given labels.

    trueLabels is forwarded to classification_report as target_names.
    """
    report = classification_report(
        goldLabels,
        predictedLabels,
        target_names=trueLabels,
    )
    return report
def main():
    """Read the command-line arguments and report POS tagging metrics.

    argv[1] is the gold label file, argv[2] the predicted label file and
    argv[3] the file the report is written to.  Exits with a usage message
    when an argument is missing.

    The classification report is always written to the output file.  When
    more or fewer than two labels are involved, micro-averaged precision,
    recall, F1 and accuracy are printed and appended to the written report.
    When gold and predicted together use exactly two labels and both are
    predicted, the scores are computed with average='binary' and only
    printed.
    """
    if len(argv) < 4:
        raise SystemExit(
            'Usage: python3 precision_recall_f1_score_pos.py gold_file pred_file output_file'
        )
    goldPath, predPath, outPath = argv[1], argv[2], argv[3]
    gold = readLinesFromFile(goldPath)
    predicted = readLinesFromFile(predPath)
    # Sort the labels so the label -> index mapping (and with it the class
    # treated as positive in the binary case) is the same on every run;
    # plain set iteration order changes with the string hash seed.
    allLabels = sorted(set(predicted).union(gold))
    dictLabelToIndices = {label: index for index, label in enumerate(allLabels)}
    predictedIntoIndexes = [dictLabelToIndices[item] for item in predicted]
    goldIntoIndexes = [dictLabelToIndices[item] for item in gold]
    classReport = findPrecisionRecallF1score(gold, predicted)
    accuracy = accuracy_score(goldIntoIndexes, predictedIntoIndexes)
    # average='binary' is only valid when the gold labels are binary as well,
    # so also require that no third label appears anywhere.
    if len(set(predictedIntoIndexes)) == 2 and len(allLabels) == 2:
        print('Micro Precision =', precision_score(goldIntoIndexes, predictedIntoIndexes, average='binary'))
        print('Micro Recall =', recall_score(goldIntoIndexes, predictedIntoIndexes, average='binary'))
        print('Micro F1 =', f1_score(goldIntoIndexes, predictedIntoIndexes, average='binary'))
        print('Micro Accuracy =', accuracy)
    else:
        # Compute every score once and reuse it for the file and the console.
        precision = precision_score(goldIntoIndexes, predictedIntoIndexes, average='micro')
        recall = recall_score(goldIntoIndexes, predictedIntoIndexes, average='micro')
        f1 = f1_score(goldIntoIndexes, predictedIntoIndexes, average='micro')
        classReport += '\n'
        classReport += 'Micro_Precision = ' + str(precision) + '\n'
        print('Micro Precision =', precision)
        classReport += 'Micro_Recall = ' + str(recall) + '\n'
        print('Micro Recall =', recall)
        classReport += 'Micro_F1 = ' + str(f1) + '\n'
        print('Micro F1 =', f1)
        classReport += 'Micro_Accuracy = ' + str(accuracy) + '\n'
        print('Micro Accuracy =', accuracy)
    # Open the file only once the report is complete, and let the
    # with-statement close it even if the write fails.
    with open(outPath, 'w', encoding='utf-8') as outDesc:
        outDesc.write(classReport + '\n')


if __name__ == '__main__':
    main()
Code/ssfAPI.py
0 → 100755
View file @
98807fc8
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment