ssf_reader.py 6.11 KB
Newer Older
priyank's avatar
priyank committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
#!/usr/bin/python

# Copyright Riyaz Ahmad Bhat, Irshad Ahmad Bhat 2015.

import re
from collections import namedtuple, OrderedDict

class SSFReader(object):
    """Read one SSF-annotated sentence and flatten it into node records.

    Usage: ``SSFReader(sentence).getAnnotations()``.  After the call,
    ``nodeList`` holds one ``node`` namedtuple per input line and
    ``fs_order`` holds, for each node, the (reversed) positions of the
    feature-structure attributes that were present on that line.

    The code runs unchanged on Python 2 and Python 3.
    """

    # Feature-structure attributes; the index of a name in this tuple is the
    # number stored in ``fs_order``.  #NOTE add node
    _FS_ATTRIBUTES = (
        'af', 'name', 'head', 'chunkId', 'chunkType', 'posn', 'vpos', 'drel',
        'coref', 'stype', 'voicetype', 'poslcat', 'mtype', 'troot', 'etype',
        'etype_root', 'emph', 'esubtype', 'etype_name', 'agr_num', 'hon',
        'agr_cas', 'agr_gen',
    )

    # Fields of the ``node`` record, in positional order.  #NOTE add node
    _NODE_FIELDS = (
        'id', 'wordForm', 'posTag', 'af', 'name', 'head', 'chunkId',
        'chunkType', 'posn', 'vpos', 'drel', 'coref', 'stype', 'voicetype',
        'poslcat', 'mtype', 'troot', 'corel', 'parent', 'dmrel', 'etype',
        'etype_root', 'emph', 'esubtype', 'etype_name', 'agr_num', 'hon',
        'agr_cas', 'agr_gen',
    )

    # Attributes whose value is copied to the same-named node field after
    # quote removal, with no further parsing.  #NOTE add node
    _PLAIN_ATTRIBUTES = frozenset((
        'name', 'head', 'posn', 'vpos', 'poslcat', 'mtype', 'troot', 'stype',
        'voicetype', 'etype', 'etype_root', 'emph', 'esubtype', 'etype_name',
        'agr_num', 'hon', 'agr_cas', 'agr_gen',
    ))

    _QUOTES = re.compile("'|\"")        # any single or double quote
    _AF_OPENING = re.compile("af='+")   # af= followed by one or more quotes

    def __init__(self, sentence):
        """Store *sentence* (the raw SSF text) and set up empty result lists."""
        self.tokens = list()
        self.fs_order = list()
        self.nodeList = list()
        self.sentence = sentence
        # Set by FSPairs() and read by buildNode(); initialised here so that
        # a node built before any feature structure was parsed does not fail.
        self.dmrel_ = False

        self.node = namedtuple('node', self._NODE_FIELDS)
        self.maping = dict(zip(self._FS_ATTRIBUTES, range(len(self._FS_ATTRIBUTES))))
        self.features = namedtuple('features', ('lemma', 'cat', 'gen', 'num', 'per', 'case', 'vib', 'tam'))

    @staticmethod
    def _asciiOnly(text):
        """Return *text* with every non-ASCII character dropped.

        Byte strings stay byte strings and text strings stay text strings,
        so the result has the same type as the input on Python 2 and 3.
        """
        if isinstance(text, bytes):
            return text.decode("ascii", "ignore").encode("ascii")
        return text.encode("ascii", "ignore").decode("ascii")

    def _unquote(self, value):
        """Return *value* with all single and double quotes removed."""
        return self._QUOTES.sub('', value)

    def morphFeatures(self, af):
        """LEMMA, CAT, GEN, NUM, PER, CASE, VIB, TAM

        Drop the first and last character of *af* (the surrounding quotes
        as produced by FSPairs) and split the rest on commas.

        Returns the list of eight fields.
        Raises AssertionError when the field count is not eight.
        """
        af = af[1:-1].split(",")
        assert len(af) == 8, "af must have exactly 8 fields"  #NOTE no need to process trash!
        return af

    def buildNode(self, id_, form_, tag_, pairs_):
        """Build one ``node`` record and append it to ``nodeList``.

        id_    -- value for the ``id`` field.
        form_  -- value for the ``wordForm`` field.
        tag_   -- value for the ``posTag`` field; non-ASCII characters are dropped.
        pairs_ -- mapping of attribute name to raw (still quoted) value,
                  as returned by FSPairs().

        Fields without a matching attribute are the empty string; ``dmrel``
        is taken from the flag left by the most recent FSPairs() call.
        Attributes that are not recognised are ignored.  Also appends the
        reversed list of known-attribute positions to ``fs_order``.

        Raises AssertionError for a malformed ``af``, ``chunkType`` or
        ``drel`` value.
        """
        record = dict.fromkeys(self.node._fields, str())
        record['id'] = id_
        record['wordForm'] = form_
        record['posTag'] = self._asciiOnly(tag_)
        record['dmrel'] = self.dmrel_

        for key, value in pairs_.items():
            if key in self._PLAIN_ATTRIBUTES:
                #NOTE word is used as word in deprel
                record[key] = self._unquote(value)
            elif key == "af":
                record['af'] = self.features(*self.morphFeatures(value))
            elif key == "chunkType":
                assert len(value.split(":", 1)) == 2  # no need to process trash! FIXME
                record['chunkType'], record['chunkId'] = self._unquote(value).split(":", 1)
            elif key == "drel":
                assert len(value.split(":", 1)) == 2  # no need to process trash! FIXME
                relation, parent = self._unquote(value).split(":", 1)
                assert relation and parent  # no need to process trash! FIXME
                record['drel'], record['parent'] = relation, parent
            elif key == "coref":
                bare = self._unquote(value)
                try:
                    record['corel'], record['coref'] = bare.split(":")
                except ValueError:
                    # Not exactly "relation:target": keep the whole value as
                    # the coref target and leave the relation empty.
                    record['corel'], record['coref'] = '', bare

        self.fs_order.append([self.maping[x] for x in pairs_ if x in self.maping][::-1])
        self.nodeList.append(self.node(**record))

    def FSPairs(self, FS):
        """Split the inside of a feature structure into attribute/value pairs.

        *FS* is split on whitespace; tokens without ``=`` are skipped.
        Each remaining token is split on its first ``=`` so that values may
        themselves contain ``=``.  Values keep their quotes.  An attribute
        named ``dmrel`` is stored under ``drel`` and sets ``self.dmrel_``
        to True; the flag is reset to False at the start of every call.

        Returns an OrderedDict in order of appearance (later duplicates
        overwrite earlier ones).
        """
        feats = OrderedDict()
        self.dmrel_ = False
        for feat in FS.split():
            if "=" not in feat:
                continue
            # Collapse repeated opening quotes after af= into a single one.
            feat = self._AF_OPENING.sub("af='", feat)
            attribute, value = feat.split("=", 1)
            if attribute == "dmrel":
                self.dmrel_ = True
                attribute = "drel"
            feats[attribute] = value

        return feats

    def getAnnotations(self):
        """Parse every line of ``self.sentence`` into ``nodeList``.

        Lines are tab-separated.  A line whose first column is all digits
        must have exactly four columns; a line whose first column is digits
        with dots must have a non-blank second and third column.  For both,
        the fourth column minus its first four and last one characters
        (presumably the ``<fs `` prefix and ``>`` suffix -- confirm against
        the corpus) is handed to FSPairs().  Any other line becomes an empty
        node whose word form is ``))``.

        Returns self.
        Raises ValueError when a line contains no tab, AssertionError when a
        line fails the checks above.
        """
        for line in self.sentence.split("\n"):
            if '\t' not in line:
                raise ValueError('Corrupted ssf: Tabs broken into spaces')
            columns = line.split('\t')
            first = columns[0]
            if first.isdigit():
                assert len(columns) == 4  # no need to process trash! FIXME
                id_, oBraces_, Tag_ = columns[:3]
                attributeValue_pairs = self.FSPairs(columns[3][4:-1])
                self.buildNode(id_, oBraces_, Tag_, attributeValue_pairs)
            elif first.replace(".", '').isdigit():
                id_, wordForm_, Tag_ = columns[:3]
                attributeValue_pairs = self.FSPairs(columns[3][4:-1])
                assert wordForm_.strip() and Tag_.strip()  # no need to process trash! FIXME
                self.buildNode(id_, wordForm_, Tag_, attributeValue_pairs)
            else:
                # No feature structure on this line: the node keeps whatever
                # dmrel flag the previous FSPairs() call left behind.
                self.buildNode('', '))', '', {})

        return self