#!/usr/bin/python
# Author: Himanshu Sharma
# changes added by Pruthwik Mishra
"""Object model for sentences made of chunks and tokens with feature structures.

A ``Document`` holds ``Sentence`` objects, a ``Sentence`` holds ``ChunkNode``
and ``Node`` objects, and every level can be printed back through
``printSSFValue``.  Each object keeps a reference to its container in
``upper`` and to its children in ``nodeList``.
"""
import os
# import sys
import codecs
import re
from collections import OrderedDict

# Matches one complete feature structure, e.g. <fs af='...' name='NP'>.
_FS_BLOCK_RE = re.compile(r'<fs.*?>')
# Matches a single key=value feature inside one whitespace-delimited field.
_FEAT_RE = re.compile(
    r'(?:\W*)(\S+)=([\'|\"])?([^ \t\n\r\f\v\'\"]*)[\'|\"]?(?:.*)')


def _parseFeats(lineList):
    """Return ``(featDict, fsList)`` for the whitespace-split fields of a line.

    ``featDict`` maps feature names to values in order of appearance;
    ``fsList`` holds every raw ``<fs ...>`` structure found on the line.
    """
    returnFeats = OrderedDict()
    fsList = _FS_BLOCK_RE.findall(' '.join(lineList))
    for field in lineList:
        feat = _FEAT_RE.findall(field)
        # Fields without '=' carry no feature; a field yielding several
        # matches is ambiguous and is skipped, as before.
        if len(feat) != 1:
            continue
        returnFeats[feat[0][0]] = feat[0][2]
    return returnFeats, fsList


def _renderAttributes(attributes):
    """Return the space-joinable parts of one ``<fs ...>`` structure.

    NOTE(review): the original statements were lost from the file (everything
    between '<' and '>' was stripped); this rebuilds the key='value' form that
    getTokenFeats/getChunkFeats parse -- confirm against upstream.
    """
    fs = ['<fs']
    for key, value in attributes.items():
        fs.append(key + "='" + value + "'")
    fs[-1] = fs[-1] + '>'
    return fs


class Node():
    """One token line: index, lexical item, type and feature structure."""

    def __init__(self, text):
        """Parse ``text`` (a single token line) and fill in the attributes."""
        self.text = text
        self.lex = None
        self.type = None
        self.__attributes = OrderedDict()
        self.errors = []
        self.name = None
        self.parent = None
        self.parentRelation = None
        self.alignedTo = None
        self.fsList = None
        # Set by Sentence.analyzeSentence; initialised so it always exists.
        self.upper = None
        # Only ever flipped to True in analyzeNode; initialised so that
        # reading it never raises AttributeError.
        self.probSent = False
        self.analyzeNode(self.text)

    def analyzeNode(self, text):
        """Split ``text`` into fields and store token, type and features."""
        token, tokenType, fsDict, fsList = getTokenFeats(text.strip().split())
        status = self.updateAttributes(token, tokenType, fsDict, fsList)
        if status == 0:
            self.errors.append("Can't update attributes for node")
            self.probSent = True

    def updateAttributes(self, token, tokenType, fsDict, fsList):
        """Store the parsed values on the node; return 1 on success."""
        self.fsList = fsList
        self.lex = token
        self.type = tokenType
        for attribute in fsDict:
            self.__attributes[attribute] = fsDict[attribute]
        self.assignName()
        # Explicit status, in line with Sentence.updateAttributes (0/1).
        return 1

    def assignName(self):
        """Copy the 'name' feature to ``self.name`` or record an error."""
        if self.__attributes.get('name') is not None:
            self.name = self.getAttribute('name')
        else:
            self.errors.append('No name for this token Node')

    def printValue(self):
        """Return the lexical item of the token."""
        return self.lex

    def printSSFValue(self, prefix, allFeat):
        """Return a one-element list holding the printed token line.

        ``prefix`` is the index column.  With ``allFeat`` False the feature
        structure is rebuilt from the current attributes; otherwise the raw
        structures captured at parse time are joined with '|'.
        """
        returnValue = [prefix, self.printValue(), self.type]
        if allFeat is False:
            fs = _renderAttributes(self.__attributes)
            delim = ' '
        else:
            fs = self.fsList
            delim = '|'
        return ['\t'.join(returnValue) + '\t' + delim.join(fs)]

    def getAttribute(self, key):
        """Return the value of feature ``key`` or None when absent."""
        if key in self.__attributes:
            return self.__attributes[key]
        return None

    def addAttribute(self, key, value):
        """Set feature ``key`` to ``value``."""
        self.__attributes[key] = value

    def deleteAttribute(self, key):
        """Remove feature ``key``; raises KeyError when it is absent."""
        del self.__attributes[key]


class ChunkNode():
    """A bracketed chunk: header line, child nodes and a closing line."""

    def __init__(self, header):
        """Create an open chunk from its ``header`` line (not parsed yet)."""
        self.text = []
        self.header = header
        self.footer = None
        self.nodeList = []
        self.parent = '0'
        self.__attributes = OrderedDict()
        self.parentRelation = 'root'
        self.name = None
        self.head = None
        self.isParent = False
        self.errors = []
        self.upper = None
        self.updateDrel()
        self.type = None
        self.fsList = None

    def analyzeChunk(self):
        """Parse the header once the chunk is closed and freeze ``text``."""
        chunkType, chunkFeatDict, chunkFSList = getChunkFeats(self.header)
        self.fsList = chunkFSList
        self.type = chunkType
        self.updateAttributes(chunkFeatDict)
        self.text = '\n'.join(self.text)

    def updateAttributes(self, fsDict):
        """Store the header features, then derive name and parent link."""
        for attribute in fsDict:
            self.__attributes[attribute] = fsDict[attribute]
        self.assignName()
        self.updateDrel()

    def assignName(self):
        """Copy the 'name' feature to ``self.name`` or record an error."""
        if 'name' in self.__attributes:
            self.name = self.getAttribute('name')
        else:
            self.errors.append('No name for this chunk Node')

    def updateDrel(self):
        """Set ``parentRelation``/``parent`` from 'drel' (else 'dmrel').

        The value must have the form ``relation:parent``; anything else
        leaves the defaults untouched.  Only the first key present is used.
        """
        for key in ('drel', 'dmrel'):
            if key not in self.__attributes:
                continue
            drelList = self.getAttribute(key).split(':')
            if len(drelList) == 2:
                self.parentRelation, self.parent = drelList
            return

    def printValue(self):
        """Return the space-joined values of the child nodes."""
        return ' '.join(node.printValue() for node in self.nodeList)

    def printSSFValue(self, prefix, allFeat):
        """Return the printed lines of the chunk and everything inside it.

        Children are numbered ``prefix.1``, ``prefix.2``, ... in order.
        ``allFeat`` has the same meaning as in ``Node.printSSFValue``.
        """
        returnValue = [prefix, '((', self.type]
        if allFeat is False:
            fs = _renderAttributes(self.__attributes)
            delim = ' '
        else:
            fs = self.fsList
            delim = '|'
        returnStringList = ['\t'.join(returnValue) + '\t' + delim.join(fs)]
        for nodePosn, node in enumerate(self.nodeList, 1):
            # Nested chunks and tokens expose the same printSSFValue API.
            returnStringList.extend(
                node.printSSFValue(prefix + '.' + str(nodePosn), allFeat))
        returnStringList.append('\t' + '))')
        return returnStringList

    def getAttribute(self, key):
        """Return the value of feature ``key`` or None when absent."""
        if key in self.__attributes:
            return self.__attributes[key]
        return None

    def addAttribute(self, key, value):
        """Set feature ``key`` to ``value``."""
        self.__attributes[key] = value

    def deleteAttribute(self, key):
        """Remove feature ``key``; raises KeyError when it is absent."""
        del self.__attributes[key]


class Sentence():
    """A sentence body parsed into a list of chunks and tokens."""

    def __init__(self, sentence, ignoreErrors=True, nesting=True,
                 dummySentence=False):
        """Store ``sentence`` as text and parse it unless ``dummySentence``."""
        self.ignoreErrors = ignoreErrors
        self.nesting = nesting
        self.sentence = None
        self.sentenceID = None
        self.sentenceType = None
        self.length = 0
        self.tree = None
        self.nodeList = []
        self.edges = {}
        self.nodes = {}
        self.tokenNodes = {}
        self.rootNode = None
        self.fileName = None
        self.comment = None
        self.probSent = False
        self.errors = []
        self.text = sentence
        self.dummySentence = dummySentence
        # Assigned by Document.analyzeDocument; initialised so they exist.
        self.header = None
        self.footer = None
        self.upper = None
        if self.dummySentence is False:
            # NOTE: analyzeSentence runs with its own defaults, exactly as
            # before; the constructor flags are stored but not forwarded.
            self.analyzeSentence()

    def analyzeSentence(self, ignoreErrors=False, nesting=True):
        """Build the chunk/token tree from ``self.text``.

        Lines whose second field is '((' open a chunk, lines starting with
        '))' close the innermost open chunk, every other non-empty line is a
        token.  Unless ``ignoreErrors`` is true, lines starting with '<' are
        recorded as errors and skipped.  A '))' with no open chunk is recorded
        as an error instead of raising.
        """
        lastContext = self
        for line in self.text.split('\n'):
            stripLine = line.strip()
            if stripLine == "":
                continue
            if stripLine[0] == "<" and ignoreErrors is False:
                self.errors.append('Encountered a line starting with "<"')
                self.probSent = True
                continue
            splitLine = stripLine.split()
            if len(splitLine) > 1 and splitLine[1] == '((':
                chunk = ChunkNode(line + '\n')
                chunk.upper = lastContext
                lastContext.nodeList.append(chunk)
                if isinstance(lastContext, ChunkNode):
                    # Nested chunk: the enclosing chunk keeps the raw header.
                    lastContext.text.append(line)
                lastContext = chunk
            elif splitLine[0] == '))':
                if lastContext is self:
                    self.errors.append(
                        'Encountered "))" without an open chunk')
                    self.probSent = True
                    continue
                lastContext.footer = line + '\n'
                lastContext.analyzeChunk()
                lastContext = lastContext.upper
            else:
                currentNode = Node(line + '\n')
                lastContext.nodeList.append(currentNode)
                currentNode.upper = lastContext
        # NOTE: updateAttributes() is deliberately not called here (the call
        # was commented out in the original); callers run it when they need
        # nodes/edges/sentence populated.

    def addEdge(self, parent, child):
        """Record ``child`` under ``parent`` in ``self.edges`` (no duplicates)."""
        children = self.edges.setdefault(parent, [])
        if child not in children:
            children.append(child)

    def updateAttributes(self):
        """Populate nodes, edges and the surface sentence; return 1 or 0."""
        populateNodesStatus = self.populateNodes()
        populateEdgesStatus = self.populateEdges()
        self.sentence = self.generateSentence()
        if populateEdgesStatus == 0 or populateNodesStatus == 0:
            return 0
        return 1

    def printSSFValue(self, allFeat):
        """Return the whole sentence as one string, header and footer included.

        NOTE(review): the header/footer literals were stripped from the file
        (only "" and '\\n' survived); they are restored to the
        <Sentence id="N"> ... </Sentence> form -- confirm against upstream.
        """
        returnStringList = ['<Sentence id="' + str(self.sentenceID) + '">']
        for nodePosn, node in enumerate(self.nodeList, 1):
            returnStringList.extend(
                node.printSSFValue(str(nodePosn), allFeat))
        returnStringList.append('</Sentence>\n')
        return '\n'.join(returnStringList)

    def populateNodes(self, naming='strict'):
        """Index top-level nodes by name in ``self.nodes``; return 1.

        Only ``naming == 'strict'`` does anything; it asserts that every
        top-level node has a name.
        """
        if naming == 'strict':
            for nodeElement in self.nodeList:
                assert nodeElement.name is not None
                self.nodes[nodeElement.name] = nodeElement
        return 1

    def populateEdges(self):
        """Fill ``self.edges`` and ``self.rootNode``; return 0 on a bad parent.

        A node whose parent is '0' becomes the root.  Processing stops at the
        first node whose parent name is not in ``self.nodes``.
        """
        for node in self.nodeList:
            if node.parent == '0' or node == self.rootNode:
                self.rootNode = node
                continue
            if node.parent not in self.nodes:
                return 0
            self.addEdge(node.parent, node.name)
        return 1

    def generateSentence(self):
        """Return the space-joined values of the top-level nodes."""
        return ' '.join(node.printValue() for node in self.nodeList)


class Document():
    """All sentences read from one file."""

    def __init__(self, fileName):
        """Read ``fileName`` and parse every sentence found in it."""
        self.header = None
        self.footer = None
        self.text = None
        self.nodeList = []
        self.fileName = fileName
        self.upper = None
        self.analyzeDocument()

    def analyzeDocument(self):
        """Parse the file into ``Sentence`` objects stored in ``nodeList``.

        ``findSentences`` is not defined in the visible part of this module;
        it is expected to yield (id, text, footer) triples, as the indexing
        below shows.
        """
        # 'with' guarantees the handle is closed even if parsing raises.
        with codecs.open(self.fileName, 'r', encoding='utf8') as inputFD:
            sentenceList = findSentences(inputFD)
        for sentence in sentenceList:
            tree = Sentence(sentence[1], ignoreErrors=True, nesting=True)
            tree.text = sentence[1]
            tree.sentenceID = int(sentence[0])
            tree.footer = sentence[2]
            # NOTE(review): everything from this literal to the middle of the
            # getAddressNode docstring was stripped from the file; the three
            # statements below are reconstructed -- confirm against upstream.
            tree.header = '<Sentence id="' + str(sentence[0]) + '">'
            tree.upper = self
            self.nodeList.append(tree)


def getAddressNode(address, node, level='ChunkNode'):
    '''Return the node that ``address`` refers to, starting from ``node``.

    ``address`` is a '%'-separated path; '..' moves to the container, any
    other step selects the child of the current context with that name.
    ``level`` names the class used as the address base.  The hierarchy of
    levels is:
    "Document" -> "Sentence" -> "ChunkNode" -> "Node"
    "Relative" value starts the base address from the node which contains
    the address.

    NOTE(review): the ``def`` line and the start of the docstring were lost
    from the file; the signature (including the default ``level``) and the
    nesting of the second ``.upper`` step are reconstructed -- confirm
    against upstream.

    Raises IndexError when a named step does not exist.  Returns None when
    the address contains only '..' steps.
    '''
    currentContext = node
    if level != 'Relative':
        while currentContext.__class__.__name__ != level:
            currentContext = currentContext.upper
    currentContext = currentContext.upper
    refNode = None
    for step in address.split('%'):
        if step == '..':
            currentContext = currentContext.upper
        else:
            refNode = [
                iterNode for iterNode in currentContext.nodeList
                if iterNode.name == step][0]
            currentContext = refNode
    return refNode


def getChunkFeats(line):
    """Parse a chunk header line.

    Returns ``[chunkType, featDict, fsList]``.  With fewer than three fields
    the result is ``[None, {}, []]``.
    """
    lineList = line.strip().split()
    chunkType = None
    returnFeats = OrderedDict()
    fsList = []
    if len(lineList) >= 3:
        chunkType = lineList[2]
        returnFeats, fsList = _parseFeats(lineList)
    return [chunkType, returnFeats, fsList]


def getTokenFeats(lineList):
    """Parse the whitespace-split fields of a token line.

    Returns ``[token, tokenType, featDict, fsList]``.  With fewer than three
    fields the result is ``[None, None, {}, []]``.
    """
    tokenType, token = None, None
    returnFeats = OrderedDict()
    fsList = []
    if len(lineList) >= 3:
        tokenType = lineList[2]
        token = lineList[1]
        returnFeats, fsList = _parseFeats(lineList)
    return [token, tokenType, returnFeats, fsList]


def getSentenceIter(inpFD):
    """Return an iterator of regex matches, one per sentence in ``inpFD``.

    NOTE(review): the pattern's group names were stripped and the rest of the
    function is cut off in the reviewed copy.  'header', 'sentenceID', 'text'
    and 'footer' are the names used by the commented-out code in
    Sentence.__init__; the outer group name and the read/finditer body are
    reconstructed -- confirm against upstream.
    """
    sentenceRE = (
        r'''(?P<complete>(?P<header><Sentence id=[\'\"]?'''
        r'''(?P<sentenceID>\d+)[\'\"]?>)(?P<text>.*?)'''
        r'''(?P<footer></Sentence>))''')
    # DOTALL lets the lazy 'text' group span the lines of a sentence.
    return re.finditer(sentenceRE, inpFD.read(), re.DOTALL)