#!/usr/bin/env python
"""TCP server that annotates tab-separated token lines with morph analyses.

At start-up the script loads five resource files (lexicon dictionary,
paradigm table, suffix table, feature table, exception words).  Each client
connection sends one buffer of text made of lines shaped like
``<id> TAB <word> [TAB <third column>]``; the server answers with the same
lines plus a fourth, tab-separated column holding the analysis string.

NOTE(review): the original file had lost its line breaks and indentation.
Block nesting was reconstructed from the statement order; the places where
more than one reading was possible are marked with NOTE(review) below.
"""

import codecs
import io
import os
import re
import socket
import string
import sys
import threading
from argparse import ArgumentParser

try:
    # Python 2 only module.  Not used any more (io.StringIO replaces it) but
    # still imported so the name stays available where it exists.
    import StringIO  # noqa: F401
except ImportError:
    pass

_MAX_BUFFER_SIZE_ = 102400  # 100KB

# Resources shared by the analysis functions.  They are filled in by main()
# and only read afterwards.
dict_map = {}
suff_map = {}
except_map = {}
pdgmlines = "\n"
featurelines = []

# Word-final consonant followed by "a": the variation drops the "a".
_FINAL_A_RE = re.compile(r"(.*)([kctwpKCTWPgjdxbGJDXBfFNnmyrlvsSRh])a$")
# Vowel followed by a final e/i/I: the variation inserts "y" between them.
_Y_INSERT_RE = re.compile(r"(.*)([aeiouAEIOU])([eiI])$")
# "M" after one of E/e/o/O/i/I: the variation turns that "M" into "z".
_VOWEL_M_RE = re.compile(r"(.*)([EeoOiI])M(.*)")

# (pattern, replacement) pairs: the single letter in front of the captured
# consonant is swapped for `replacement`.  The order is the order in which
# the original code appended the variations.
_NASAL_RULES = (
    (re.compile(r"(.*)M([kKgG])(.*)"), "f"),
    (re.compile(r"(.*)M([cCjJ])(.*)"), "F"),
    (re.compile(r"(.*)M([tTdD])(.*)"), "N"),
    (re.compile(r"(.*)M([wWxX])(.*)"), "n"),
    (re.compile(r"(.*)M([pPbB])(.*)"), "m"),
    (re.compile(r"(.*)f([kKgG])(.*)"), "M"),
    (re.compile(r"(.*)F([cCjJ])(.*)"), "M"),
    (re.compile(r"(.*)N([tTdD])(.*)"), "M"),
    (re.compile(r"(.*)n([wWxX])(.*)"), "M"),
    (re.compile(r"(.*)m([pPbB])(.*)"), "M"),
)

# Optional feature keys, in the order they appear inside af='...'
# (after root and cat).
_AF_OPTIONAL_KEYS = ("gender", "number", "person", "case", "cm", "tam")


def _readLines(path):
    """Return the lines of the UTF-8 file *path* (line ends kept)."""
    with codecs.open(path, "rb", "utf-8") as handle:
        return handle.readlines()


def _loadDictionary(path):
    """Load the lexicon: ``{word: ["<paradigm>____<category>", ...]}``.

    Each line is ``word,<entries>`` where entries are separated by ``#`` and
    the 2nd and 3rd comma-separated fields of every entry are kept.
    Blank lines are skipped (they used to raise IndexError).
    """
    table = {}
    for line in _readLines(path):
        line = line.strip()
        if not line:
            continue
        fields = line.split(",")
        key = fields[0]
        for entry in ",".join(fields[1:]).split("#"):
            parts = entry.split(",")
            value = parts[1] + "____" + parts[2]
            table.setdefault(key, []).append(value)
    return table


def _loadSuffixTable(path):
    """Load the suffix table: ``{suffix: [rule, ...]}``.

    Lines are ``<key> TAB <value>``; the first and last character of both
    columns are dropped and the value is split on ``#``.
    """
    table = {}
    for line in _readLines(path):
        line = line.strip()
        if line:
            columns = line.split("\t")
            table[columns[0][1:-1]] = columns[1][1:-1].split("#")
    return table


def _loadExceptions(path):
    """Load the exception words: ``{word: feature string}``.

    Lines are ``word:value``; the stored value is ``"<" + value`` with every
    ``af=`` rewritten to ``fs af=``.
    """
    table = {}
    for line in _readLines(path):
        line = line.strip()
        if line:
            columns = line.split(":")
            value = "<" + columns[1]
            table[columns[0]] = value.replace("af=", "fs af=")
    return table


def _loadParadigms(path):
    """Return the whole paradigm file as one string ending in a newline.

    The suffix table stores character offsets into this string, so it is
    kept as a single text rather than a list of lines.
    """
    with codecs.open(path, "rb", "utf-8") as handle:
        return handle.read().strip() + "\n"


# function to get suffix and modified word
def getsuffix(word, length):
    """Split off the last *length* characters of *word*.

    Returns ``(remaining word, suffix)`` where the suffix is *reversed*.
    A length of 0 yields the untouched word and the literal suffix ``'0'``.
    """
    if length == 0:
        return word, '0'
    if length == len(word):
        return '', word[::-1]
    return word[:len(word) - length], word[-length:][::-1]


# function to search suffix in suffix table
def searchSuffixInSuffixTable(suff, suffdict):
    """Return the rule list stored for *suff*, or an empty list."""
    return suffdict.get(suff, [])


# function to search word in exception words list table
def searchExceptionDict(except_map, word):
    """Return ``(1, feature string)`` if *word* is an exception, else ``(0, "")``."""
    if word in except_map:
        return 1, except_map[word]
    return 0, ""


# function to search word in lexicon dictionary
def searchInDictionay(word, dbmdict):
    """Return the dictionary entries stored for *word*, or an empty list."""
    return dbmdict.get(word, [])


# heuristics to search in feature
def searchInFeaturesList(inputDict, featuresList):
    """Build the feature dictionary for one matched paradigm entry.

    inputDict    -- needs ``"offset"`` (1-based line number in *featuresList*)
                    and ``"word"`` (stored as ``root``).
    featuresList -- lines shaped ``<cat> <key> <value> <key> <value> ...``.

    Returns a dict with at least ``cat`` and ``root`` plus the key/value
    pairs of the line and the defaults applied below.
    """
    outputDict = {}
    tokens = featuresList[inputDict["offset"] - 1].strip().split(" ")
    outputDict["cat"] = tokens[0]
    outputDict["root"] = inputDict["word"]
    rest = tokens[1:]
    # Pair up key/value tokens; a trailing key without a value is ignored
    # (it used to raise IndexError).
    for key, value in zip(rest[0::2], rest[1::2]):
        outputDict[key] = value

    if outputDict["cat"] == 'n' and 'tam' not in outputDict:
        outputDict['tam'] = '0'
    if 'person1' in outputDict:
        outputDict['person'] = outputDict['person1']

    # NOTE(review): nesting reconstructed.  The second "else" of the original
    # is read as belonging to the category test, and the final tam/parsarg
    # fallback as running for every category; other readings leave dead code.
    if outputDict['cat'] == "v" or outputDict['cat'] == "pn":
        if 'tam' in outputDict:
            outputDict['cm'] = outputDict['tam']
        elif 'parsarg' in outputDict:
            outputDict['cm'] = outputDict['parsarg']
    else:
        if 'parsarg' in outputDict:
            outputDict['tam'] = outputDict['parsarg']
        if 'tam' in outputDict:
            outputDict['cm'] = outputDict['tam']
    if 'tam' not in outputDict and 'parsarg' in outputDict:
        outputDict['tam'] = outputDict['parsarg']

    if outputDict['cat'] == 'sh_P':
        outputDict['cat'] = 'pn'
    if outputDict['cat'] == 'sh_n':
        outputDict['cat'] = 'psp'
    if outputDict['cat'] == 'n' and 'person' not in outputDict:
        outputDict['person'] = '3'
    return outputDict


# function to build feature string
def buildFeatureString(featureDict):
    """Render *featureDict* as ``<fs af='root,cat,gender,number,person,case,cm,tam'>``.

    ``root`` and ``cat`` are required; the other six fields default to "".
    A person of ``2h`` / ``3h`` is reduced to its first character.

    The original computed the ``af='...'`` text in seven branches that were
    all identical, then threw it away and returned "".  The wrapper used here
    matches what the exception-dictionary loader produces (``<fs af=...``).
    NOTE(review): the branches on gen1/emph presumably added further
    attributes that are no longer recoverable from the source -- confirm.
    """
    values = [featureDict['root'], featureDict['cat']]
    values.extend(featureDict.get(key, "") for key in _AF_OPTIONAL_KEYS)
    person = values[4]
    if person == "2h" or person == "3h":
        values[4] = person[0]
    return "<fs af='" + ",".join(values) + "'>"


# main morph engine
def computeMorph(word, suff_map, dict_map, featurelines, pdgmText=None):
    """Analyse *word* by trying every split into stem + suffix.

    word         -- the token to analyse.
    suff_map     -- suffix table; each rule is ``add,offset,count`` where
                    ``offset`` is a character position in the paradigm text
                    and ``count`` the number of consecutive lines to read.
    dict_map     -- lexicon mapping roots to ``paradigm____category`` keys.
    featurelines -- feature lines, see searchInFeaturesList().
    pdgmText     -- paradigm text to read; defaults to the module-level
                    ``pdgmlines`` loaded at start-up.

    Returns ``(1, "fs1|fs2|...")`` when at least one analysis is found,
    otherwise ``(0, "")``.
    """
    if pdgmText is None:
        pdgmText = pdgmlines
    analyses = []
    for cut in range(len(word) + 1):
        stem, suffix = getsuffix(word, cut)
        for rule in searchSuffixInSuffixTable(suffix, suff_map):
            ruleFields = rule.split(",")
            cursor = int(ruleFields[1])
            entryCount = int(ruleFields[2])
            root = stem + ruleFields[0]
            candidates = searchInDictionay(root, dict_map)
            if not candidates:
                continue
            for _ in range(entryCount):
                # Search in place instead of slicing the tail of the text
                # for every entry.
                lineEnd = pdgmText.find("\n", cursor)
                if lineEnd == -1:
                    # Offset points past the last paradigm line.
                    break
                pdgmFields = pdgmText[cursor:lineEnd].split(",")
                cursor = lineEnd + 1
                paradigm = pdgmFields[0]
                category = pdgmFields[1]
                offset = int(pdgmFields[2])
                if paradigm + "____" + category in candidates:
                    match = {
                        'word': root,
                        'pdgm': paradigm,
                        'cat': category,
                        'offset': offset,
                    }
                    features = searchInFeaturesList(match, featurelines)
                    analyses.append(buildFeatureString(features))
    if analyses:
        return 1, "|".join(analyses)
    return 0, ""


def ispunct(ch):
    """Return True when *ch* occurs inside ``string.punctuation``.

    This is a substring test, so it is also true for the empty string and
    for runs such as ``"()"`` that appear verbatim in that constant.
    """
    return ch in string.punctuation


# function to get spelling variations
def getSpellVariations(word):
    """Return the list of spelling variants to try for *word*.

    Variants are appended in a fixed order and may contain duplicates.
    """
    variedList = []
    if "Z" in word:
        # replace Z with null
        variedList.append(word.replace("Z", ""))
    if "M" in word:
        # replace M with z
        variedList.append(word.replace("M", "z"))
    if "Q" in word:
        # replace Q with q
        variedList.append(word.replace("Q", "q"))
    # NOTE(review): read as unconditional (the original comment here was a
    # copy of the "Q" one); it might have been part of the "Q" branch.
    variedList.append(word + "H")

    found = _FINAL_A_RE.match(word)
    if found:
        variedList.append(found.group(1) + found.group(2))
    found = _Y_INSERT_RE.match(word)
    if found:
        variedList.append(found.group(1) + found.group(2) + "y" + found.group(3))
    found = _VOWEL_M_RE.match(word)
    if found:
        variedList.append(found.group(1) + found.group(2) + "z" + found.group(3))

    # Final M replaced with z.  The original tried to assign into the string
    # (and computed len(word - 1)), which raised TypeError for such words.
    if word.endswith("iyaM"):
        variedList.append(word[:-1] + "z")

    for pattern, replacement in _NASAL_RULES:
        found = pattern.match(word)
        if found:
            variedList.append(
                found.group(1) + replacement + found.group(2) + found.group(3)
            )
    return variedList


# function to compute Morph of the variation of the words
def computeMorphOfVariation(word, suff_map, dict_map, featurelines):
    """Run computeMorph() on every spelling variant of *word*.

    Returns ``(1, "out1|out2|...")`` with the outputs of the variants that
    were analysed successfully, or ``(0, "")`` when none was.
    """
    combinedOut = []
    for variant in getSpellVariations(word):
        status, morphOutput = computeMorph(variant, suff_map, dict_map, featurelines)
        if status:
            combinedOut.append(morphOutput)
    if combinedOut:
        return 1, "|".join(combinedOut)
    return 0, ""


def _analyseWord(word):
    """Analyse *word* directly, then via variants, then via the exceptions."""
    status, morphOutput = computeMorph(word, suff_map, dict_map, featurelines)
    if status:
        return morphOutput
    status, morphOutput = computeMorphOfVariation(
        word, suff_map, dict_map, featurelines
    )
    if status:
        return morphOutput
    status, morphOutput = searchExceptionDict(except_map, word)
    if status:
        return morphOutput
    # NOTE(review): unknown words get an empty analysis in the source;
    # presumably a placeholder feature structure was lost -- confirm.
    return ""


def processInput(inputF):
    """Annotate every line read from *inputF* and return the result text.

    inputF -- object with a ``readline()`` method that returns "" at the end.

    Each non-blank line is split on tabs; the second column is the word.  A
    missing third column is treated as "".  The output line is the first
    three columns plus the analysis, tab-separated.  Blank lines and lines
    with fewer than two columns produce no output (they used to reuse the
    previous line's columns or raise).  The joined text is stripped.
    """
    rows = []
    for rawLine in iter(inputF.readline, ""):
        line = rawLine.strip()
        if not line:
            continue
        columns = line.split("\t")
        if len(columns) < 2:
            continue
        if len(columns) == 2:
            # The original assigned to index 2 here, which raised IndexError.
            columns.append("")
        word = columns[1]
        if ispunct(word) or word.isdigit():
            # NOTE(review): punctuation (",", "/" and others) and digits all
            # map to an empty analysis in the source; the separate branches
            # suggest distinct values were lost -- confirm.
            morphOutput = ""
        else:
            morphOutput = _analyseWord(word)
        rows.append(
            columns[0] + "\t" + columns[1] + "\t" + columns[2] + "\t"
            + morphOutput + "\n"
        )
    return "".join(rows).strip()


class ClientThread(threading.Thread):
    """Serve one client: read one buffer, analyse it, send the answer."""

    def __init__(self, ip, port, clientsocket):
        threading.Thread.__init__(self)
        self.ip = ip
        self.port = port
        self.csocket = clientsocket

    def run(self):
        """Handle the request; the socket is closed even if analysis fails."""
        try:
            # A single recv() is kept from the original: requests larger than
            # the buffer, or split by the network, are only partly read.
            data = self.csocket.recv(_MAX_BUFFER_SIZE_)
            fakeFile = io.StringIO(data.decode("utf-8", "replace"))
            try:
                reply = processInput(fakeFile)
            finally:
                fakeFile.close()
            # sendall() keeps writing until the whole reply is out; send()
            # may stop after a partial write.
            self.csocket.sendall(reply.encode("utf-8"))
        finally:
            self.csocket.close()


def _parseArguments(argv=None):
    """Parse the command line (all six options are required)."""
    # configuring commandline parser
    parser = ArgumentParser()
    parser.add_argument("-d", "--dictFile", help="lexicon dictionary file (with path)", required=True)
    parser.add_argument('-p', '--paradigmFile', help="paradigm file (with path)", required=True)
    parser.add_argument("-s", "--suffixFile", help="suffix info file (with path)", required=True)
    parser.add_argument('-f', '--featureFile', help="features file (with path)", required=True)
    parser.add_argument('-e', '--exceptionFile', help="exception words file (with path)", required=True)
    # type=int replaces the eval() the original applied to this string.
    parser.add_argument('-port', '--port', help="port", required=True, type=int)
    return parser.parse_args(argv)


def _requireFiles(args):
    """Exit with status 1 if one of the resource files does not exist."""
    # argparse already rejects missing options (required=True), so only the
    # existence of the files is left to check.
    checks = (
        (" Lexicon dictFile ", args.dictFile),
        (" Paradigm file ", args.paradigmFile),
        (" suffix file ", args.suffixFile),
        (" feature file ", args.featureFile),
        (" exception file ", args.exceptionFile),
    )
    for label, path in checks:
        if not os.path.isfile(path):
            print("%s %s %s" % (label, path, "does not exist."))
            sys.exit(1)


def _serve(port, host="0.0.0.0"):
    """Accept connections on *host*:*port* forever, one thread per client."""
    tcpsock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    tcpsock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    tcpsock.bind((host, port))
    tcpsock.listen(4)
    while True:
        clientsock, (ip, clientPort) = tcpsock.accept()
        # pass clientsock to the ClientThread thread object being created
        ClientThread(ip, clientPort, clientsock).start()


def main(argv=None):
    """Load the resources named on the command line and start the server."""
    global dict_map, suff_map, except_map, pdgmlines, featurelines
    args = _parseArguments(argv)
    _requireFiles(args)
    dict_map = _loadDictionary(args.dictFile)
    suff_map = _loadSuffixTable(args.suffixFile)
    except_map = _loadExceptions(args.exceptionFile)
    pdgmlines = _loadParadigms(args.paradigmFile)
    featurelines = _readLines(args.featureFile)
    _serve(args.port)  # listens on all interfaces


if __name__ == "__main__":
    main()