# morph-socket-server.py

#!/usr/bin/env python

import codecs, sys, string, re, os
from argparse import ArgumentParser
import socket, threading
import StringIO

_MAX_BUFFER_SIZE_ = 102400 #100KB

# Configure the command-line parser. Every option is declared required=True,
# so argparse itself rejects a missing option before parse_args() returns;
# no separate "is None" checks are needed afterwards.
parser=ArgumentParser()
parser.add_argument("-d", "--dictFile", help="lexicon dictionary file (with path)", required=True)
parser.add_argument('-p', '--paradigmFile', help="paradigm file (with path)", required=True)
parser.add_argument("-s", "--suffixFile", help="suffix info file (with path)", required=True)
parser.add_argument('-f', '--featureFile', help="features file (with path)", required=True)
parser.add_argument('-e', '--exceptionFile', help="exception words file (with path)", required=True)
parser.add_argument('-port', '--port', help="port", required=True)
args = parser.parse_args()

# Expose the parsed options under the module-level names used by the rest of
# the script. `port` is still the raw command-line string at this point.
dictFile = args.dictFile
paradigmFile = args.paradigmFile
suffixFile = args.suffixFile
featureFile = args.featureFile
exceptionFile = args.exceptionFile
port = args.port

# Every resource option must name an existing regular file. parser.error()
# writes the usage line plus the message to stderr and exits with status 2,
# so a failed start-up is no longer reported as success (exit status 0).
# The message is one formatted string so it prints the same on Python 2 and 3.
for _label, _path in (
    ("Lexicon dictionary file", dictFile),
    ("Paradigm file", paradigmFile),
    ("Suffix file", suffixFile),
    ("Feature file", featureFile),
    ("Exception file", exceptionFile),
):
    if not os.path.isfile(_path):
        parser.error("%s '%s' does not exist." % (_label, _path))


# reading dictionary
dict_file = dictFile
with codecs.open(dict_file, "rb", "utf-8") as f:
    lines = f.readlines()

# dict_map maps the text before the first comma of a line to a list of
# "<field 2>____<field 3>" strings, one per "#"-separated entry on that line.
# computeMorph compares these strings against "<paradigm>____<category>".
dict_map = {}
for line in lines:
    line = line.strip()
    if not line:
        # Skip blank lines, as the suffix and exception loaders do; without
        # this guard a blank line raises IndexError during start-up.
        continue

    # Split once at the first comma: key on the left, all entries on the right.
    dict_key, _, entries = line.partition(",")

    for lineelement in entries.split("#"):
        lineelementarray = lineelement.split(",")
        dict_value = lineelementarray[1] + "____" + lineelementarray[2]
        dict_map.setdefault(dict_key, []).append(dict_value)


# reading suffix file
suff_file = suffixFile
with codecs.open(suff_file, "rb", "utf-8") as f:
    lines = f.readlines()

# Each non-blank line has tab-separated columns. The first and last character
# of column 1 and of column 2 are dropped; column 1 becomes the key and
# column 2, split on "#", becomes the list of entries stored under it.
suff_map = {}
for rawLine in lines:
    record = rawLine.strip()
    if not record:
        continue
    columns = record.split("\t")
    suff_map[columns[0][1:-1]] = columns[1][1:-1].split("#")


# reading exceptions dictionary file
except_file = exceptionFile
with codecs.open(except_file, "rb", "utf-8") as f:
    lines = f.readlines()

# Each non-blank line is split on ":". The first part is the key; the second
# part is prefixed with "<" and every "af=" in it is rewritten to "fs af=".
except_map = {}
for rawLine in lines:
    record = rawLine.strip()
    if not record:
        continue
    parts = record.split(":")
    except_map[parts[0]] = ("<" + parts[1]).replace("af=", "fs af=")

	
#print (suff_map["0"])
# reading paradigms dictionary
# The whole file is kept as one string (surrounding whitespace stripped, a
# single trailing newline added); computeMorph indexes into it by offset.
pdgm_file = paradigmFile
with codecs.open(pdgm_file, "rb", "utf-8") as f:
    pdgmlines = f.read()
pdgmlines = pdgmlines.strip() + "\n"


# reading features file
# Kept as a list of lines; searchInFeaturesList picks a line by 1-based offset.
ftrfile = featureFile
with codecs.open(ftrfile, "rb", "utf-8") as f:
    featurelines = f.readlines()


# function to get suffix and modified word
def getsuffix(word, length):
    """Split off the last `length` characters of `word`.

    Returns a (stem, suffix) pair where the suffix is returned reversed.
    With length 0 the stem is the whole word and the suffix is the
    placeholder string '0'.
    """
    if length == 0:
        return word, '0'
    tail = word[-length:]
    stem = word[:len(word) - length]
    return stem, tail[::-1]

# function to search suffix in suffix table
def searchSuffixInSuffixTable(suff, suffdict):
    """Return the entries stored for `suff` in `suffdict`, or [] if absent."""
    return suffdict.get(suff, [])

# function to search word in exception words list table
def searchExceptionDict(except_map, word):
    """Look `word` up in `except_map`.

    Returns (1, stored value) when the word is present, otherwise (0, "").
    """
    if word not in except_map:
        return 0, ""
    return 1, except_map[word]

# function to search word in lexicon dictionary
def searchInDictionay(word, dbmdict):
    """Return the entry list stored for `word` in `dbmdict`, or [] if absent."""
    return dbmdict[word] if word in dbmdict else []

# heuristics to search in feature
def searchInFeaturesList(inputDict, featuresList):
    """Build the feature dictionary for one matched paradigm entry.

    inputDict must provide "offset" (1-based index into featuresList) and
    "word" (stored as the "root" of the result). The selected feature line
    is split on single spaces: the first token is the category ("cat") and
    the remaining tokens are read as alternating key/value pairs. A trailing
    key without a value is ignored rather than raising IndexError.

    After loading the pairs, the 'tam', 'cm' and 'person' defaults are
    derived and the categories 'sh_P' / 'sh_n' are mapped to 'pn' / 'psp'.

    Returns the resulting dict.
    """
    featureLine = featuresList[inputDict["offset"] - 1].strip()
    tokens = featureLine.split(" ")

    outputDict = {}
    outputDict["cat"] = tokens[0]
    outputDict["root"] = inputDict["word"]

    # zip() pairs keys with values and stops at the shorter slice, so an
    # unpaired last token is dropped. This also avoids the Python-2-only xrange.
    attributes = tokens[1:]
    for key, value in zip(attributes[0::2], attributes[1::2]):
        outputDict[key] = value

    if outputDict["cat"] == 'n' and 'tam' not in outputDict:
        outputDict['tam'] = '0'

    if 'person1' in outputDict:
        outputDict['person'] = outputDict['person1']

    if outputDict['cat'] in ("v", "pn"):
        # For these categories 'cm' copies 'tam', falling back to 'parsarg'.
        if 'tam' in outputDict:
            outputDict['cm'] = outputDict['tam']
        elif 'parsarg' in outputDict:
            outputDict['cm'] = outputDict['parsarg']
    else:
        # For every other category 'parsarg' overrides 'tam' before the copy.
        if 'parsarg' in outputDict:
            outputDict['tam'] = outputDict['parsarg']
        if 'tam' in outputDict:
            outputDict['cm'] = outputDict['tam']

    # 'tam' falls back to 'parsarg' when it is still unset.
    if 'tam' not in outputDict and 'parsarg' in outputDict:
        outputDict['tam'] = outputDict['parsarg']

    if outputDict['cat'] == 'sh_P':
        outputDict['cat'] = 'pn'
    if outputDict['cat'] == 'sh_n':
        outputDict['cat'] = 'psp'
    if outputDict['cat'] == 'n' and 'person' not in outputDict:
        outputDict['person'] = '3'
    return outputDict

# function to build feature string
def buildFeatureString(featureDict):
    """Render a feature dictionary as an "<fs af='...' ...>" string.

    'root' and 'cat' are mandatory keys; 'gender', 'number', 'person',
    'case', 'cm', 'tam', 'emph', 'gen1', 'num1' and 'cas1' default to "".
    The af value is the comma-joined list
    root,cat,gender,number,person,case,cm,tam.

    A person of "2h"/"3h" is reduced to its first character and adds
    hon='y'. A gen1 of 'm'/'f' adds the agr_gen/agr_num/agr_cas attributes.
    An emph of 'y' adds emph='y', except when both the honorific and the
    agreement attributes are emitted.
    """
    root = featureDict['root']
    cat = featureDict['cat']
    gender, number, person, case, cm, tam, emph, gen1, num1, cas1 = [
        featureDict.get(key, "")
        for key in ('gender', 'number', 'person', 'case', 'cm',
                    'tam', 'emph', 'gen1', 'num1', 'cas1')
    ]

    honorific = person in ("2h", "3h")
    agrees = gen1 in ('m', 'f')
    emphatic = emph == 'y'

    if honorific:
        person = person[0]

    pieces = ["<fs af='", ",".join((root, cat, gender, number, person, case, cm, tam)), "'"]

    if honorific:
        if agrees:
            pieces.append(" agr_gen='" + gen1 + "' agr_num='" + num1 + "' agr_cas='" + cas1 + "'")
        elif emphatic:
            pieces.append(" emph='" + emph + "'")
        pieces.append(" hon='y'")
    else:
        if emphatic:
            pieces.append(" emph='" + emph + "'")
        if agrees:
            pieces.append(" agr_gen='" + gen1 + "' agr_num='" + num1 + "' agr_cas='" + cas1 + "'")

    pieces.append(">")
    return "".join(pieces)

# main morph engine 
def computeMorph(word, suff_map, dict_map, featurelines):
    """Analyse `word` by trying every split into stem + suffix.

    For each suffix length from 0 to len(word), the suffix (as returned by
    getsuffix) is looked up in `suff_map`. Each suffix-table entry is a
    comma-separated triple "add,pointer,count": `add` is appended to the
    stem to form a candidate root, `pointer` is a character offset into the
    module-level paradigm text `pdgmlines`, and `count` is the number of
    consecutive paradigm lines to read from that offset. A paradigm line
    "pdgm,cat,offset" produces an analysis when "pdgm____cat" is listed for
    the candidate root in `dict_map`; `offset` then selects the line of
    `featurelines` used to build the feature structure.

    Returns (1, analyses joined with "|") if at least one analysis was
    found, otherwise (0, "").
    """
    morphOutput = []
    for i in range(0, len(word) + 1):
        modifiedword, suff = getsuffix(word, i)

        # An unknown suffix yields an empty list, so the loop body is skipped.
        for suffTableElement in searchSuffixInSuffixTable(suff, suff_map):
            suffTableArray = suffTableElement.split(",")
            add = suffTableArray[0]
            pointerToTheTable = int(suffTableArray[1])
            noOfEntries = int(suffTableArray[2])

            updatedroot = modifiedword + add

            dictEntries = searchInDictionay(updatedroot, dict_map)
            if not dictEntries:
                continue

            for j in range(noOfEntries):
                # Search from the pointer in place; slicing the tail of the
                # paradigm text first would copy it once per entry.
                lineEnd = pdgmlines.find("\n", pointerToTheTable)
                if lineEnd == -1:
                    # The pointer ran past the end of the paradigm table:
                    # there is nothing further to read for this entry.
                    break

                pdgmEntryArray = pdgmlines[pointerToTheTable:lineEnd].split(",")
                pdgmPdgmTable = pdgmEntryArray[0]
                catPdgmTable = pdgmEntryArray[1]
                offsetPdgmTable = int(pdgmEntryArray[2])

                # Advance to the first character of the next paradigm line.
                pointerToTheTable = lineEnd + 1

                if pdgmPdgmTable + "____" + catPdgmTable in dictEntries:
                    tempDict = {
                        'word': updatedroot,
                        'pdgm': pdgmPdgmTable,
                        'cat': catPdgmTable,
                        'offset': offsetPdgmTable,
                    }
                    fetchedFeatures = searchInFeaturesList(tempDict, featurelines)
                    morphOutput.append(buildFeatureString(fetchedFeatures))

    if morphOutput:
        return 1, "|".join(morphOutput)
    return 0, ""

def ispunct(ch):
    """Return True when `ch` occurs as a substring of string.punctuation."""
    return string.punctuation.find(ch) != -1

# function to get spelling variations
# Word-final "a" after one of the listed letters: the "a" is dropped.
_FINAL_A_RE = re.compile(r"(.*)([kctwpKCTWPgjdxbGJDXBfFNnmyrlvsSRh])a$")
# Word-final pair of listed vowels: "y" is inserted between them.
_Y_INSERT_RE = re.compile(r"(.*)([aeiouAEIOU])([eiI])$")
# "M" directly after one of E/e/o/O/i/I: that "M" becomes "z".
_VOWEL_M_RE = re.compile(r"(.*)([EeoOiI])M(.*)")
# (pattern, replacement) pairs: the character matched between group 1 and
# group 2 is replaced. Rules are applied in this order; because group 1 is
# greedy, each rule rewrites only the last matching position in the word.
_CONTEXT_RULES = (
    (re.compile(r"(.*)M([kKgG])(.*)"), "f"),
    (re.compile(r"(.*)M([cCjJ])(.*)"), "F"),
    (re.compile(r"(.*)M([tTdD])(.*)"), "N"),
    (re.compile(r"(.*)M([wWxX])(.*)"), "n"),
    (re.compile(r"(.*)M([pPbB])(.*)"), "m"),
    (re.compile(r"(.*)f([kKgG])(.*)"), "M"),
    (re.compile(r"(.*)F([cCjJ])(.*)"), "M"),
    (re.compile(r"(.*)N([tTdD])(.*)"), "M"),
    (re.compile(r"(.*)n([wWxX])(.*)"), "M"),
    (re.compile(r"(.*)m([pPbB])(.*)"), "M"),
)


def getSpellVariations(word):
    """Return candidate respellings of `word`, in a fixed rule order.

    The list always contains word + "H", so it is never empty. Duplicates
    are not removed. Rules that use regular expressions apply only to words
    of at least two characters.
    """
    variedList = []
    if "Z" in word:
        # remove every Z
        variedList.append(word.replace("Z", ""))
    if "M" in word:
        # replace every M with z
        variedList.append(word.replace("M", "z"))
    if "Q" in word:
        # replace every Q with q
        variedList.append(word.replace("Q", "q"))
    # append H to the word
    variedList.append(word + "H")

    longEnough = len(word) >= 2

    if longEnough:
        matchObj = _FINAL_A_RE.match(word)
        if matchObj:
            variedList.append(matchObj.group(1) + matchObj.group(2))

        matchObj = _Y_INSERT_RE.match(word)
        if matchObj:
            variedList.append(matchObj.group(1) + matchObj.group(2) + "y" + matchObj.group(3))

        matchObj = _VOWEL_M_RE.match(word)
        if matchObj:
            variedList.append(matchObj.group(1) + matchObj.group(2) + "z" + matchObj.group(3))

    # Final M of an "iyaM" ending becomes z. Strings are immutable, so the
    # variant is built by slicing instead of assigning to an index.
    if word.endswith("iyaM"):
        variedList.append(word[:-1] + "z")

    if longEnough:
        for pattern, replacement in _CONTEXT_RULES:
            matchObj = pattern.match(word)
            if matchObj:
                variedList.append(matchObj.group(1) + replacement + matchObj.group(2) + matchObj.group(3))

    return variedList

# function to compute Morph of the variation of the words
def computeMorphOfVariation(word, suff_map, dict_map, featurelines):
    """Run computeMorph on every spelling variation of `word`.

    Returns (1, successful analyses joined with "|") when at least one
    variation could be analysed, otherwise (0, "").
    """
    analyses = []
    for variant in getSpellVariations(word):
        found, analysis = computeMorph(variant, suff_map, dict_map, featurelines)
        if found:
            analyses.append(analysis)

    if not analyses:
        return 0, ""
    return 1, "|".join(analyses)


def processInput(inputF):
    """Analyse every token line read from `inputF` and return the result text.

    `inputF` is any object with a readline() method. Each non-blank line is
    stripped and split on tabs; column 2 is the word to analyse. A missing
    third column is treated as empty. Only the first three columns are
    echoed, followed by the analysis, as one tab-separated output line.

    The analysis is chosen in this order: punctuation, digits, computeMorph,
    computeMorphOfVariation, the exception dictionary, and finally the
    "unk" (unknown) tag.

    Returns the output lines joined with newlines, with surrounding
    whitespace stripped. Raises IndexError if a non-blank line has no tab
    (fewer than two columns).
    """
    outputRows = []
    while True:
        line = inputF.readline()
        if not line:
            break

        line = line.strip()
        if not line:
            continue

        linearray = line.split("\t")
        word = linearray[1]

        # Pad a two-column line; assigning to index 2 of a two-element list
        # would raise IndexError.
        if len(linearray) == 2:
            linearray.append("")

        if ispunct(word):
            if word == ",":
                morphOutput = "<fs af='&comma,punc,,,,,,'>"
            elif word == "/":
                morphOutput = "<fs af='&slash,punc,,,,,,'>"
            else:
                morphOutput = "<fs af='"+word+",punc,,,,,,'>"

        elif word.isdigit():
            morphOutput = "<fs af='"+word+",num,,,,,,'>"

        else:
            # Fallback chain: each step runs only if the previous one failed.
            # NOTE(review): a successful spelling-variation analysis used to
            # be overwritten with the unknown tag, which made that fallback
            # pointless; its result is now kept. Confirm this is intended.
            status, morphOutput = computeMorph(word, suff_map, dict_map, featurelines)
            if not status:
                status, morphOutput = computeMorphOfVariation(word, suff_map, dict_map, featurelines)
            if not status:
                status, morphOutput = searchExceptionDict(except_map, word)
            if not status:
                morphOutput = "<fs af='"+word+",unk,,,,,,'>"

        outputRows.append(linearray[0] + "\t" + linearray[1] + "\t" + linearray[2] + "\t" + morphOutput)

    # Join once instead of growing a string inside the loop.
    return "\n".join(outputRows).strip()


class ClientThread(threading.Thread):
    """Thread that serves one client connection: read, analyse, reply, close."""

    def __init__(self,ip,port,clientsocket):
        """Remember the client's address and the connected socket."""
        threading.Thread.__init__(self)
        self.ip = ip
        self.port = port
        self.csocket = clientsocket

    def run(self):
        """Receive one request, send back the analysis, and close the socket.

        The socket is closed even when receiving, processing or sending
        raises, so a failing request does not leak the connection.
        """
        try:
            # NOTE(review): a single recv() returns at most _MAX_BUFFER_SIZE_
            # bytes and may return less than the client sent; larger requests
            # are truncated. Reading until EOF needs the client to signal
            # end-of-request - confirm the protocol before changing this.
            data = self.csocket.recv(_MAX_BUFFER_SIZE_)
            fakeFile = StringIO.StringIO(data)
            try:
                result = processInput(fakeFile)
            finally:
                fakeFile.close()
            # sendall() keeps sending until the whole reply is written;
            # send() may transmit only part of a large reply.
            self.csocket.sendall(result)
        finally:
            self.csocket.close()

host = "0.0.0.0" #Listen on all interfaces
# Parse the port with int() rather than eval(): the value comes straight from
# the command line and eval() would execute any expression passed there.
port = int(port) #Port number

tcpsock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
tcpsock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)

tcpsock.bind((host,port))
# Start listening once; the backlog does not need to be re-declared per accept.
tcpsock.listen(4)

while True:
    # Use a separate name for the client's port so the listening port kept in
    # `port` is not overwritten by every accepted connection.
    (clientsock, (ip, clientPort)) = tcpsock.accept()

    #pass clientsock to the ClientThread thread object being created
    newthread = ClientThread(ip, clientPort, clientsock)
    newthread.start()