#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Read a text file, tokenize each line into sentences via a remote
tokenizer service, run every sentence through the remaining pipeline
modules, and print the collected shallow-parser output.

@author: priyank
'''

import codecs
import json
import os
import sys
from argparse import ArgumentParser

import requests

# configure the command-line parser and check that all required arguments are present
parser = ArgumentParser()
parser.add_argument('-c', '--serverConfigFile', help='server configuration file (with path)', required=True)
parser.add_argument('-i', '--inputFile', help='input file (with path)', required=True)
args = parser.parse_args()

# paths given on the command line; existence is checked below
serverFile = args.serverConfigFile
inputFile = args.inputFile

# function to split an SSF string into individual <Sentence> ... </Sentence> blocks
def sentenceCollector(inputString):
    # make sure adjacent sentence tags are separated by a newline
    if "Sentence><Sentence" in inputString:
        inputString = inputString.replace('Sentence><Sentence', 'Sentence>\n<Sentence')

    inArray = inputString.strip().split("\n")
    sentList = []
    tempString = ""
    for line in inArray:
        line = line.rstrip()
        if line:
            tempString = tempString + line + "\n"
            if line.startswith('</Sentence'):
                sentList.append(tempString)
                tempString = ""
    return sentList
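
# A minimal sketch of what sentenceCollector consumes and returns. The SSF
# markup here is illustrative; the real tags come from the remote tokenizer:
#
#   raw = '<Sentence id="1">\n1\tshabd\tNN\n</Sentence><Sentence id="2">\n1\tshabd\tNN\n</Sentence>'
#   sentenceCollector(raw)
#   # -> ['<Sentence id="1">\n1\tshabd\tNN\n</Sentence>\n',
#   #     '<Sentence id="2">\n1\tshabd\tNN\n</Sentence>\n']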

# function to collect the output of the last module (word generator):
# take the token column (field 2) of every SSF line, skipping sentence
# tags and the chunk boundaries '((' / '))'
def wordgenCollector(inputString):
    inArray = inputString.strip().split("\n")
    tempString = ""
    for line in inArray:
        line = line.rstrip()
        linearray = line.split("\t")
        if line and len(linearray) >= 2:
            if line.startswith('<Sentence ') or line.startswith('</Sentence'):
                continue
            elif linearray[1] == '((' or linearray[1] == '))':
                continue
            else:
                tempString = tempString + linearray[1] + " "
    return tempString
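
# Illustrative sketch (hypothetical SSF body; fields are address<TAB>token<TAB>tag):
#
#   body = '<Sentence id="1">\n1\t((\tNP\n1.1\tshabd\tNN\n2\t))\t\n</Sentence>'
#   wordgenCollector(body)   # -> 'shabd '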

if not os.path.isfile(serverFile):
    print "serverFile", serverFile, "does not exist."
    sys.exit(1)

if not os.path.isfile(inputFile):
    print "inputFile", inputFile, "does not exist."
    sys.exit(1)
 
# load the server details (maps language code -> pipeline URL)
with open(serverFile) as server_file:
    server_details = json.load(server_file)

translationURL = server_details['hin']
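
# The server config file is assumed to be a JSON object keyed by language
# code; only 'hin' is used here. Hypothetical example (the real URL comes
# from your deployment):
#
#   { "hin": "http://localhost:8080/app/ver/hin/hin/1/12/" }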

# read the whole input file as UTF-8 lines
with codecs.open(inputFile, "rb", "utf-8") as f:
    lines = f.readlines()

# Derive the helper URLs from the configured pipeline URL. The URL is assumed
# to end in .../<src>/<tgt>/<start-module>/<end-module>/ so that rewriting the
# module numbers selects which slice of the pipeline runs.
tokenizerURLArray = translationURL.split("/")
tokenizerURLArray[-2] = '1'  # end module = 1: run only the tokenizer
modulesURL = tokenizerURLArray[0] + "/" + tokenizerURLArray[1] + "/" + tokenizerURLArray[2] + "/" + tokenizerURLArray[5] + "/" + tokenizerURLArray[6] + "/modules"

tokenizerURL = "/".join(tokenizerURLArray)

tokenizerURLArray = translationURL.split("/")
tokenizerURLArray[-3] = '2'  # start module = 2: skip the tokenizer, run the rest
translationURL = "/".join(tokenizerURLArray)

myheaders = {"Content-type": "application/x-www-form-urlencoded; charset=UTF-8"}
proxies = {
    "http": None,
    "https": None
}

# ask the server which modules make up the pipeline
res = requests.post(modulesURL, proxies=proxies, headers=myheaders)

lastModule = ''
secondLastModule = ''
# the last two modules name the output fields read from each response
if res is not None:
    modulesList = json.loads(res.text)
    lastModule = modulesList[-1]
    secondLastModule = modulesList[-2]
else:
    print "Null response from server"
    sys.exit(1)
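
# The modules listing is assumed to be a JSON array in pipeline order, e.g.
# (hypothetical names): ["tokenizer", "morph", "postagger", "wordgenerator"].
# Each pipeline response is then keyed "<module>-<position>", as in "tokenizer-1".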


output = ""
wxoutput = ""

# process the input by calling the MT pipeline on each line;
# one line is treated as one paragraph
iii = 0
mystr = ""
for line in lines:
    line = line.strip()
    if line:
        # call the tokenizer on the paragraph
        dataToSend = {"data": line.encode('utf-8')}
        res = requests.post(tokenizerURL, proxies=proxies, headers=myheaders, data=dataToSend)
        tokenOut = json.loads(res.text)

        sentences = sentenceCollector(tokenOut['tokenizer-1'])

        jjj = 0
        mystr = mystr + "paraid:" + str(iii + 1) + "\n" + line + "\n"
        for sentence in sentences:
            # run the remaining pipeline modules on one tokenized sentence
            dataToSend = {"data": sentence.strip().encode('utf-8')}
            res = requests.post(translationURL, proxies=proxies, headers=myheaders, data=dataToSend)
            completeOut = json.loads(res.text)

            # responses are keyed "<module>-<position>", e.g. "tokenizer-1"
            lastmoduleOutput = completeOut[lastModule + "-" + str(modulesList.index(lastModule) + 1)]
            secondlastmoduleOutput = completeOut[secondLastModule + "-" + str(modulesList.index(secondLastModule) + 1)]
            output = output + lastmoduleOutput + " \n\n"
            wxoutput = wxoutput + secondlastmoduleOutput + " \n\n"

            mystr = mystr + "sentid:" + str(jjj + 1) + "\n" + sentence.strip() + "\n"
            mystr = mystr + lastmoduleOutput + "\n"
            jjj = jjj + 1

        iii = iii + 1
        output = output + " \n\n"
        wxoutput = wxoutput + " \n\n"
        mystr = mystr + "---------------------------------------------------------\n"

print mystr
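
# Usage sketch (file names are illustrative):
#
#   python GetShallowParserOutput.py -c server_config.json -i input.txt > output.txt
#
# The script writes its per-paragraph / per-sentence trace to stdout,
# so redirecting stdout captures the shallow-parser output.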