#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created by

@author: priyank
'''

import json
import codecs
import os
import sys
from argparse import ArgumentParser

import requests

# configure the command line parser and require both parameters
parser = ArgumentParser()
parser.add_argument('-c', '--serverConfigFile', help='server configuration file (with path)', required=True)
parser.add_argument('-i', '--inputFile', help='input file (with path)', required=True)
args = parser.parse_args()

# get the file paths from the command line; existence is checked below
serverFile = args.serverConfigFile
inputFile = args.inputFile
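
# Illustrative invocation (the file names are placeholders, not files that
# ship with this script):
#   python GetShallowParserOutput.py -c servers.json -i input_urd.txt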

# function to split an SSF document into individual <Sentence> blocks
def sentenceCollector(inputString):
    # make sure adjacent sentence tags sit on separate lines
    if "Sentence><Sentence" in inputString:
        inputString = inputString.replace('Sentence><Sentence', 'Sentence>\n<Sentence')

    inArray = inputString.strip().split("\n")
    sentList = []
    tempString = ""
    for line in inArray:
        line = line.rstrip()
        if line:
            if line.startswith('<Sentence '):
                tempString = tempString + line + "\n"
            elif line.startswith('</Sentence'):
                # closing tag: the accumulated block is one complete sentence
                tempString = tempString + line + "\n"
                sentList.append(tempString)
                tempString = ""
            else:
                tempString = tempString + line + "\n"
    return sentList
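
# A minimal sketch of sentenceCollector's behaviour (the SSF snippet is
# illustrative, not from a real corpus):
#
#   <Sentence id="1">
#   1     ((    NP
#   1.1   word  NN
#         ))
#   </Sentence><Sentence id="2">
#   ...
#   </Sentence>
#
# is returned as a list of two strings, one per complete <Sentence> block,
# with the fused 'Sentence><Sentence' boundary first split across lines.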

# function to collect the output of the last module (word generator): it joins
# the second tab-separated column of every token line, skipping sentence tags
# and chunk brackets (not called in this script)
def wordgenCollector(inputString):
    inArray = inputString.strip().split("\n")
    tempString = ""
    for line in inArray:
        line = line.rstrip()
        linearray = line.split("\t")
        if line and len(linearray) >= 2:
            if line.startswith('<Sentence '):
                continue
            elif line.startswith('</Sentence'):
                continue
            elif linearray[1] == '((' or linearray[1] == '))':
                continue
            else:
                tempString = tempString + linearray[1] + " "
    return tempString
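
# Sketch of wordgenCollector on one SSF sentence (the tokens are illustrative):
#   "1\t((\tNP"       -> skipped (chunk open bracket)
#   "1.1\tlafz\tNN"   -> contributes "lafz "
#   "2\t))\t"         -> skipped (chunk close bracket)
# i.e. it returns the second tab-separated column of every real token line.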

if not os.path.isfile(serverFile):
    print "server config file", serverFile, "does not exist."
    sys.exit(1)

if not os.path.isfile(inputFile):
    print "input file", inputFile, "does not exist."
    sys.exit(1)

# read the server details from the JSON config file
with open(serverFile) as server_file:
    server_details = json.load(server_file)

translationURL = server_details['urd']
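
# The config is assumed to be a JSON object mapping language codes to pipeline
# URLs; only the 'urd' entry is used here. A minimal sketch (host and path are
# placeholders):
#   { "urd": "http://<host>:<port>/<path-to-urdu-pipeline>" }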

f = codecs.open(inputFile, "rb", "utf-8")
lines = f.readlines()
f.close()

# derive the tokenizer, modules-list and translation URLs from the configured
# pipeline URL by rewriting individual path segments
tokenizerURLArray = translationURL.split("/")
tokenizerURLArray[-2] = '1'
modulesURL = "/".join(tokenizerURLArray[0:3] + tokenizerURLArray[5:7]) + "/modules"

tokenizerURL = "/".join(tokenizerURLArray)

tokenizerURLArray = translationURL.split("/")
tokenizerURLArray[-3] = '2'
translationURL = "/".join(tokenizerURLArray)
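
# Purely mechanical illustration of the rewrites above, with placeholder
# segments (what each path position means is deployment-specific):
#   translationURL (input) : "p0/p1/p2/p3/p4/p5/p6/p7"
#   tokenizerURL           : "p0/p1/p2/p3/p4/p5/1/p7"   (segment [-2] -> '1')
#   modulesURL             : "p0/p1/p2/p5/1/modules"    (segments 0-2, 5, 6)
#   translationURL (final) : "p0/p1/p2/p3/p4/2/p6/p7"   (segment [-3] -> '2')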

myheaders = {"Content-type": "application/x-www-form-urlencoded; charset=UTF-8"}
# explicitly disable proxies so the pipeline servers are reached directly
proxies = {
    "http": None,
    "https": None
}

res = requests.post(modulesURL, proxies=proxies, headers=myheaders)

lastModule = ''
secondLastModule = ''
# fetch the module list and remember the last two modules of the pipeline
if res is not None and res.ok:
    modulesList = json.loads(res.text)
    lastModule = modulesList[-1]
    secondLastModule = modulesList[-2]
else:
    print "No valid response from server"
    sys.exit(1)
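
# The modules endpoint is assumed to return a JSON list of module names in
# pipeline order; each module's output is then keyed "<module>-<position>"
# in the translation response (e.g. "tokenizer-1" below).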


response_data = {}
response_data['language'] = 'urd'
response_data['text'] = lines
output = ""
wxoutput = ""

# process the input by calling the MT pipeline on each line;
# one line is treated as one paragraph
iii = 0
intermediatearray = []
mystr = ""
for line in lines:
    line = line.strip()
    if line:
        # run the tokenizer on the paragraph
        dataToSend = {"data": line.strip().encode('utf-8')}
        res = requests.post(tokenizerURL, proxies=proxies, headers=myheaders, data=dataToSend)
        tokenOut = json.loads(res.text)

        # split the tokenizer's SSF output into individual sentences
        sentences = sentenceCollector(tokenOut['tokenizer-1'])

        jjj = 0
        tempdict = {}
        mystr = mystr + "paraid:" + str(iii + 1) + "\n" + line + "\n"
        for sentence in sentences:
            # run the remaining pipeline on one sentence
            dataToSend = {"data": sentence.strip().encode('utf-8')}
            res = requests.post(translationURL, proxies=proxies, headers=myheaders, data=dataToSend)
            completeOut = json.loads(res.text)

            # module outputs are keyed "<module>-<position>" in the response
            lastmoduleOutput = completeOut[lastModule + "-" + str(modulesList.index(lastModule) + 1)]
            secondlastmoduleOutput = completeOut[secondLastModule + "-" + str(modulesList.index(secondLastModule) + 1)]
            finalOutput = lastmoduleOutput
            output = output + finalOutput + " \n\n"
            wxoutput = wxoutput + secondlastmoduleOutput + " \n\n"

            mystr = mystr + "sentid:" + str(jjj + 1) + "\n" + sentence.strip() + "\n"
            mystr = mystr + lastmoduleOutput + "\n"

            jjj = jjj + 1
        iii = iii + 1
        output = output + " \n\n"
        wxoutput = wxoutput + " \n\n"
        mystr = mystr + "---------------------------------------------------------\n"

print mystr
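
# The printed report has the following shape (content illustrative):
#   paraid:1
#   <original paragraph>
#   sentid:1
#   <tokenized sentence in SSF>
#   <last module's output>
#   ...
#   ---------------------------------------------------------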