#! /usr/bin/env python import re import sys import socket import argparse import StringIO import threading from argparse import RawTextHelpFormatter from .ilp import wxConvert __name__ = "converter-indic" __doc__ = "python-converter-indic: Converts Indian languages to WX (ASCII) and vice-versa" __author__ = "Irshad Ahmad" __version__ = "1.0.3" __license__ = "MIT" __maintainer__ = "Irshad Ahmad" __email__ = "irshad.bhat@research.iiit.ac.in" __status__ = "Beta" __all__ = ["ilp", "wxILP", "ssf_reader", "main"] _MAX_BUFFER_SIZE_ = 102400 #100KB class ClientThread(threading.Thread): def __init__(self, ip, port, clientsocket, args, con): threading.Thread.__init__(self) self.ip = ip self.con = con self.port = port self.args = args self.csocket = clientsocket #print "[+] New thread started for "+ip+":"+str(port) def run(self): #print "Connection from : "+ip+":"+str(port) data = self.csocket.recv(_MAX_BUFFER_SIZE_) #print "Client(%s:%s) sent : %s"%(self.ip, str(self.port), data) fakeInputFile = StringIO.StringIO(data) fakeOutputFile = StringIO.StringIO("") processInput(fakeInputFile, fakeOutputFile, self.args, self.con) fakeInputFile.close() self.csocket.send(fakeOutputFile.getvalue()) fakeOutputFile.close() self.csocket.close() #print "Client at "+self.ip+" disconnected..." def processInput(ifp, ofp, args, con): if args.format_ == "ssf": if args.nested: sentences = re.finditer("(\s*\n.*?)\n(.*?)\)\)\s*\n", ifp.read(), re.S) else: sentences = re.finditer("()(.*?)", ifp.read(), re.S) for sid_sentence in sentences: sid = sid_sentence.group(1) sentence = sid_sentence.group(2).strip() ofp.write('%s\n' %sid) consen = con.convert(sentence) ofp.write('%s' %consen) if args.nested: ofp.write("\t))\n") ofp.write("\n\n") else: for line in ifp: line = con.convert(line) ofp.write(line) def main(): format_list = 'text ssf conll bio tnt'.split() languages = 'hin tel tam mal kan ben ori pan mar nep guj bod kok asm urd'.split() # help messages src_enc_help = "select input-file encoding [utf|wx]" #trg_enc_help = "select output-file encoding [utf|wx]" format_help = "select output format [text|ssf|conll|bio|tnt]" lang_help = """select language (3 letter ISO-639 code) Hindi : hin Telugu : tel Tamil : tam Malayalam : mal Kannada : kan Bengali : ben Oriya : ori Punjabi : pan Marathi : mar Nepali : nep Gujarati : guj Bodo : bod Konkani : kok Assamese : asm Urdu : urd""" ssf_help = "specify ssf-type [inter|intra] in case file format (--f) is ssf" # parse command line arguments parser = argparse.ArgumentParser(prog="converter-indic", description="wx-utf converter for Indian languages", formatter_class=RawTextHelpFormatter) parser.add_argument('--v', action="version", version="%(prog)s 1.0.3") parser.add_argument('--l', metavar='language', dest="lang", choices=languages, default="hin", help="%s" %lang_help) parser.add_argument('--s', metavar='source', dest="src_enc", choices=["utf","wx"], default="utf", help="%s" %src_enc_help) parser.add_argument('--f', metavar='format', dest="format_", choices=format_list, default="text", help="%s" %format_help) parser.add_argument('--t', metavar='ssf-type', dest="ssf_type", choices=["inter","intra"], default=None, help=ssf_help) parser.add_argument('--n', dest='nested', action='store_true', help="set this flag for nested ssf") parser.add_argument('--m', dest='mask', action='store_false', help="set this flag to keep off masking of roman strings in Indic text") parser.add_argument('--i', metavar='input', dest="infile", type=argparse.FileType('r'), default=sys.stdin, help="") parser.add_argument('--o', metavar='output', dest="outfile", type=argparse.FileType('w'), default=sys.stdout, help="") parser.add_argument('--daemonize', dest='isDaemon', help='Do you want to daemonize me?', action='store_true', default = False) parser.add_argument('--port', type=int, dest='daemonPort', default=5000, help='Specify a port number') args = parser.parse_args() if args.format_ == 'ssf' and not args.ssf_type: sys.stderr.write(parser.format_usage()) sys.stderr.write("converter-indic: error: argument --t: not specified\n") sys.exit(0) # set conversion direction if args.src_enc=="utf": #and args.trg_enc=="wx": src_trg = "utf2wx" else: #args.src_enc=="wx" and args.trg_enc=="utf": src_trg = "wx2utf" # initialize converter object con = wxConvert(src_trg, args.format_, args.lang, args.ssf_type, args.nested, args.mask) if args.isDaemon: host = "0.0.0.0" #Listen on all interfaces port = args.daemonPort #Port number tcpsock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) tcpsock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) tcpsock.bind((host,port)) sys.stderr.write('Listening at %d\n' %port) while True: tcpsock.listen(4) #print "nListening for incoming connections..." (clientsock, (ip, port)) = tcpsock.accept() #pass clientsock to the ClientThread thread object being created newthread = ClientThread(ip, port, clientsock, args, con) newthread.start() else: processInput(args.infile, args.outfile, args, con) # close files args.infile.close() args.outfile.close() if __name__ == '__main__': main()