__init__.py 6.11 KB
Newer Older
priyank's avatar
priyank committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167
#!/usr/bin/env python 
# -*- coding: utf-8 -*-

"""Tokenizer for Indian scripts and Roman script.

This module provides a complete tokenizer for Indian languages including 
Urdu and Kashmiri and Roman script.

Copyright (c) 2015-2016 Irshad Ahmad <irshad.bhat@research.iiit.ac.in>

Distributed under MIT license [http://opensource.org/licenses/mit-license.html].
"""

from __future__ import print_function

# Package metadata.
# NOTE(review): assigning to __name__ replaces the module name supplied by the
# interpreter, so the `if __name__ == '__main__'` check at the bottom of this
# file can never be true.  A differently named attribute (e.g. __title__) is
# presumably what was intended -- confirm before changing.
__name__       = "Indic Tokenizer"
__author__     = "Irshad Ahmad"
__copyright__  = "Copyright (C) 2015-16 Irshad Ahmad"
__version__    = "1.0"
__license__    = "MIT"
__maintainer__ = "Irshad Ahmad"
__email__      = "irshad.bhat@research.iiit.ac.in"
__status__     = "Beta"
# Names exported by a star-import of this package: the two tokenizer submodules.
__all__        = ["indic_tokenize", "roman_tokenize"]

import sys
import socket
import argparse
import StringIO
import threading
import multiprocessing

from .indic_tokenize import tokenize_ind
from .roman_tokenize import tokenize_rom

# Upper bound, in bytes, passed to the single recv() call that reads a client
# request in daemon mode (1,024,000 bytes, i.e. roughly 1 MB).
_MAX_BUFFER_SIZE_ = 1024000

def processInput(inFD, outFD, tzr):
    """Tokenize a stream line by line.

    Every line read from ``inFD`` is passed to ``tzr.tokenize`` and the
    result is written to ``outFD`` followed by a newline character.

    inFD  -- iterable yielding input lines (e.g. an open file)
    outFD -- object with a ``write`` method receiving the output
    tzr   -- tokenizer object exposing ``tokenize(line)``
    """
    for raw_line in inFD:
        tokenized = tzr.tokenize(raw_line)
        outFD.write('%s\n' % tokenized)

class ClientThread(threading.Thread):
    """Thread that serves one tokenization request on an accepted socket.

    The thread reads a single request from the client, tokenizes it line by
    line with ``processInput`` and sends the tokenized text back before
    closing the connection.

    ip           -- address of the connected client
    port         -- port of the connected client
    clientsocket -- connected socket object returned by ``accept()``
    tzr          -- tokenizer object exposing ``tokenize(line)``
    """

    def __init__(self, ip, port, clientsocket, tzr):
        threading.Thread.__init__(self)
        self.tzr = tzr
        self.ip = ip
        self.port = port
        self.csocket = clientsocket

    def run(self):
        """Handle the request and always release the client socket."""
        try:
            # The request is read with a single recv() call, so at most
            # _MAX_BUFFER_SIZE_ bytes are processed.  Reading until EOF
            # instead would require clients to shut down their write side,
            # which would change the existing wire protocol.
            data = self.csocket.recv(_MAX_BUFFER_SIZE_)

            # Wrap the payload in file-like objects so the same line-based
            # routine used for regular files can be reused here.
            fakeInputFile = StringIO.StringIO(data)
            fakeOutputFile = StringIO.StringIO("")
            try:
                processInput(fakeInputFile, fakeOutputFile, self.tzr)
                # sendall() keeps writing until the whole reply has been
                # transmitted; a bare send() may stop after a partial write
                # and silently truncate large responses.
                self.csocket.sendall(fakeOutputFile.getvalue())
            finally:
                fakeInputFile.close()
                fakeOutputFile.close()
        finally:
            # Close the connection even when tokenization or I/O fails, so
            # a failing request does not leak the socket.
            self.csocket.close()

def ind_main():
    """Command-line entry point of the tokenizer for Indian scripts.

    Parses the command line, builds a ``tokenize_ind`` tokenizer for the
    selected language and then either

    * serves tokenization requests over TCP on all interfaces when both
      ``--daemonize`` and a non-zero ``--port`` are given (one
      ``ClientThread`` per accepted connection; this loop runs until the
      process is interrupted), or
    * tokenizes the input file (default: stdin) line by line into the
      output file (default: stdout).

    The input and output files are closed on the way out in both modes.
    """
    lang_help = """select language (3 letter ISO-639 code)
        Hindi       : hin
        Urdu        : urd
        Telugu      : tel
        Tamil       : tam
        Malayalam   : mal
        Kannada     : kan
        Bengali     : ben
        Oriya       : ori
        Punjabi     : pan
        Marathi     : mar
        Nepali      : nep
        Gujarati    : guj
        Bodo        : bod
        Konkani     : kok
        Assamese    : asm
        Kashmiri    : kas"""
    languages = "hin urd ben asm guj mal pan tel tam kan ori mar nep bod kok kas".split()

    # parse command line arguments
    parser = argparse.ArgumentParser(prog="indic_tokenizer",
                                     description="Tokenizer for Indian Scripts",
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--i', metavar='input', dest="INFILE",
                        type=argparse.FileType('r'), default=sys.stdin,
                        help="<input-file>")
    parser.add_argument('--l', metavar='language', dest="lang",
                        choices=languages, default='hin', help=lang_help)
    parser.add_argument('--s', dest='split_sen', action='store_true',
                        help="set this flag to apply sentence segmentation")
    parser.add_argument('--o', metavar='output', dest="OUTFILE",
                        type=argparse.FileType('w'), default=sys.stdout,
                        help="<output-file>")
    parser.add_argument('--daemonize', dest='isDaemon',
                        help='Do you want to daemonize me?',
                        action='store_true', default=False)
    parser.add_argument('--port', type=int, dest='daemonPort',
                        help='Specify a port number')
    args = parser.parse_args()

    # initialize tokenizer object
    tzr = tokenize_ind(lang=args.lang, split_sen=args.split_sen)

    try:
        if args.isDaemon and args.daemonPort:
            host = "0.0.0.0"  # listen on all interfaces

            tcpsock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                tcpsock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                tcpsock.bind((host, args.daemonPort))
                # The backlog only needs to be configured once, not before
                # every accept().
                tcpsock.listen(multiprocessing.cpu_count())

                while True:
                    # Distinct names keep the peer address from shadowing
                    # the listening port.
                    clientsock, (client_ip, client_port) = tcpsock.accept()

                    # hand the connection over to a dedicated worker thread
                    ClientThread(client_ip, client_port, clientsock, tzr).start()
            finally:
                # Reached when the accept loop is interrupted or setup fails.
                tcpsock.close()
        else:
            processInput(args.INFILE, args.OUTFILE, tzr)
    finally:
        # close files, also when tokenization or the server loop raises
        args.INFILE.close()
        args.OUTFILE.close()

def rom_main():
    """Command-line entry point of the tokenizer for Roman script.

    Parses the command line, builds a ``tokenize_rom`` tokenizer and then
    either

    * serves tokenization requests over TCP on all interfaces when both
      ``--daemonize`` and a non-zero ``--port`` are given (one
      ``ClientThread`` per accepted connection; this loop runs until the
      process is interrupted), or
    * tokenizes the input file (default: stdin) line by line into the
      output file (default: stdout).

    The input and output files are closed on the way out in both modes.
    """
    # parse command line arguments
    parser = argparse.ArgumentParser(prog="roman_tokenizer",
                                     description="Tokenizer for Roman-Script")
    parser.add_argument('--i', metavar='input', dest="INFILE",
                        type=argparse.FileType('r'), default=sys.stdin,
                        help="<input-file>")
    parser.add_argument('--s', dest='split_sen', action='store_true',
                        help="set this flag to apply sentence segmentation")
    parser.add_argument('--o', metavar='output', dest="OUTFILE",
                        type=argparse.FileType('w'), default=sys.stdout,
                        help="<output-file>")
    parser.add_argument('--daemonize', dest='isDaemon',
                        help='Do you want to daemonize me?',
                        action='store_true', default=False)
    parser.add_argument('--port', type=int, dest='daemonPort',
                        help='Specify a port number')
    args = parser.parse_args()

    # initialize tokenizer object
    tzr = tokenize_rom(split_sen=args.split_sen)

    try:
        if args.isDaemon and args.daemonPort:
            host = "0.0.0.0"  # listen on all interfaces

            tcpsock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                tcpsock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                tcpsock.bind((host, args.daemonPort))
                # The backlog only needs to be configured once, not before
                # every accept().
                tcpsock.listen(multiprocessing.cpu_count())

                while True:
                    # Distinct names keep the peer address from shadowing
                    # the listening port.
                    clientsock, (client_ip, client_port) = tcpsock.accept()

                    # hand the connection over to a dedicated worker thread
                    ClientThread(client_ip, client_port, clientsock, tzr).start()
            finally:
                # Reached when the accept loop is interrupted or setup fails.
                tcpsock.close()
        else:
            processInput(args.INFILE, args.OUTFILE, tzr)
    finally:
        # close files, also when tokenization or the server loop raises
        args.INFILE.close()
        args.OUTFILE.close()

# Script entry point: runs the Roman-script command-line tool; the
# commented-out call below is the Indic-script alternative.
# NOTE(review): __name__ is reassigned to "Indic Tokenizer" near the top of
# this file, so this condition is never true as written.
if __name__ ==  '__main__':
    rom_main()
    #ind_main()