7.53 KB
Newer Older
priyank's avatar
priyank committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168
#!/usr/bin/env python

# Copyright Irshad Ahmad Bhat 2015.

"""WX convertor: converts Indian languages to ASCII and vice-versa

WX notation is a transliteration scheme for representing Indian languages in ASCII.
For more details on WX, see the Wikipedia article on WX notation.
This module is a UTF (Indian Scripts) to Roman (WX) convertor and vice-versa that:

    - converts text in 10 Indian languages viz. Hindi, Tamil, Telugu, Malayalam,
      Bengali, Kannada, Oriya, Punjabi, Marathi and Nepali.
    - handles 5 data formats viz. plain-text, ssf, conll, bio and tnt.
"""


import os
import re
import sys

from .wxILP import wxilp
from .ssf_reader import SSFReader

class wxConvert():
    """WX convertor (UTF to WX and vice-versa).

    Used to convert text in Indian languages to ASCII and back. It can be
    used for 10 Indian languages viz. Hindi, Tamil, Telugu, Malayalam,
    Bengali, Kannada, Oriya, Punjabi, Marathi and Nepali in 5 data formats
    viz. plain-text, ssf, conll, bio and tnt.

    NOTE(review): this class was restored from a copy in which several lines
    and tokens were missing (``else:`` branches, ``continue`` statements,
    attribute operands, the tail of the ``__init__`` signature). Every
    restored spot is marked with ``NOTE(review)`` below -- confirm each one
    against the upstream file before relying on it.
    """

    # Tokens that are copied through unchanged instead of being transliterated.
    # Kept at class level so convert_ssf() also works when called directly.
    special = frozenset(['null', 'NULL', 'COMMA', 'SINGLE_QUOTE', '-JOIN'])

    def __init__(self, order="wx2utf", format_="text", lang="hin",
                 ssf_type=None, nested=False, rmask=True):
        """Set up the transliterator.

        NOTE(review): only ``order="wx2utf"`` survived in the signature. The
        remaining parameter names are taken from their use in the body; their
        order and default values are presumed -- TODO confirm.

        Args:
            order: "wx2utf" selects WX -> UTF; any other value selects
                UTF -> WX.
            format_: data format checked by convert(): "text", "ssf",
                "conll", "bio" or "tnt".
            lang: language identifier handed to ``wxilp``.
            ssf_type: 'intra' or 'inter'; read by convert_ssf().
            nested: when truthy, convert_ssf() also transliterates the
                ``head`` value of nodes whose name is transliterated.
            rmask: handed to ``wxilp`` unchanged.
        """
        self.lang = lang
        self.nested = nested
        self.format_ = format_
        self.ssf_type = ssf_type
        wxp = wxilp(self.lang, order, rmask)
        self.transform = wxp.wx2utf if order == "wx2utf" else wxp.utf2wx

    def _transform_unless_special(self, value):
        """Transliterate *value* unless it is one of the ``special`` tokens."""
        return value if value in self.special else self.transform(value)

    @staticmethod
    def _is_entity(token):
        """Return True for entity-like tokens ("&...;") that stay untouched."""
        return token.startswith("&") and token.endswith(";")

    def convert_ssf(self, sentence):
        """Convert SSF data.

        Returns the converted sentence, one node per line, each line
        terminated by a newline.
        """
        rows = []
        # NOTE(review): lines were lost around here; if SSFReader needs an
        # explicit parse call before nodeList is filled, restore it.
        obj = SSFReader(sentence)
        for node, order in zip(obj.nodeList, obj.fs_order):
            # NOTE(review): the tail of this condition was lost;
            # ``node.id.isdigit()`` is a reconstruction -- TODO confirm.
            if self.ssf_type == 'intra' or (
                    self.ssf_type == 'inter' and not node.id.isdigit()):
                name = self._transform_unless_special(node.name)
                head = self.transform(node.head) if self.nested else node.head
            else:
                name = node.name
                head = self._transform_unless_special(node.head)

            if self.ssf_type == 'intra':
                parent = self._transform_unless_special(node.parent)
            else:
                parent = node.parent

            wordForm = self._transform_unless_special(node.wordForm)
            dmrel_ = 'dmrel' if node.dmrel else 'drel'
            # NOTE(review): first column restored as node.id -- TODO confirm.
            ssfNode = [node.id, wordForm, node.posTag]

            # NOTE(review): ``node.af`` and the ``cat``/``case`` fields were
            # stripped from the copy and are restored here -- TODO confirm.
            if isinstance(node.af, tuple):
                nL = node.af
                lemma = self._transform_unless_special(nL.lemma)
                vib = self._transform_unless_special(nL.vib)
                features = ",".join((lemma, nL.cat, nL.gen, nL.num, nL.per,
                                     nL.case, vib, nL.tam))
            else:
                features = node.af

            # The position of every entry matters: ``order`` holds indices
            # into this list.
            fs = [
                "af='%s'" % (features) if node.af else '',
                "name='%s'" % (name) if name else None,
                "head='%s'" % (head) if head else None,
                "chunkId='%s'" % (node.chunkId)
                if (node.chunkId and node.chunkType == 'head') else None,
                "chunkType='%s:%s'" % (node.chunkType, node.chunkId)
                if node.chunkType else None,
                "posn='%s'" % (node.posn) if node.posn else None,
                "vpos='%s'" % (node.vpos) if node.vpos else None,
                "%s='%s:%s'" % (dmrel_, node.drel, parent)
                if node.drel else None,
                "coref='%s:%s'" % (node.corel, node.coref)
                if node.coref else None,
                "stype='%s'" % (node.stype) if node.stype else None,
                "voicetype='%s'" % (node.voicetype)
                if node.voicetype else None,
                "poslcat='%s'" % (node.poslcat) if node.poslcat else None,
                "mtype='%s'" % (node.mtype) if node.mtype else None,
                "troot='%s'" % (node.troot) if node.troot else None,
                "etype='%s'" % (node.etype) if node.etype else None,
                "etype_root='%s'" % (node.etype_root)
                if node.etype_root else None,
                "emph='%s'" % (node.emph) if node.emph else None,
                "esubtype='%s'" % (node.esubtype) if node.esubtype else None,
                "etype_name='%s'" % (node.etype_name)
                if node.etype_name else None,
                "agr_num='%s'" % (node.agr_num) if node.agr_num else None,
                "hon='%s'" % (node.hon) if node.hon else None,
                "agr_cas='%s'" % (node.agr_cas) if node.agr_cas else None,
                "agr_gen='%s'" % (node.agr_gen) if node.agr_gen else None,
                # NOTE add node
            ]

            # Move the attributes named by ``order`` to the front. Each one is
            # removed from its old slot first so it is not emitted twice.
            # NOTE(review): the visible code only had the insert -- TODO
            # confirm the removal against upstream.
            ordered = list(fs)
            for idx in order:
                item = fs[idx]
                if item in ordered:
                    ordered.remove(item)
                ordered.insert(0, item)
            fs_text = "<fs %s>" % (" ".join(filter(None, ordered)))

            # NOTE(review): the condition choosing between a node line and a
            # chunk-closing line was lost; ``node.id`` is presumed.
            if node.id:
                rows.append("%s\n" % ("\t".join(ssfNode + [fs_text])))
            else:
                rows.append("%s\n" % ("\t))"))

        return "".join(rows)

    def convert_conll(self, conll):
        """Convert CONLL data.

        FORM (column 2), LEMMA (column 3) and the ``vib-`` entry of FEATS
        (column 6) are transliterated; all other columns are kept. Blank
        lines are kept so sentence boundaries survive.
        """
        # Only Python 2 has ``unicode``; the version test short-circuits so
        # the name is never looked up on Python 3.
        if sys.version_info[0] < 3 and isinstance(conll, unicode):  # noqa: F821
            conll = conll.encode('utf-8')

        trans_LINES = []
        for raw in conll.split("\n"):
            raw = raw.strip()
            if not raw:
                # NOTE(review): the body of this branch was lost; keeping the
                # blank line is presumed.
                trans_LINES.append("")
                continue
            fields = raw.split("\t")
            if len(fields) != 10:
                sys.stderr.write("Warning: dimension mismatch (attributes < 10 or > 10) \n")
                if len(fields) < 6:
                    # Not enough columns to reach FEATS: pass through as is.
                    trans_LINES.append("\t".join(fields))
                    continue

            FORM, LEMMA = fields[1], fields[2]
            FEATS = fields[5].split("|")
            if not self._is_entity(FORM):
                FORM = self.transform(FORM)
            if not self._is_entity(LEMMA):
                LEMMA = self.transform(LEMMA)
            fields[1] = FORM if FORM.strip() else "_"
            fields[2] = LEMMA if LEMMA.strip() else "_"

            vib_id = next((idx for idx, feat in enumerate(FEATS)
                           if feat[:4] == "vib-"), None)
            if vib_id is not None:
                # Slice the prefix off: str.lstrip("vib-") strips a *set* of
                # characters and would also eat leading v/i/b of the value.
                vib = re.split("([+_0-9]+)", FEATS[vib_id][4:])
                vib = " ".join(vib).split()
                trans_vib = []
                for word in vib:
                    if word in ["+", "_"] or word.isdigit():
                        # Separators and numbers are copied through.
                        trans_vib.append(word)
                    else:
                        trans_vib.append(self.transform(word))
                FEATS[vib_id] = "vib-%s" % "".join(trans_vib)
                fields[5] = "|".join(FEATS)

            trans_LINES.append("%s" % "\t".join(fields))
        return "\n".join(trans_LINES)

    def convert(self, line):
        """Convert *line* according to ``self.format_``.

        Returns the converted text. For an unknown format an error message is
        written to stderr and None is returned.
        """
        if self.format_ == "text":
            return self.transform(line)
        if self.format_ == "ssf":
            return self.convert_ssf(line)
        if self.format_ == "conll":
            return self.convert_conll(line)
        if self.format_ in ["bio", "tnt"]:
            trans_LINES = []
            for row in line.split("\n"):
                columns = row.split()
                if not columns:
                    # NOTE(review): the body of this branch was lost; keeping
                    # the blank line is presumed.
                    trans_LINES.append("")
                    continue
                # Only the first column (the word form) is transliterated.
                columns[0] = self.transform(columns[0])
                trans_LINES.append("%s" % "\t".join(columns))
            return "\n".join(trans_LINES)
        sys.stderr.write("FormatError: invalid format :: %s\n" % self.format_)
        return None