Commit 71e1b081 authored by priyank's avatar priyank

updated tokenizer

parent e0d08a7a
......@@ -37,6 +37,7 @@ sub prune_default{
sub process {
my %par = @_;
utf8::encode($par{"data"});
my $input = $par{'data'};
read_story(\$input);
......@@ -72,6 +73,7 @@ sub process {
select(OUTFILE);
printstory();
select(STDOUT);
utf8::decode($result);
return $result;
}
......
......@@ -4,6 +4,13 @@ use warnings;
use Dir::Self;
use Data::Dumper;
use IPC::Run qw(run);
use List::UtilsBy qw(max_by);
use File::Temp qw/ tempfile /;
use File::Slurp qw( slurp );
my $cwd = __DIR__;
my %daemons = (
"tokenizer" => {
"path" => "ind-tokz",
......@@ -12,25 +19,23 @@ my %daemons = (
}
);
sub process {
my %args = @_;
utf8::encode($args{data});
my $sentences = call_daemon("tokenizer", $args{data});
open INFILE, '<', \$sentences or die $!;
my $result = "";
my $ctr = 0;
while (my $line = <INFILE>) {
$ctr ++;
$result .= "<Sentence id=\"$ctr\">\n";
my @words = split ' ', $line;
foreach my $index (0..$#words) {
$result .= $index + 1 . "\t$words[$index]\tunk\n";
}
$result .= "</Sentence>";
}
close INFILE;
utf8::decode($result);
return $result;
my ($fh2, $filename2) = tempfile("tokenizer_inputXXXX", DIR => "/tmp", SUFFIX => ".tmp");
print $fh2 $args{"data"};
close($fh2);
my $token_out;
run ["python", "$cwd/tokenize.py", $filename2], ">", \$token_out;
unlink $filename2 or die "Couldn't delete temp file! $filename2";
utf8::decode($token_out);
return $token_out;
}
sub run_daemons {
......@@ -62,6 +67,6 @@ sub call_daemon {
return $result;
}
run_daemons(("tokenizer"));
#run_daemons(("tokenizer"));
1;
package ILMT::KAN::HIN::Tokenizer;
use strict;
use warnings;
use Dir::Self;
use Data::Dumper;
my %daemons = (
"tokenizer" => {
"path" => "ind-tokz",
"args" => "--l kan --s --daemonize --port",
"port" => "8111"
}
);
# Tokenize text through the "tokenizer" daemon and wrap the result as SSF.
#
# Named arguments:
#   data => text to tokenize (encoded to UTF-8 bytes before it is sent).
#
# Each line returned by the daemon becomes one <Sentence id="N"> block whose
# rows are "<index>\t<word>\tunk". Returns the SSF text decoded back to
# Perl characters.
sub process {
    my %args = @_;

    # %args is a private copy, so encoding in place does not touch the caller.
    utf8::encode($args{data});
    my $sentences = call_daemon("tokenizer", $args{data});

    # Lexical in-memory handle instead of the package-global bareword INFILE.
    open my $in, '<', \$sentences
        or die "Cannot read tokenizer output: $!";

    my $result = "";
    my $ctr    = 0;
    while (my $line = <$in>) {
        $ctr++;
        $result .= "<Sentence id=\"$ctr\">\n";
        my @words = split ' ', $line;
        for my $index (0 .. $#words) {
            $result .= ($index + 1) . "\t$words[$index]\tunk\n";
        }
        # No newline after the closing tag: kept as in the original output.
        $result .= "</Sentence>";
    }
    close $in;

    utf8::decode($result);
    return $result;
}
# Launch each named daemon from %daemons in the background.
#
# The command is run through `flock` on a per-daemon run file under
# __DIR__/run, with a 0.01s lock wait. A non-zero status from system()
# only produces a warning.
sub run_daemons {
    for my $name (@_) {
        my ($path, $args, $port) = @{ $daemons{$name} }{qw(path args port)};

        my $launch   = "$path $args $port &";
        my $lockfile = __DIR__ . "/run/${name}_$port";

        my $status = system("flock -e -w 0.01 $lockfile -c '$launch'");
        next if $status == 0;

        warn "[" . __PACKAGE__ . "]: Port $port maybe unavailable! $?\n";
    }
}
# Send $input to the named daemon over TCP on 127.0.0.1 and return its reply.
#
# Arguments:
#   $daemon_name - key into %daemons; its "port" entry selects the port.
#   $input       - payload; a trailing newline is appended before sending.
#
# Reads until the peer closes the connection and returns everything read.
# Dies if the connection cannot be established.
sub call_daemon {
    my ($daemon_name, $input) = @_;

    # This file never loads the socket class itself, so load it on demand.
    require IO::Socket::INET;

    my $port   = $daemons{$daemon_name}{port};
    my $socket = IO::Socket::INET->new(
        PeerHost => '127.0.0.1',
        PeerPort => $port,
        Proto    => 'tcp',
    ) or die "ERROR in Socket Creation : $!\n";

    $socket->send("$input\n");

    my $result = "";
    # getline is a method call, so `while` adds no implicit defined() test;
    # without it a final line of "0" would end the loop early.
    while (defined(my $line = $socket->getline)) {
        $result .= $line;
    }
    $socket->close();

    return $result;
}
run_daemons(("tokenizer"));
1;
import os, sys, codecs
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created by
@author: priyank
'''
def tokenizer(text, ind):
    """Split ``text`` on whitespace and format it as one SSF sentence.

    ``ind`` is the zero-based sentence index; the emitted id is ``ind + 1``.
    Each token becomes a row ``<position>\\t<token>\\tunk`` with positions
    starting at 1. Returns the rows joined by newlines, without a trailing
    newline.
    """
    rows = ['<Sentence id="' + str(ind + 1) + '">']
    for position, word in enumerate(text.split(), 1):
        rows.append(str(position) + '\t' + word + '\tunk')
    rows.append('</Sentence>')
    return '\n'.join(rows)
# Script body: read the UTF-8 file named by the first command-line argument,
# turn every non-blank line into one SSF sentence and write the result to
# standard output as UTF-8 bytes.
with codecs.open(sys.argv[1], "rb", "utf-8") as f:
    lines = f.readlines()

chunks = []
ii = 0
for line in lines:
    line = line.strip()
    if line:
        chunks.append(tokenizer(line, ii) + "\n")
        # NOTE(review): assumes the sentence counter only advances for
        # non-blank lines (source indentation was lost) - confirm.
        ii = ii + 1
finalOutput = "".join(chunks)

# Write raw bytes so Python 3 does not print a b'...' repr; Python 2 has no
# sys.stdout.buffer and takes the byte string directly. The extra newline is
# the one print() used to append.
_stream = getattr(sys.stdout, "buffer", sys.stdout)
_stream.write(finalOutput.encode('utf-8') + b"\n")
[submodule "dependencies/indic-wx-converter"]
path = dependencies/indic-wx-converter
url = https://github.com/ltrc/indic-wx-converter
use strict;
use warnings;
use Data::Dumper;
use Graph::Directed;
use JSON;
use List::Util qw(reduce);
use Mojolicious::Lite;
use Mojo::Redis2;
use lib "./lib";
use ILMT::KAN::HIN::WXZ2UTF;
my $modulename = "ilmt.kan.hin.wx2utf";
my %database = ();
# Register a "redis" helper on the application. The client is held in a
# `state` variable, so it is constructed on the first call and the same
# object is returned on later calls.
helper redis => sub {
    state $r = Mojo::Redis2->new(url => "redis://redis:6379");
};
# Adapt the collected inputs of a job to the WXZ2UTF module's argument list.
#
# $_[0] is a hash reference of input name => value. With exactly one entry
# its value is passed as `data`; otherwise every key has its final
# "_<suffix>" removed and the pairs are passed through under those names.
# Returns whatever ILMT::KAN::HIN::WXZ2UTF::process returns.
sub process {
    my ($inputs) = @_;

    my %params;
    my @names = keys %{$inputs};

    if (@names == 1) {
        $params{data} = $inputs->{ $names[0] };
    }
    else {
        for my $name (@names) {
            (my $short = $name) =~ s/_[^_]*$//;
            $params{$short} = $inputs->{$name};
        }
    }

    return ILMT::KAN::HIN::WXZ2UTF::process(%params);
}
# Render a 400 response carrying {Error => $error}.
#
# Arguments: the controller and the error text. The hash is serialised with
# to_json and that string is handed to the json renderer. Returns the result
# of render(), which callers chain with `&& return`.
sub genError {
    my ($c, $error) = @_;

    my $payload = to_json({ Error => $error });
    return $c->render(json => $payload, status => 400);
}
# Build a Graph::Directed from an adjacency map.
#
# Argument: hash reference of source vertex => array reference of target
# vertices. Returns the graph, or nothing (undef in scalar context) when the
# argument is not a hash reference, so a caller can report missing edges
# instead of dying on the dereference.
sub genDAGGraph {
    my ($edges) = @_;
    return unless ref($edges) eq 'HASH';

    my $g = Graph::Directed->new();
    for my $from (keys %{$edges}) {
        for my $to (@{ $edges->{$from} }) {
            $g->add_edge($from, $to);
        }
    }
    return $g;
}
# POST /pipeline: receive one pipeline message for this module, collect the
# inputs it is waiting for and, once all have arrived, run process() and pass
# the result on to the successor modules in the DAG (or publish it to redis
# when there are none).
post '/pipeline' => sub {
    my $c = shift;
    my $ilmt_json = decode_json($c->req->body);
    # modid, jobid and data are mandatory; a false value renders an error via
    # genError and leaves the handler.
    my $ilmt_modid = $ilmt_json->{modid} || genError($c, "No ModuleID Specified!") && return;
    my $ilmt_jobid = $ilmt_json->{jobid} || genError($c, "No JobID Specified!") && return;
    my $ilmt_data = $ilmt_json->{data} || genError($c, "No Data Specified!") && return;
    my $ilmt_dag = genDAGGraph($ilmt_json->{edges});
    genError($c, "Edges not specified!") && return if (!$ilmt_dag);
    # This module's vertex name in the DAG is "<modulename>_<modid>".
    my $ilmt_module = $modulename . '_' . $ilmt_modid;
    # Source vertices of every edge that points at this module.
    my @ilmt_inputs = map {@$_[0]} $ilmt_dag->edges_to($ilmt_module);
    # First message for this job: create its two in-memory stores.
    if (!$database{$ilmt_jobid}) {
        $database{$ilmt_jobid} = {};
        $database{"data_$ilmt_jobid"} = {};
    }
    # Record each expected input present (and true) in this message, keyed by
    # the input vertex name with its trailing "_<suffix>" removed.
    foreach (@ilmt_inputs) {
        my $input_module = $_ =~ s/_[^_]*$//r;
        $database{$ilmt_jobid}{$input_module} = $ilmt_data->{$_} if $ilmt_data->{$_};
    }
    # Merge this message's data into the job-wide store; keys from the new
    # message overwrite stored ones.
    %{$database{"data_$ilmt_jobid"}} = (%{$database{"data_$ilmt_jobid"}}, %{$ilmt_data});
    # Proceed only when the number of collected inputs equals the number of
    # incoming edges.
    if (@ilmt_inputs == keys %{$database{$ilmt_jobid}}) {
        # The 202 response is rendered before the processing below runs.
        $c->render(json => "{Response: 'Processing...'}", status => 202);
        my $ilmt_output = process($database{$ilmt_jobid});
        $ilmt_data->{$ilmt_module} = $ilmt_output;
        # Fold the job-wide store back into the outgoing data; stored values
        # win over same-named keys already in $ilmt_data.
        %{$ilmt_data} = (%{$ilmt_data}, %{$database{"data_$ilmt_jobid"}});
        # Target vertices of every edge leaving this module.
        my @ilmt_next = map {@$_[1]} $ilmt_dag->edges_from($ilmt_module);
        if (@ilmt_next) {
            foreach (@ilmt_next) {
                # Split "<module>_<modid>" at the last underscore; the module
                # part is used as the host name of the next service.
                my @module_info = split(/_([^_]+)$/, $_);
                my $next_module = $module_info[0];
                $ilmt_json->{modid} = $module_info[1];
                # Non-blocking POST; the callback only logs the outcome.
                $c->ua->post("http://$next_module/pipeline" => json
                    => from_json(encode_json($ilmt_json), {utf8 => 1}) => sub {
                        my ($ua, $tx) = @_;
                        my $msg = $tx->error ? $tx->error->{message} : $tx->res->body;
                        $c->app->log->debug("[$ilmt_jobid]: $msg\n");
                    });
            }
        } else {
            # No successors: publish the final message on the job's channel.
            $c->redis->publish($ilmt_jobid => encode_json($ilmt_json));
        }
        # NOTE(review): only the input store is removed here; the
        # "data_<jobid>" entry is left in %database - confirm this is intended.
        delete $database{$ilmt_jobid};
    } else {
        $c->render(json => "{Response: 'Waiting for more inputs...'}", status => 202);
    }
};
app->start;
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/
Jul-2015 Irshad Ahmad <irshad.bhat@research.iiit.ac.in>
* Version 1.5.0
- Gujarati language support added.
-
Jul-2015 Irshad Ahmad <irshad.bhat@research.iiit.ac.in>
* Version 1.5.0
- Python implementation
-
Jan-2012 Rashid Ahmad <rashid101b@gmail.com>
* Version 1.4.9
- Oriya language support added.
-
29-Dec-2011 Rashid Ahmad <rashid101b@gmail.com>
* Version 1.4.8
- fixed reading and writing of utf8 files by passing the -C option on the perl
command line (e.g. perl -C prog.pl <options> <input-file>)
- fix the issue consonant+nukata+halant in Hindi (e.g. word: wanaKZvaxara)
- head and name conversion in ssf is avoided as per HCU-CALTS feedback.
14-Mar-2011 Rashid Ahmad <rashid101b@gmail.com>
* Version 1.4.7
- In wx2utf the words start with ^ or ^@ just leave as it is.
- In case of tamil wx2utf 0BB6 (S) mapped to 0BB7 (R)
- M map is changed m+halant (as per Dr. Ramnan feedback)
7-Mar-2011 Rashid Ahmad <rashid101b@gmail.com>
* Version 1.4.6
- iscii2unicode_tel function some mapping commented like ऴ etc.
28-Feb-2011 Rashid Ahmad <rashid101b@gmail.com>
* Version 1.4.5
- fix ऩ issue from wx2utf in telugu.
- case added in tests directory (sampale-cases-tel-wx.txt)
- tamil normalize issue with two-part dependent vowel signs solved.
21-Feb-2011 Rashid Ahmad <rashid101b@gmail.com>
* Version 1.4.4
- fix punc issue in ssf input.
- fix ~ issue in input and output file path.
06-Apr-2010 Rashid Ahmad <rashid101b@gmail.com>
* Version 1.4.3
- incorporate the output option from the command line option.
- create convertor_indic_lib.pl for convertor library.
- create sample-convertor-call.pl for how to call convertor library.
28-Feb-2011 Rashid Ahmad <rashid101b@gmail.com>
* Version 1.4.5
- fix ऩ issue from wx2utf in telugu.
- case added in tests directory (sampale-cases-tel-wx.txt)
- tamil normalize issue with two-part dependent vowel signs solved.
21-Feb-2011 Rashid Ahmad <rashid101b@gmail.com>
* Version 1.4.4
- fix punc issue in ssf input.
- fix ~ issue in input and output file path.
06-Apr-2010 Rashid Ahmad <rashid101b@gmail.com>
* Version 1.4.3
- incorporate the output option from the command line option.
- create convertor_indic_lib.pl for convertor library.
- create sample-convertor-call.pl for how to call convertor library.
18-Mar-2010 Rashid Ahmad <rashid101b@gmail.com>
* Version 1.4.2
- wx2utf.pl and utf2wx.pl files function are moved in lib/IndicCC.pl.
- incorporate the small code in wx2utf function in lib/IndicCC.pl.
- to escape strings in other languages, leaving them as they are.
18-Mar-2010 Rashid Ahmad <rashid101b@gmail.com>
* Version 1.4.1
- tested 1050 uniq words. input file in tests/tam/uniq-words-tam-aukbc-[wx|utf].txt
- issues related to wx2utf are fixed by incorporating the various cases in the wx2iscii sub in lib/IndicCC.pl
- issues regarding utf2wx are fixed by incorporating 3 lines in the iscii2wx sub in lib/IndicCC.pl
- "\x{D0}"=> rY, "\x{C7}"=>nY, "\x{D3}"=>lYY
- added the support of tamil language.
15-Mar-2010 Rashid Ahmad <rashid101b@gmail.com>
* Version 1.4.0
- test wx2utf-tel, utf2wx-tel and fix the various cases like lYa,lYi, lY[MHz] etc.
- resolve full stop issue from wx2utf direction for hindi.
- we compared the changed IndicCC.pl to the old one and undid the changes
that Harika Indu made.
08-May-2010 Rashid Ahmad <rashid101b@gmail.com>
* Version 1.3.0
- text conversion incorporated.
- input option incorporated in command line argument parsing.
- tel ssf test data incorporated.
03-03-2010 Rashid Ahmad <rashid101b@gmail.com>
* Version 1.2.0
- changed as per requirement of ILMT project Sampark.
- this version is currently working on SSF format only.
- test on hindi data ONLY.
08-02-10 Rashid Ahmad <rashid101b@gmail.com>
* Version 1.1.0
- This version is modified of the base version.
- created the sample pl file for test convertor.
- Base version is developed in perl by Pawan Kumar, Rashid Ahmad and Avinesh.
10-Aug-09 Pawan Kumar <hawahawai@gmail.com>
* Version 1.0.0
- Base Version.
The MIT License (MIT)
Copyright (c) 2015 Irshad Ahmad
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
#! /usr/bin/env python
import re
import sys
import socket
import argparse
import StringIO
import threading
from argparse import RawTextHelpFormatter
from .ilp import wxConvert
__name__ = "converter-indic"
__doc__ = "python-converter-indic: Converts Indian languages to WX (ASCII) and vice-versa"
__author__ = "Irshad Ahmad"
__version__ = "1.0.3"
__license__ = "MIT"
__maintainer__ = "Irshad Ahmad"
__email__ = "irshad.bhat@research.iiit.ac.in"
__status__ = "Beta"
__all__ = ["ilp", "wxILP", "ssf_reader", "main"]
_MAX_BUFFER_SIZE_ = 102400 #100KB
class ClientThread(threading.Thread):
    """Serve one daemon-mode client connection on its own thread.

    The thread reads a single request from the socket, runs it through
    ``processInput`` using in-memory files, sends the converted text back
    and closes the connection.
    """

    def __init__(self, ip, port, clientsocket, args, con):
        """Store the peer address, the socket, the parsed CLI ``args`` and
        the converter ``con`` used for this connection."""
        threading.Thread.__init__(self)
        self.ip = ip
        self.con = con
        self.port = port
        self.args = args
        self.csocket = clientsocket

    def run(self):
        """Handle the request, always closing the socket afterwards."""
        try:
            # A single recv: at most _MAX_BUFFER_SIZE_ bytes of the request
            # are read.
            data = self.csocket.recv(_MAX_BUFFER_SIZE_)
            fakeInputFile = StringIO.StringIO(data)
            fakeOutputFile = StringIO.StringIO("")
            try:
                processInput(fakeInputFile, fakeOutputFile, self.args, self.con)
                # sendall keeps writing until the whole reply is sent; plain
                # send() may transmit only part of a large reply.
                self.csocket.sendall(fakeOutputFile.getvalue())
            finally:
                fakeInputFile.close()
                fakeOutputFile.close()
        finally:
            # Close the connection even when conversion or sending fails.
            self.csocket.close()
def processInput(ifp, ofp, args, con):
    """Convert everything readable from ``ifp`` and write it to ``ofp``.

    For ``args.format_ == "ssf"`` the whole input is read and each
    ``<Sentence ...> ... </Sentence>`` block is converted on its own: the
    opening tag is echoed, the stripped body goes through ``con.convert``,
    and the closing tag is re-emitted (preceded by a tab and "))" when
    ``args.nested`` is set). For every other format the input is converted
    line by line.
    """
    if args.format_ != "ssf":
        for line in ifp:
            ofp.write(con.convert(line))
        return

    if args.nested:
        pattern = "(<Sentence id=.*?>\s*\n.*?)\n(.*?)\)\)\s*\n</Sentence>"
    else:
        pattern = "(<Sentence id=.*?>)(.*?)</Sentence>"

    for match in re.finditer(pattern, ifp.read(), re.S):
        header = match.group(1)
        body = match.group(2).strip()
        ofp.write('%s\n' % header)
        ofp.write('%s' % con.convert(body))
        if args.nested:
            ofp.write("\t))\n")
        ofp.write("</Sentence>\n\n")
def main():
    """Command-line entry point of converter-indic.

    Parses the options, builds a ``wxConvert`` object for the requested
    direction/format/language and then either serves TCP clients forever
    (``--daemonize``) or converts ``--i`` into ``--o`` once.
    Exits through argparse's error path when ``--f ssf`` is given without
    ``--t``.
    """
    format_list = 'text ssf conll bio tnt'.split()
    languages = 'hin tel tam mal kan ben ori pan mar nep guj bod kok asm urd'.split()
    # help messages
    src_enc_help = "select input-file encoding [utf|wx]"
    format_help = "select output format [text|ssf|conll|bio|tnt]"
    lang_help = """select language (3 letter ISO-639 code)
Hindi : hin
Telugu : tel
Tamil : tam
Malayalam : mal
Kannada : kan
Bengali : ben
Oriya : ori
Punjabi : pan
Marathi : mar
Nepali : nep
Gujarati : guj
Bodo : bod
Konkani : kok
Assamese : asm
Urdu : urd"""
    ssf_help = "specify ssf-type [inter|intra] in case file format (--f) is ssf"
    # parse command line arguments
    parser = argparse.ArgumentParser(prog="converter-indic",
                                     description="wx-utf converter for Indian languages",
                                     formatter_class=RawTextHelpFormatter)
    parser.add_argument('--v', action="version", version="%(prog)s 1.0.3")
    parser.add_argument('--l', metavar='language', dest="lang", choices=languages, default="hin", help="%s" %lang_help)
    parser.add_argument('--s', metavar='source', dest="src_enc", choices=["utf","wx"], default="utf", help="%s" %src_enc_help)
    parser.add_argument('--f', metavar='format', dest="format_", choices=format_list, default="text", help="%s" %format_help)
    parser.add_argument('--t', metavar='ssf-type', dest="ssf_type", choices=["inter","intra"], default=None, help=ssf_help)
    parser.add_argument('--n', dest='nested', action='store_true', help="set this flag for nested ssf")
    parser.add_argument('--m', dest='mask', action='store_false', help="set this flag to keep off masking of roman strings in Indic text")
    parser.add_argument('--i', metavar='input', dest="infile", type=argparse.FileType('r'), default=sys.stdin, help="<input-file>")
    parser.add_argument('--o', metavar='output', dest="outfile", type=argparse.FileType('w'), default=sys.stdout, help="<output-file>")
    parser.add_argument('--daemonize', dest='isDaemon', help='Do you want to daemonize me?', action='store_true', default = False)
    parser.add_argument('--port', type=int, dest='daemonPort', default=5000, help='Specify a port number')
    args = parser.parse_args()

    if args.format_ == 'ssf' and not args.ssf_type:
        # parser.error prints the usage line followed by
        # "converter-indic: error: argument --t: not specified" on stderr
        # and exits with a non-zero status (the old code exited with 0).
        parser.error("argument --t: not specified")

    # set conversion direction
    if args.src_enc == "utf":
        src_trg = "utf2wx"
    else:
        src_trg = "wx2utf"

    # initialize converter object
    con = wxConvert(src_trg, args.format_, args.lang, args.ssf_type, args.nested, args.mask)

    if args.isDaemon:
        host = "0.0.0.0"  # Listen on all interfaces
        port = args.daemonPort  # Port number
        tcpsock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        tcpsock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        tcpsock.bind((host, port))
        sys.stderr.write('Listening at %d\n' % port)
        # Put the socket into listening mode once rather than on every pass.
        tcpsock.listen(4)
        while True:
            # The peer port gets its own name so it does not overwrite the
            # listening port above.
            (clientsock, (ip, client_port)) = tcpsock.accept()
            # pass clientsock to the ClientThread thread object being created
            newthread = ClientThread(ip, client_port, clientsock, args, con)
            newthread.start()
    else:
        processInput(args.infile, args.outfile, args, con)
        # close files
        args.infile.close()
        args.outfile.close()
if __name__ == '__main__':
main()
#!/usr/bin/env python
import sys
# When this file is executed directly (no package context and not a frozen
# executable), put the directory that contains the package on sys.path so
# the absolute import below can resolve.
if __package__ is None and not hasattr(sys, "frozen"):
    # direct call of __main__.py
    import os.path
    path = os.path.realpath(os.path.abspath(__file__))
    sys.path.append(os.path.dirname(os.path.dirname(path)))
import converter_indic
# Hand control to the package's command-line entry point.
if __name__ == "__main__":
    converter_indic.main()
#!/usr/bin/env python
# Copyright Irshad Ahmad Bhat 2015.
"""WX convertor: converts Indian languages to ASCII and vice-versa
WX notation is a transliteration scheme for representing Indian languages in ASCII.
For more details on WX go to <https://en.wikipedia.org/wiki/WX_notation>.
This module is a UTF (Indian Scripts) to Roman (WX) convertor and vice-versa that:
- converts text in 10 Indian languages viz. Hindi, Tamil, Telugu, Malayalam,
Bengali, Kannada, Oriya, Punjabi, Marathi and Nepali.
- handles 5 data formats viz. plain-text, ssf, conll, bio and tnt.
"""
import os
import re
import sys
from .wxILP import wxilp
from .ssf_reader import SSFReader
class wxConvert():
    """WX convertor (UTF to WX and vice-versa)

    Used to convert text in Indian languages to ASCII and back. It can be
    used for 10 Indian languages viz. Hindi, Tamil, Telugu, Malayalam,
    Bengali, Kannada, Oriya, Punjabi, Marathi and Nepali in 5 data formats
    viz. plain-text, ssf, conll, bio and tnt.
    """

    def __init__(self, order="wx2utf",
                 format_="text",
                 lang="hin",
                 ssf_type=None,
                 nested=False,
                 rmask=True):
        """Configure the conversion.

        ``order`` selects the direction: "wx2utf" uses the ``wx2utf`` method
        of the underlying ``wxilp`` object, anything else uses ``utf2wx``.
        ``format_``, ``ssf_type`` and ``nested`` are stored for ``convert``;
        ``lang``, ``order`` and ``rmask`` are passed to ``wxilp``.
        """
        self.lang = lang
        self.nested = nested
        self.format_ = format_
        self.ssf_type = ssf_type
        # Placeholder tokens that are copied through without conversion.
        # Set here so convert_ssf also works when called directly.
        self.special = set(['null', 'NULL', 'COMMA', 'SINGLE_QUOTE', '-JOIN'])
        wxp = wxilp(self.lang, order, rmask)
        self.transform = wxp.wx2utf if order=="wx2utf" else wxp.utf2wx

    def _maybe_transform(self, text):
        """Return ``text`` converted, unless it is a special placeholder."""
        return text if text in self.special else self.transform(text)

    def convert_ssf(self, sentence):
        """Convert SSF data

        ``sentence`` is the body of one SSF sentence. Every node is rebuilt
        as ``id<TAB>word<TAB>tag<TAB><fs ...>`` with its textual fields
        converted according to ``ssf_type``; nodes without an id are emitted
        as a closing-bracket row. Returns the rebuilt text.
        """
        consen = str()
        obj = SSFReader(sentence)
        obj.getAnnotations()
        for node, order in zip(obj.nodeList, obj.fs_order):
            if self.ssf_type == 'intra' or (self.ssf_type == 'inter' and not node.id.isdigit()):
                name = self._maybe_transform(node.name)
                head = self.transform(node.head) if self.nested else node.head
            else:
                name = node.name
                head = self._maybe_transform(node.head)
            if self.ssf_type == 'intra':
                parent = self._maybe_transform(node.parent)
            else:
                parent = node.parent
            wordForm = self._maybe_transform(node.wordForm)
            dmrel_ = 'dmrel' if node.dmrel else 'drel'
            ssfNode = [node.id, wordForm, node.posTag]
            if isinstance(node.af, tuple):
                nL = node.af
                lemma = self._maybe_transform(nL.lemma)
                vib = self._maybe_transform(nL.vib)
                features = ",".join((lemma, nL.cat, nL.gen, nL.num, nL.per, nL.case, vib, nL.tam))
            else:
                features = node.af
            fs = [
                "af='%s'" % (features) if node.af else '',
                "name='%s'" % (name) if name else None,
                "head='%s'" % (head) if head else None,
                "chunkId='%s'" % (node.chunkId) if (node.chunkId and node.chunkType == 'head') else None,
                "chunkType='%s:%s'" % (node.chunkType, node.chunkId) if node.chunkType else None,
                "posn='%s'" % (node.posn) if node.posn else None,
                "vpos='%s'" % (node.vpos) if node.vpos else None,
                "%s='%s:%s'" % (dmrel_, node.drel, parent) if node.drel else None,
                "coref='%s:%s'" % (node.corel, node.coref) if node.coref else None,
                "stype='%s'" % (node.stype) if node.stype else None,
                "voicetype='%s'" % (node.voicetype) if node.voicetype else None,
                "poslcat='%s'" % (node.poslcat) if node.poslcat else None,
                "mtype='%s'" % (node.mtype) if node.mtype else None,
                "troot='%s'" % (node.troot) if node.troot else None,
                "etype='%s'" % (node.etype) if node.etype else None,
                "etype_root='%s'" % (node.etype_root) if node.etype_root else None,
                "emph='%s'" % (node.emph) if node.emph else None,
                "esubtype='%s'" % (node.esubtype) if node.esubtype else None,
                "etype_name='%s'" % (node.etype_name) if node.etype_name else None,
                "agr_num='%s'" % (node.agr_num) if node.agr_num else None,
                "hon='%s'" % (node.hon) if node.hon else None,
                "agr_cas='%s'" % (node.agr_cas) if node.agr_cas else None,
                "agr_gen='%s'" % (node.agr_gen) if node.agr_gen else None #NOTE add node
            ]
            # Move the attributes named in ``order`` to the front, one by one.
            fs_ = fs[:]
            for idx in order:
                fs.remove(fs_[idx])
                fs.insert(0, fs_[idx])
            fs = "<fs %s>" % (" ".join(filter(None, fs)))
            if node.id:
                consen += "%s\n" %("\t".join(ssfNode+[fs]))
            else:
                consen += "%s\n" %("\t))")
        return consen

    def convert_conll(self, conll):
        """Convert CONLL data

        Converts the FORM and LEMMA columns and the value of the first
        ``vib-`` feature in the FEATS column of every non-blank line; blank
        lines are kept as empty lines. Returns the lines joined by newlines.
        """
        trans_LINES = list()
        # Python 2 only: byte-encode unicode input. The version test comes
        # first so the ``unicode`` name is never looked up on Python 3.
        if sys.version_info[0] < 3 and isinstance(conll, unicode):
            conll = conll.encode('utf-8')
        for line in conll.split("\n"):
            line = line.strip()
            if not line:
                trans_LINES.append("")
                continue
            line = line.split("\t")
            if len(line) != 10:
                sys.stderr.write("Warning: dimension mismatch (attributes < 10 or > 10) \n")
            FORM, LEMMA, FEATS = line[1], line[2], line[5].split("|")
            # Entity-like tokens (&...;) are left untouched; startswith and
            # endswith also cope with an empty column.
            if not (FORM.startswith("&") and FORM.endswith(";")):
                FORM = self.transform(FORM)
            if not (LEMMA.startswith("&") and LEMMA.endswith(";")):
                LEMMA = self.transform(LEMMA)
            line[1] = FORM if FORM.strip() else "_"
            line[2] = LEMMA if LEMMA.strip() else "_"
            vib_ids = [idx for idx, feat in enumerate(FEATS) if feat[:4] == "vib-"]
            if vib_ids:
                vib_id = vib_ids[0]
                # Drop exactly the "vib-" prefix; lstrip("vib-") would also
                # eat leading v/i/b/- characters of the value itself.
                vib = FEATS[vib_id][4:]
                pieces = " ".join(re.split("([+_0-9]+)", vib)).split()
                converted = list()
                for word in pieces:
                    if word in ["+", "_"] or word.isdigit():
                        converted.append(word)
                    else:
                        converted.append(self.transform(word))
                FEATS[vib_id] = "vib-%s" % "".join(converted)
            line[5] = "|".join(FEATS)
            trans_LINES.append("%s" % "\t".join(line))
        return "\n".join(trans_LINES)

    def convert(self, line):
        """Convert ``line`` according to the configured format.

        text: the whole string is converted. ssf / conll: delegated to the
        dedicated methods. bio / tnt: the first whitespace-separated column
        of every line is converted and columns are re-joined with tabs.
        Any other format writes an error to stderr and exits.
        """
        if self.format_ == "text":
            return self.transform(line)
        elif self.format_ == "ssf":
            return self.convert_ssf(line)
        elif self.format_ == "conll":
            return self.convert_conll(line)
        elif self.format_ in ["bio", "tnt"]:
            trans_LINES = list()
            for row in line.split("\n"):
                columns = row.split()
                if not columns:
                    trans_LINES.append("")
                    continue
                columns[0] = self.transform(columns[0])
                trans_LINES.append("%s" % "\t".join(columns))
            return "\n".join(trans_LINES)
        else:
            # sys.stderr is a file object, not a callable: use write().
            sys.stderr.write("FormatError: invalid format :: %s\n" % self.format_)
            sys.exit(1)
ﺀ ء
ﺁ آ
ﺍ ا
ﺏ ب
ﺓ ت
ﺕ ت
ﺙ ث
ﺝ ج
ﺡ ح
ﺥ خ
ﺩ د
ﺫ ذ
ﺭ ر
ﺯ ز
ﺱ س
ﺵ ش
ﺹ ص
ﺽ ض
ﻁ ط
ﻅ ظ
ﻉ ع
ﻍ غ
ﻑ ف
ﻕ ق
ﻙ ک
ك ک
ﻻ لا
ﻝ ل
ﻡ م
ﻥ ن
ﻩ ہ
ﻫ ھ
ﻬ ھ
ه ھ
ﺅ ؤ
ﻭ و
ﻱ ی
ي ی
ﻯ ی
ى ی
ﺉ ئ
٠ ۰
١ ۱
٢ ۲
٣ ۳
٤ ۴
٥ ۵
٦ ۶
٧ ۷
٨ ۸
٩ ۹
ؤ vY
ئ IY
ۓ EYY
أ aY
ۀ HY
ا a
آ A
ک k
گ g
چ c
ج j
ٹ t
ڈ d
ت w
د x
پ p
ب b
م m
ن n
ل l
ڑ dY
ں z
ث sY
س s
ص sYY
ش S
ح h
خ KY
ذ jY
ز jYY
ژ jVY
ض jyy
ظ jy
ر r
ط wY
ع EY
غ gY
ف PY
ق q
و v
ہ H
ی I
ے E
ِ i
َ e
ُ u
ٗ U
ّ R
ْ Z
ٰ qV
ٖ GV
ً qf
ٍ qF
ھ hY
ٔ Q
ء QY
ٕ QYY
؛ ;
، ,
۔ .
٪ %
٭ *
؟ ?
٬ '
‘ '
’ '
“ "
” "
۰ 0
۱ 1
۲ 2
۳ 3
۴ 4
۵ 5
۶ 6
۷ 7
۸ 8
۹ 9
K کھ
G گھ
C چھ
J جھ
T ٹھ
D ڈھ
W تھ
X دھ
P پھ
B بھ
M مھ
N نھ
L لھ
a ا
A آ
k ک
g گ
c چ
j ج
t ٹ
d ڈ
w ت
x د
p پ
b ب
m م
n ن
l ل
z ں
s س
S ش
h ح
r ر
q ق
v و
H ہ
I ی
E ے
i ِ
e َ
u ُ
U ٗ
R ّ
Z ْ
Q ٔ
; ؛
, ،
. ۔
% ٪
* ٭
? ؟
0 ۰
1 ۱
2 ۲
3 ۳
4 ۴
5 ۵
6 ۶
7 ۷
8 ۸
9 ۹
#!/usr/bin/python
# Copyright Riyaz Ahmad Bhat, Irshad Ahmad Bhat 2015.
import re
from collections import namedtuple, OrderedDict
class SSFReader():
    """Parse the body of one SSF sentence into a list of node tuples.

    After ``getAnnotations`` has run, ``nodeList`` holds one ``node``
    namedtuple per input line and ``fs_order`` holds, for the same line, the
    indices of the feature-structure attributes that were present, in
    reverse order of appearance.
    """

    # Attributes stored as-is once quote characters have been removed.
    _PLAIN_KEYS = ('name', 'head', 'posn', 'vpos', 'poslcat', 'mtype', 'troot',
                   'stype', 'voicetype', 'etype', 'etype_root', 'emph', 'esubtype',
                   'etype_name', 'agr_num', 'hon', 'agr_cas', 'agr_gen')

    def __init__ (self, sentence):
        self.tokens = list()
        self.fs_order = list()
        self.nodeList = list()
        self.sentence = sentence
        # Defined up front so a sentence that starts with a closing bracket
        # (no feature structure parsed yet) does not fail in buildNode.
        self.dmrel_ = False
        fs_node = ('af', 'name', 'head', 'chunkId', 'chunkType', 'posn', 'vpos', 'drel', 'coref',
                   'stype', 'voicetype', 'poslcat', 'mtype', 'troot', 'etype', 'etype_root', 'emph',
                   'esubtype', 'etype_name', 'agr_num', 'hon', 'agr_cas', 'agr_gen') #NOTE add node
        nodes = ('id', 'wordForm', 'posTag', 'af', 'name', 'head', 'chunkId', 'chunkType', 'posn',
                 'vpos', 'drel', 'coref', 'stype', 'voicetype', 'poslcat', 'mtype', 'troot', 'corel',
                 'parent', 'dmrel', 'etype', 'etype_root', 'emph', 'esubtype', 'etype_name', 'agr_num',
                 'hon', 'agr_cas', 'agr_gen') #NOTE add node
        self.node = namedtuple('node', nodes)
        self.maping = dict(zip(fs_node, range(len(fs_node))))
        self.features = namedtuple('features', ('lemma', 'cat', 'gen', 'num', 'per', 'case', 'vib', 'tam'))

    @staticmethod
    def _unquote(value):
        """Remove every single and double quote character from ``value``."""
        return re.sub("'|\"", '', value)

    def morphFeatures (self, af):
        """LEMMA, CAT, GEN, NUM, PER, CASE, VIB, TAM

        Strips the surrounding quote characters from ``af`` and splits it on
        commas; exactly eight fields are required.
        """
        af = af[1:-1].split(",")
        assert len(af) == 8 #NOTE no need to process trash!
        return af

    def buildNode(self, id_, form_, tag_, pairs_):
        """Append one node built from a line's columns and its attributes.

        ``pairs_`` maps attribute names to their raw (still quoted) values.
        Fields that are not supplied default to the empty string; unknown
        attributes are ignored.
        """
        fields = dict.fromkeys(self.node._fields, str())
        fields['id'] = id_
        fields['wordForm'] = form_
        # Keep only ASCII characters of the tag (works on Python 2 and 3).
        fields['posTag'] = "".join(ch for ch in tag_ if ord(ch) < 128)
        fields['dmrel'] = self.dmrel_
        for key, value in pairs_.items():
            if key == "af":
                fields['af'] = self.features(*self.morphFeatures(value))
            elif key == "chunkType":
                assert len(value.split(":", 1)) == 2 # no need to process trash! FIXME
                fields['chunkType'], fields['chunkId'] = self._unquote(value).split(":", 1)
            elif key == "drel":
                assert len(value.split(":", 1)) == 2 # no need to process trash! FIXME
                depRel_, parent_ = self._unquote(value).split(":", 1)
                assert depRel_ and parent_ # no need to process trash! FIXME
                fields['drel'], fields['parent'] = depRel_, parent_
            elif key == "coref":
                # "relation:target", or just a target when there is not
                # exactly one colon.
                try:
                    corel_, coref_ = self._unquote(value).split(":")
                except ValueError:
                    corel_, coref_ = '', self._unquote(value)
                fields['corel'], fields['coref'] = corel_, coref_
            elif key in self._PLAIN_KEYS:
                fields[key] = self._unquote(value)
        self.fs_order.append([self.maping[x] for x in pairs_.keys() if x in self.maping][::-1])
        self.nodeList.append(self.node(**fields))

    def FSPairs(self, FS):
        """Split a feature-structure string into an ordered attribute map.

        Whitespace-separated items without "=" are skipped. A ``dmrel``
        attribute is recorded on ``self.dmrel_`` and renamed to ``drel``.
        Values keep their quotes.
        """
        feats = OrderedDict()
        self.dmrel_ = False
        for feat in FS.split():
            if "=" not in feat:
                continue
            if 'dmrel' in feat:
                self.dmrel_ = True
                feat = feat.replace("dmrel", "drel")
            # Collapse repeated opening quotes after "af=".
            feat = re.sub("af='+", "af='", feat)
            # Split on the first "=" only so values may contain "=".
            attribute, value = feat.split("=", 1)
            feats[attribute] = value
        return feats

    def getAnnotations(self):
        """Parse every line of the sentence into ``nodeList``.

        Raises ValueError for a line without a tab. Lines whose first column
        is an integer are chunk rows, lines whose first column is dotted
        digits are word rows, anything else is treated as a closing bracket.
        Returns ``self``.
        """
        for line in self.sentence.split("\n"):
            if '\t' not in line:
                raise ValueError('Corrupted ssf: Tabs broken into spaces')
            line = line.split('\t')
            if line[0].isdigit():
                assert len(line) == 4 # no need to process trash! FIXME
                id_, oBraces_, Tag_ = line[:3]
                # [4:-1] drops the leading "<fs " and the trailing ">".
                attributeValue_pairs = self.FSPairs(line[3][4:-1])
                self.buildNode(id_, oBraces_, Tag_, attributeValue_pairs)
            elif line[0].replace(".", '').isdigit():
                id_, wordForm_, Tag_ = line[:3]
                attributeValue_pairs = self.FSPairs(line[3][4:-1])
                assert wordForm_.strip() and Tag_.strip() # no need to process trash! FIXME
                self.buildNode(id_, wordForm_, Tag_, attributeValue_pairs)
            else:
                self.buildNode('', '))', '', {})
        return self
This source diff could not be displayed because it is too large. You can view the blob instead.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import sys
import os.path
import warnings
# Prefer setuptools; fall back to distutils when it is not installed. The
# import has to sit inside the try block, otherwise the ImportError handler
# below can never run.
try:
    from setuptools import setup
    setuptools_available = True
except ImportError:
    from distutils.core import setup
    setuptools_available = False
try:
import py2exe
except ImportError:
if len(sys.argv) >= 2 and sys.argv[1] == 'py2exe':
print("Cannot import py2exe", file=sys.stderr)
exit(1)
# Options forwarded to py2exe under the "py2exe" key of `options`.
py2exe_options = dict(
    bundle_files=1,
    compressed=1,
    optimize=2,
    dist_dir='.',
    dll_excludes=['w9xpopen.exe'],
)

# Console program description used by py2exe.
py2exe_console = [dict(
    script="./converter_indic/__main__.py",
    dest_base="converter-indic",
)]

# Extra setup() keyword arguments for a py2exe build.
py2exe_params = dict(
    console=py2exe_console,
    options={"py2exe": py2exe_options},
    zipfile=None,
)

# Choose the extra setup() keyword arguments: a regular build ships the
# data files that exist on disk, a py2exe build uses py2exe_params.
if len(sys.argv) < 2 or sys.argv[1] != 'py2exe':
    files_spec = [
        ('share/doc/converter-indic', ['README.rst'])
    ]
    root = os.path.dirname(os.path.abspath(__file__))
    data_files = []
    for target_dir, candidates in files_spec:
        present = []
        for path in candidates:
            if os.path.exists(path):
                present.append(path)
                continue
            # Missing files are skipped with a warning instead of failing.
            warnings.warn('Skipping file %s since it is not present. Type make to build all automatically generated files.' % path)
        data_files.append((target_dir, present))
    params = dict(data_files=data_files)
    # NOTE(review): assumed to belong to the non-py2exe branch -- confirm.
    params['entry_points'] = {'console_scripts': ['converter-indic = converter_indic:main']}
else:
    params = py2exe_params
# Get the package version: executing version.py in this namespace is
# what defines __version__ for the setup() call below.
with open('converter_indic/version.py') as version_file:
    exec(compile(version_file.read(),
                 'converter_indic/version.py', 'exec'))

# Read the long description up front so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open('README.rst', 'rb') as readme_file:
    long_description = readme_file.read().decode('utf8')

setup(
    name = "python-converter-indic",
    version = __version__,
    description="UTF to WX converter and vice-versa for Indian Languages",
    long_description = long_description,
    keywords = ['UTF', 'WX', 'Unicode', 'Computational Linguistics',
                'Indic', 'ASCII', 'conll', 'ssf', 'bio', 'tnt'],
    author='Irshad Ahmad',
    author_email='irshad.bhat@research.iiit.ac.in',
    maintainer='Irshad Ahmad',
    maintainer_email='irshad.bhat@research.iiit.ac.in',
    license = "MIT",
    url="https://github.com/irshadbhat/python-converter-indic",
    package_dir={'converter_indic':'converter_indic'},
    packages=['converter_indic'],
    package_data={'converter_indic': ['mapping/*']},
    classifiers=[
        "Topic :: Text Processing :: Linguistic",
        "Topic :: Software Development :: Libraries :: Python Modules",
        "Development Status :: 5 - Production/Stable",
        "Environment :: Console",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: MIT License",
        "Natural Language :: English",
        "Programming Language :: Python :: 2.7",
        "Operating System :: OS Independent"
    ],
    # Build-mode specific keyword arguments assembled earlier in the script.
    **params
)
1 nAnakaSAhI kElazdara nUz lAgU karavAuNa laI pahilI nUz azmriwasara ikaYwarawA
2 ca SAmila hovAMge : mAna
3 ApaNIAM haYkI mazgAM laI CewI wiYKA sazGaraSa viYDaNage vEtaranarI izsapEkatara
4 britiSa siYKa kOMsala vaYloM pAkisawAnoM ujadZa ke Ae siYKAM xI maxaxa
5 pazjAba vaYloM srI gurU rAmaxAsa havAI aYde nUz kOmAMwarI keMxara baNAe jANa xI mazga
6 kAnavEMta sakUlAM vaYloM sUbe xI pazjAbI BASA bolaNa
7 we pAbazxI lagAuNA sarAsara galawa : jaWe avawAra sizGa
8 sa. amarajIwa sizGa cAvalA pAratI xI pI.e.sI. xe mEMbara awe mIwa praXAna niyukawa
9 bAxala vaYloM peSa kIwA pazWaka ejazdA hAWI ke xAMwa
10 izdolA xe sakina nUz sUta karana vAle kalara xI reMja lAMca kIwI
11 jZilhA prISaxa xI mItizga xOrAna vaYKa-vaYKa mawe pAsa
12 kOmI paSU Xana cEMpIanaSipa pazjAba sarakAra xA supanamaI projEkata : gulajZAra sizGa raNIke
13 ca pazjAba sEra sapAtA Kewara viYca xeSa xA moharI sUbA baNiA
14 hariANA sarakAra lokAM nAla kIwe vAaxe pahila xe AXAra
15 we pUrA karegI : muYKa mazwarI
16 akAlI xala kOmAMwarI jAMca cOMkIAM nedZe lAegA 3 viSAla Xarane
17 we roka iYka hora mAmale
18 ca 5261 karodZa rupae xIAM sadZakI yojanAvAM nUz manajZUrI
19 84 siYKa kawaleAma xe xoSIAM nUz sajZA xavAuNa laI vikarama sizGa KAlasA xI BuYKa hadZawAla 21veM xina
20 BArawI janawA pAratI we samUha nagara GaYgA vaYloM mArakIta kametI pAwadZAM xe cearamEna niramala sizGa we upa cearamEna ravizxara
21 we puYjA, Xuzxa kArana rela we havAI sevA praBAviwa
22 celA hI nikaliA aXiApikA xA kAwala
23 akAlI xala kOmAMwarI jAMca cOMkIAM nedZe lAegA 3 viSAla Xarane
24 pAratI xe saraprasawa muYKa mazwarI bAxala vaYloM ‘naSA mukawa BArawa’ xA saYxA jalazXara/cazdIgadZha, 30 xasazbara- SromaNI akAlI xala vaYloM azwararASatarI sarahaYxa ’we jAMca cOMkIAM nedZe 3 viSAla Xarane lalAe jANage wAM jo naSiAM xI wasakarI viruYXa jAgarUkawA pExA karana xe nAla-nAla pAkisawAna nAla lagaxI sarahaYxa ’we cOkasI vaXAuNa xI lodZa ...
25 we roka iYka hora mAmale
26 ca 5261 karodZa rupae xIAM sadZakI yojanAvAM nUz manajZUrI
27 84 siYKa kawaleAma xe xoSIAM nUz sajZA xavAuNa laI vikarama sizGa KAlasA xI BuYKa hadZawAla 21veM xina
28 BArawI janawA pAratI we samUha nagara GaYgA vaYloM mArakIta kametI pAwadZAM xe cearamEna niramala sizGa we upa cearamEna ravizxara
29 akAlI xala kOmAMwarI jAMca cOMkIAM nedZe lAegA 3 viSAla Xarane
30 we roka iYka hora mAmale
31 ca 5261 karodZa rupae xIAM sadZakI yojanAvAM nUz manajZUrI
32 84 siYKa kawaleAma xe xoSIAM nUz sajZA xavAuNa laI vikarama sizGa KAlasA xI BuYKa hadZawAla 21veM xina
33 BArawI janawA pAratI we samUha nagara GaYgA vaYloM mArakIta kametI pAwadZAM xe cearamEna niramala sizGa we upa cearamEna ravizxara
34 muzbaI hamale xe pramuYKa sAjZiSI laKavI xI rihAI xA rasawA sAPZaBArawa vaYloM saKZawa rosa xA pragatAvA
35 we 4 jaghA viSAla XaraniAM xI wiArI laI akAlI xala ne cazdIgadZha we azmriwasara
36 rAjasaWAna we sarahizxa PIdara xe kinAriAM nUz majZabUwa banAuNa xA pazjAba awe rAjasaWAna sAMJA nigarAnI prajEkata SurU karanage
37 naSA wasakarI akAlI xala xI xeNa : kevala sizGa DiYloM
38 BArawa sarakAra 1984 xI nasalakuSI xe SikAra siYKAM nUz inasAPZa we jelhAM viYca bazxa siYKAM xI wurazwa rihAI karavAe : jaWe. avawAra sizGa
39 eara eSIA jahAjZa xe wabAha ho ke samuzxara
40 BagodZe karAra xiYwe pravAsI pazjAbIAM xI sucI xI hovegI mudZa samIKiA-muYKa mazwarI vaYloM naveM sAla xA wohaPA
41 siYKa pazWa viYca xaKala-azxAjZI baraxASawa nahIM kIwI jAvegI : giAnI gurabacana sizGa
42 AI.Ema.e. pazjAba xI coNa vi
43 sapIkara dAkatara caranajIwa sizGa atavAla savaYCa siAsawaxAna vajoM sanamAniwa
44 nAnakaSAhI kElazdara nUz lAgU karavAuNa laI pahilI nUz azmriwasara ikaYwarawA
45 ca SAmila hovAMge : mAna
46 azmriwasara, 30 xasazbara (pI. tI.)- ‘‘siYKa kOma xIAM vaYKa-vaYKa XAramika, siAsI awe samAjika jaWebazxIAM vaYloM jo siYKa kOma xI kOmAMwarI paYXara uwe nivekalI we aNaKIlI pahicANa saWApawa karana vAle nAnakasAhI kElazdara nUz pUrana rUpa vica ika maYwa ho ke lAgU karana laI 01 janavarI 2015 nUz srI akAla waKZawa sAhiba ...
47 ApaNIAM haYkI mazgAM laI CewI wiYKA sazGaraSa viYDaNage vEtaranarI izsapEkatara
48 kAnavEMta sakUlAM vaYloM sUbe xI pazjAbI BASA bolaNa
49 we pAbazxI lagAuNA sarAsara galawa : jaWe avawAra sizGa
50 sa. amarajIwa sizGa cAvalA pAratI xI pI.e.sI. xe mEMbara awe mIwa praXAna niyukawa
51 izdolA xe sakina nUz sUta karana vAle kalara xI reMja lAMca kIwI
52 jZilhA prISaxa xI mItizga xOrAna vaYKa-vaYKa mawe pAsa
53 kOmI paSU Xana cEMpIanaSipa pazjAba sarakAra xA supanamaI projEkata : gulajZAra sizGa raNIke
54 britiSa siYKa kOMsala vaYloM pAkisawAnoM ujadZa ke Ae siYKAM xI maxaxa
55 navIM xiYlI, 30 xasazbara (manaprIwa sizGa KAlasA)- pakisawAna vico ujadZa ke Ae siYKa parivAra jo ki BArawa vicale hariANAM xe ParIxAbAxa vica JopadZapaYtI baNa ke SaranAraWI vajoM bahuwa hI bure hAlAwAM vica rahi rahe hana, xI Sosala mIdiA vica vIdio jArI hoNa piCo britISa siYKa kosala valoM maxaxa kIwI gaI ...
56 pazjAba vaYloM srI gurU rAmaxAsa havAI aYde nUz kOmAMwarI keMxara baNAe jANa xI mazga
57 bAxala vaYloM peSa kIwA pazWaka ejazdA hAWI ke xAMwa
58 raGabIra sizGa jOdZA nUz saxamA, mAwA savaragavAsa
59 italI ne jahAjZa woM 265 yAwarIAM nUz bacAiA
60 roma, 29 xasazbara (pI. tI.)- italI xIAM bacAa kArajAM vica laagIAM tImAM ne bIwI rAwa Bara samuzxarI jahAjZa jisa nUz garIsa xe samuzxarI waata nedZe aaga laaga gaI sI woM 265 viakawI nUz suraaKiawa bacAa liA hE para aje vI 200 woM vaXere yAwarI jahAjZa ‘we Pase hoe hana. hElIkApatara ...
61 pAki ne yU Ena xI PAMsI rokaNa xI apIla kIwI ra
62 wavAxIAM xe BuleKe mAre 3 nAgarika
63 kAnUznI KZAmIAM kArana milI jZamAnawa
64 ca ApaNe ledI lava nAla niU Iara manAuNage !
65 muzbaI- bAlIvuYda aBinewA raNabIra kapUra inhIM xinIM naveM sAla xA jaSana manAuNa xI palAnizga kara rahe hana. sUwarAM muwAbaka raNabIra kapUra aBinewarI kEtarInA kEPa nAla lazdana
66 ca navAM sAla manAuNa bAre soca rahe hana. iha xoveM lazdana
67 ca hI milaNage we uWe hI naveM sAla xA jaSama manAuNage.KAla gaYla iha ...
68 palavAna nahIM dAirEkatara baNanA cAhuzxe hana: Amira
69 merIAM PilamAM xeKaNA pasazxa nahIM betI niAsA nUz : kAjola
70 yAmI gOwama : kAmayAbI xA koI PAramUlA nahIM
71 sonama kapUra sZOka KANa xA
72 sarakAra ne vAaxe pUre nA kIwe wAM mudZa azxolana karAMge
73 kabaYdI KidArI xA golI mAra ke kIwA kawala
74 pIdZawa nUz 7 sAla woM hE muAvajZe xA izwajZAra
75 moxI gujarAwa awe yU.pI woM ladZanage loka saBA coNa?
76 bAxalAM vaYloM kOmI KurAka suraYKiA progarAma xA viroXa karanA ximAgI bOKalAhata awe GabarAhata xI nisZAnI : PZawiha bAjavA
77 nAnakaSAhI kElazdara nUz lAgU karavAuNa laI pahilI nUz azmriwasara ikaYwarawA
78 ca SAmila hovAMge : mAna
79 ApaNIAM haYkI mazgAM laI CewI wiYKA sazGaraSa viYDaNage vEtaranarI izsapEkatara
80 britiSa siYKa kOMsala vaYloM pAkisawAnoM ujadZa ke Ae siYKAM xI maxaxa
81 pazjAba vaYloM srI gurU rAmaxAsa havAI aYde nUz kOmAMwarI keMxara baNAe jANa xI mazga
82 kAnavEMta sakUlAM vaYloM sUbe xI pazjAbI BASA bolaNa
83 we pAbazxI lagAuNA sarAsara galawa : jaWe avawAra sizGa
84 nAnakaSAhI kElazdara nUz lAgU karavAuNa laI pahilI nUz azmriwasara ikaYwarawA
85 ca SAmila hovAMge : mAna
86 ApaNIAM haYkI mazgAM laI CewI wiYKA sazGaraSa viYDaNage vEtaranarI izsapEkatara
87 britiSa siYKa kOMsala vaYloM pAkisawAnoM ujadZa ke Ae siYKAM xI maxaxa
88 pazjAba vaYloM srI gurU rAmaxAsa havAI aYde nUz kOmAMwarI keMxara baNAe jANa xI mazga
89 kAnavEMta sakUlAM vaYloM sUbe xI pazjAbI BASA bolaNa
90 we pAbazxI lagAuNA sarAsara galawa : jaWe avawAra sizGa
91 nAnakaSAhI kElazdara nUz lAgU karavAuNa laI pahilI nUz azmriwasara ikaYwarawA
92 ca SAmila hovAMge : mAna
93 ApaNIAM haYkI mazgAM laI CewI wiYKA sazGaraSa viYDaNage vEtaranarI izsapEkatara
94 britiSa siYKa kOMsala vaYloM pAkisawAnoM ujadZa ke Ae siYKAM xI maxaxa
95 pazjAba vaYloM srI gurU rAmaxAsa havAI aYde nUz kOmAMwarI keMxara baNAe jANa xI mazga
96 kAnavEMta sakUlAM vaYloM sUbe xI pazjAbI BASA bolaNa
97 we pAbazxI lagAuNA sarAsara galawa : jaWe avawAra sizGa
98 sa. amarajIwa sizGa cAvalA pAratI xI pI.e.sI. xe mEMbara awe mIwa praXAna niyukawa
99 bAxala vaYloM peSa kIwA pazWaka ejazdA hAWI ke xAMwa
100 izdolA xe sakina nUz sUta karana vAle kalara xI reMja lAMca kIwI
1 wama 35 elYla peVlYli saMxarBaMlonU - anukRaNaM Sawruvulaku malle kAtlAdukovataM baMtIki naccaxu.
2 2007lo ajmIr‌loni KAjA moVyinuxxIn ciRwI xargAlo jarigina peludulo mugguru canipogA, maro 15 maMxi gAyapadina viRayaM weVlisiMxe.
3 gaNiwaM lAMti viRayaM guriMci blAgulu, vIkela xvArA carcalu jaragavaccunani aMxarikI arWamayiMxi.
4 ilA iMwa potIlo, yAMtI britiR vAwAvaraNaMlonu 155 nimuRAla nidivigala ‘hAmleVt’ nilici geVliciMxaMte axi xAni cakkaxanAniki nixarSaname!
5 avakASaM vaccina vAriki eVks‌pojar‌kUdA vaswuMxi.
6 I sinimAki exo natiMcAM aMte natiMcinattuMxi gAnI e oVkka sIn‌lo kUdA navvu weVppiMcalekapoyAdu.
7 AhAraMlo ‘meVgnIRiyaM’ pAwra eVMwa prAXAnyawanu kaligi uMtuMxo ‘sEns‌sEt’ xvArA grahiMcagaligAM.
8 patnAyak xarSakawvaMlo maxxineni rameR nirmiMcina ciwraM ‘brokar’.
9 ’’ ani mAXyamAla prawiniXulu adigina praSnakAni, ‘‘levu’’ ani jiyAMg yU ceVppina samAXAnaM kAni levu!
10 prANAnni, jIviwAnni PaNaMgApeVtti, niwya cEwanyaSIligA rUpAMwaraM ceVMxuwuMxi.
11 oVka roju uxayAnena wammudu annagAri kutIrAniki prakkana vunna mAmidipalYlYanu cUsina annayya viRayaM adigi weVlusukunnAdu.
12 svagrAmaM cittivalasalo bAla sarasvawi nAtyamaMdali, bAla sAviwri nAtya maMdali ane reVMdu nAtaka pariRaw‌lu uMdevani, nA Asakwi gamaniMcina vAru eVnno praxarSanalaku wIsukupoyArani vivariMcArAyana.
13 aMxukani nA rUMlo peVtti BaxraMgA bIgamesinAnu.
14 2005lo seVpteVMbar‌lo keMxra atavI, paryAvaraNa maMwriwvaSAKa nuMdi sWalAniki anumawi laBiMciMxi.
15 kAMgreVs praBuwvAnni pawanaM ceyataM wama uxxeSyaM kAxaMtUne, ixi wama cewilo panenani ceVppakane ceVppataM gamanArhaM.
16 ‘‘vasaMwo asyA sIxAjyam grIRma iXma SSara xXaviH’’ I samaswa sqRti ane yajFAgnini prerepiMce AjyaM vasaMwaM.
17 ippudu vIlYlaki axi weVliswe eVlA?
18 I carcalu emAwraM anukUla PaliwAliccinA hillarI kliMtan uwwama xOwyavewwagA, eVnno xaSAbxAlugA koVrukudu padani samasyaku oVka pariRkArAnni cUpina rAjanIwijFurAligA cariwraputallo migilipovadaM KAyaM!
19 appati wAlUkA kAMgreVs preVsideVMt ayina SrI vAsireVddi xurgA saxASiveSvara prasAx‌gAriwo ‘‘nIvu preVsideVMtuvu kaxA!
20 xIniki vIXinAtika sarEna mArgaMgA Ayana BAviMcAru.
21 maniRi svarUpaM viviXa svarUpAlanu saMwariMcukuMtoMxi.
22 ikkada qRulu kUdA vacci wama sexa wIrcukuMtuMtAru.
23 nI peVlYlYi nI iRtaprakArame jarigiMxi.
24 krismas wAwa koVraku eVxuru cUdaMdi..
25 oVka spUn nUneV lexA neVyyi kalipi pallIlanu veyiswe avi nallagA kAkuMdA uMdadame kAka poVttu sulaBaMgA vaxuluwuMxi.
26 manajAwiki saripadinanna pAlu ivvagala sAmarXyaM mana pAdi paSuvulaku uMxi.
27 ameVrikA byAMkulu vANijya saMsWalu 2008lo xivAlA wIsina kAraNaMgA erpadina ArWika mAMxyaM praBAvaM kramaMgA anni xeSAlakU viswariMciMxi.
28 kAlagawilo kanumarugayye anekAneka paxXawulaku BinnaMgA AnAti goriMta nedu heVnnAgA mAri AXunika yugaMlo awyAXunika PyARan‌gA eVxigiMxi.
29 sAmAjika, ArWikABivqxxi, suparipAlana praXAna aMSaM kAbatti jAwIya prayojanAla xqRtA muMxaduguku ixi maMci mArgaM ani SrIkqRNa kamitI wana 461 pejIla nivexikalo sUciMciMxi.
30 ’’ vinnavAru OnA, nijamenA ani boleVdu AScaryapaddAru.
31 ‘‘vAru ASrama svIkAraM cesi, grAmAnni pariwyajiMci veVlYlipoyAru’’ vAri muKAla lAgAne kaMTaXvani kUdA cinnaboyiMxi.
32 ixxaru mAwrame potI cesinappudu eVvariko oVkariki 301 votlu rAvAli.
33 wakRaNaM xqRti peVttakuMdA vaxileswe naksalEt samasya xeSa AMwaraMgika Baxrawaku peVnumuppugA mAruwuMxanadaMlo eVlAMti saMxehaM lexu.
34 ayiwe Apil E-pyAd‌lo upayogiMce koVnni vidiBAgAlu wEvAn‌lo anukunnameraku uwpawwi jaragalexu.
35 xaSAbxulugA A pani jaragalexu.
36 maMwrivarga nirmANaMlo pUrwi svecCa anuBaviMciMxi aMjayya oVkkare.
37 iMxiramma wamaku wallivaMtixani hasInA prakatiMciMxi.
38 pAMca BOwika SarIraMloMci Awma jyowi mokRapaxAniki prasWAnaM sAgiMcadaM ani xInarWamaMtAru.
39 boVmma gIsinAyanaki purANAlu weVliyavu kAbolu’’ anukunnAru.
40 veVnakabadina prAMwAlalonu ejansI prAMwAlalonu kUdA prEvetu vixya viswariswoMxi.
41 A kaWaki bahumawi vacci xAxApu iravE elYlYayiMxi.
42 rAjakIya BIRmudu nyUDillI, mArci 20: nepAl mAjI praXAni girijAprasAx koVyirAlA mqwiki rARtaprawi prawiBA pAtil, praXAni manmohan siMg pragADa saMwApaM weVlipAru.
43 nirjIvaMgA kanpiMce pattaNa vIXullo I muggure..
44 malYlI viSvaprayawnaM cesi vAraMxarnI XarmEka nirUDamawulanu ceSAnu.
45 AwmarakRaNa ane hakku prawixeSAniki, vyakwiki unnaxi.
46 ceVttumIxi kAyanu samuxraMlo uppunu kalipina brahma, karivepAku lexani kattukunna Alini koVMdakriMxa vuMcina veVMkateSudu garalYAnni miMgina nIlakaMTudu sEwaM pulupulenixe valapu lexani seVlaviccAru.
47 BAraw ippativaraku 114 svarNaM, 152 rajawaM, 228 kAMsyAlawo moVwwaM 494 pawakAlu saMpAxiMciMxi.
48 I edAxi eVsi ammakAlalo 80 SAwaM slpit‌ eVsila ammakAle uMtAyani vivariMcAru.
49 palu blUcip Rerlalo peVxxaeVwwuna jarigina goVnugolYlawo boVMbAyi stAk eVkceMj (bieVs‌i) seVneVsaks maro 133 pAyiMtlu xUsukeVlYliMxi.
50 maniRi jIviwaMlo RApiMg prAmuKyaM eVMwo uMxi.
51 xInivalla lABaMkannA naRtame vuMtuMxani vExyulu ceVbuwunnAru.
52 appudappudu vinipiswUne unnAyi.
53 rAm‌caraN, nAga cEwanya, rAnA agrahIrola vArasulugA saMkrAMwi sakseVs‌ni cavicUdAlsi uMxi.
54 xInini AyuXaMgA upayogiMcukuni wilak sAmAnya prajallo xeSaBakwini wattilepAru.
55 xIniki gasagasAlu, jIdipappulu, bAxaMpappulu xoragA veyiMci kaMgAbaMgA mukkalu cesiMxi xInilo vesi kalapAli.
56 leka vAru praXAna maMwriwo sahakariMcataM lexA?
57 alA eVMxuku racanalu ceyaru?
58 mawAMwara, kulAMwara, xeSAMwara vivAhAlu xIniki kAraNaM kAvaccu.
59 sAXAraNaMgA I kiMxi lakRaNAlu alAMti vArilo kanpiswAyi.
60 kqRNa, maMjula jaMtagA natiMcina ’mAyaxAri malligAdu’ ciwraMlonu jayaMwi veSyapAwra poRiMci amAyakurAlEna malli, kqRNala peVlYli jaripici wana xaggara ASrayamiswuMxi.
61 kAMgreVs‌ku miMcina BAvAlu levu.
62 racayiwa kona veVMkat ‘cAri’ane axBuwamEna pAwranu sqRtiMcAru.
63 tAtA sahAyaMwo veVlugu cUsina prAjeVktuleVnno.
64 peru weVlusukuni palakariMcaMdi?
65 prajA sahakAra samiwi iccina baMx pilupunaku AMXrAvani moVwwaM spaMxiMciMxi.
66 ixemI asAXAraNaMgA manaku kanipiMcakapoyinappatikI, caMxruninuMci BUmini ceradAniki kAMwiki patte samayaM 1.5 seVkaMdlanna viRayAnni gurwiswe xIni prAXAnyawa manaku avagAhanaku vaswuMxi.
67 kAnI ixi nijamEna aMxAlanI, wamaku sarjarI ceyiMcukovAlsina avasaraM lexani koVMxaru vayyArAlu powunnAranI, axi wappani AmeV vApowoMxi.
68 oVmeVgA koVvvu paxArWAlu eVkkuvagA vunna cepalu, bAxaM pappu, akrUt, nuvvulu, poVxxuwirugudu viwwanAlu lAMtivi AhAraMlo wagu mowAxulo uMdelA cUsukovAli.
69 akkada eVxo oVka sanniveSaMlo mIru axqSyarUpaMlo uMdi cUswuMtAru.
70 kAni I kamitIla nivexikalanu xaggaruMcukuni kAlakRepaM ceswe kuxaraxu.
71 mahAmeXAvi, rAy preraNa abbUri rAmakqRNArAvu, ji.
72 haPIj valeVne pilYlayi kUdA mawonmAxAnni reVccagoVduwunnAdanna arXaM vaccelA vyAKyAniMcAdu.
73 manassu yoVkka sWUlAkArame meVxadu.
74 ameVrikA aXyakRudu barAk oVbAmA itIvala mana xeSAniki vaccina saMxarBaMgA I vEparIwyaM mariMwagA prasPutiMciMxi.
75 BArawa xeSaM wama prayojanaM kosame kAkuMdA prapaMcaM prayojanAlanu kApAdeMxuku pAtupadAlani Ayana sUciMcAru.
76 walli sawyavawiwo vyAsudu, kOravula guriMci vIrukrUrulu naSiMcina XarmabaxXamEna nadavadigalavAru (vilupwa XarmAcArulu), ceVdu panulu ceyu svaBAvaM galavAru (asaxqwwulu) akAraNa xveRaMgalavAru (niRkAraNa vErulu) ani ceVppAru.
77 vijayakumAr, anaMwapuraM xoceswunna pAla GarAnAlu kotISvarulavuwunna GarAnA pAla vikrayaxArulu, vAriki aMdaxaMdalugA vunna pAla dErI dEreVktarlu cErman‌lu prajalanu xoceswunnAru.
78 rAjyalakRmi kUdA koVdukulu, kodalYlYanI palakariswU vAlYlYa viSeRAlu, kaburlu vinasAgiMxi.
79 aMwakaMte eVkkuva sEjulo unnA, yUrinarI in‌PeVkRans malYlYI malYlYI vaswunnA xAnni mEnar sarjarI xvArA woVlagiswAmu.
80 rEwulaku saraParA cese moVwwaM eVruvullo nalaBE SAwaM I Ot‌leVts xvArA vikrayiswU, muppE SAwaM mArk‌PeVd xvArA, maro muppE SAwaM dIlarla xvArA saraParA ceswoMxi.
81 maroviXaMgA ceVppAlaMte nirxiRta praNAlYika amalu prakriya sAmAjika paryavekRaNaku lobadi uMdadAniki anuvEna viXAnAlanu rUpoVMxiMcadAniki awyaMwa prAXAnyawa ivvAli.
82 cEnA vEKari xurAkramaNa vAxamenani proVPeVsar devid SAMbOG vaMti ameVrikAku ceVMxina koVMxaru nipuNulu perkoVMtunnAru.
83 xAMwo wanaki asalu viluva lekuMdA powuMxi.
84 akkada kUdA vAtini mareVkkadi nuMco weVcci paMdiMcukunnAru.
85 xAMwo kaWa pUrwayina warvAwa vacce I klEmAks‌pE eVvariki Asakwi uMdaxu.
86 xarSakulu: I saMsWa nirmiMcina 4 sAMGika ciwrAlaku (RAvukAru, peVlYlicesicUdu, missamma, appucesipappukUdu) eVl.
87 xubAyilo xakRiNAPrikAwo jarigina sirIs maXyalo nuMci wAnu veVlYlipovadaMpE vaccina vimarSalapE hExar maMdipaddAru.
88 BayamU, SokamU veVMta uMtAyi.
89 awyaxBuwa Silpi mayudu nuMdi moVxalukuni amaraSilpi jakkanna varaku mana rARtraMlo, xeSaMlo AlayAlu mahonnawaMgA veVligAyani mana cariwra ceVbuwoMxi.
90 anna koNaMlo vicAraNa jarigi uMte bAguMdexi.
91 eV, bi goVttaMguMdA ‘eV’koVnanuMci gAlini lopalaku balaMgA UxAli.
92 pexalaku uxxeSiMcina niXula xurviniyogaM jaruguwoMxi.
93 stAMpulanu sekariMcevAru waracu ‘yupiyu’ ane poVdi akRarAlni A stAMpula mIxa gamaniMci uMtAru.
94 xAMwo rEwula parisWiwi xayanIyaMgA mAruwuMxi.
95 alAge AkASavANi, xUraxarSan keMxrAllo e gred vayoVlin vixvAMsudigA prasixXipoVMxina peVravali naMxakumAr wana vayoVlin vinyAsaMwo nAxawaraMgAlanu prawiXvaniMpajesi Srowalanu raMjiMpajeSAru.
96 pEgA I pAwra A paxinimuRAla samayaMlo svayAnA wana Barwane caMpe prayawnAlu ceswuMdadaM eVbbeVttugA anipiswuMxi.
97 myAjik eVlA ceswAro oVka meVjiRiyan ceVpwAdA?
98 BArawaxeSaMlo awadu leni xevAlayaM lexu.
99 yUrap‌loni lotuwo naduswunna xeSAlu ixivarake wama xigumawulanu waggiMcukovadAniki caryalu cepattAyi.
100 parIkRallo maMci PaliwAlu sAXiMcadaMlo vixyArWi caxavadAniki ketAyiMcina samayaM, caxivina viXAnaM vaMti aneka aMSAlu praBAvaM cUpuwAyi.
यों RB
सिंगल JJ
स्क्रीन NNC
थिएटर NNP
के PSP
दर्शकों NN
को PSP
अग्निपथ NNP
अधिक QF
नहीं NEG
भा VM
सकी VAUX
। SYM
विद्या NNPC
बालन NNP
की PSP
फिल्म NN
द NNPC
डर्टी NNPC
पिक्चर NNP
के PSP
वीकएंड NNC
कलेक्शन NN
से PSP
प्रतिद्वंदी NN
और CC
फिल्म NNC
इंडस्ट्री NN
हैरत NN
में PSP
है VM
। SYM
द NNPC
डर्टी NNPC
पिक्चर NNP
के PSP
प्रचार NN
और CC
जिज्ञासा NN
से PSP
फिल्म NN
की PSP
जबरदस्त JJ
ओपनिंग NN
की PSP
उम्मीद NN
थी VM
, SYM
लेकिन CC
वीकएंड NN
में PSP
30 QCC
करोड़ QC
से PSP
अधिक NN
के PSP
कलेक्शन NN
की PSP
बात NN
ट्रेड NNC
पंडितों NN
ने PSP
भी RP
नहीं NEG
सोची VM
थी VAUX
। SYM
चूंकि CC
द NNPC
डर्टी NNPC
पिक्चर NNP
नायिका JJC
प्रधान JJ
फिल्म NN
है VM
और CC
माना VM
जाता VAUX
है VAUX
कि CC
देश NN
के PSP
दर्शक NN
ऐसी DEM
फिल्में NN
अधिक QF
पसंद NN
नहीं NEG
करते VM
। SYM
विद्या NNPC
बालन NNP
ने PSP
अपने PRP
आकर्षण NN
और CC
package ILMT::KAN::HIN::WX2UTF;
use strict;
use warnings;
use Data::Dumper;
use Dir::Self;
use IO::Socket::INET;
# Registry of helper daemons, keyed by daemon name.  Each entry holds the
# executable to launch ("path"), its command-line arguments ("args", the
# port number is appended right after them) and the TCP port on which the
# daemon is contacted ("port").
my %daemons = (
    wx2utf => {
        path => 'converter-indic',
        args => '--l kan --s wx --m --daemonize --port',
        port => '8188',
    },
);
# Convert the text passed as the named argument "data" by sending it to
# the "wx2utf" daemon.  The text is encoded to UTF-8 bytes before it is
# sent and the daemon's reply is decoded back before it is returned.
sub process {
    my %params  = @_;
    my $payload = $params{data};

    utf8::encode($payload);
    my $reply = call_daemon("wx2utf", $payload);
    utf8::decode($reply);

    return $reply;
}
# Start every named daemon in the background.  Each launch is wrapped in
# flock on a per-daemon file under <module dir>/run so that the command
# only runs when the lock can be taken; a non-zero exit status from the
# wrapper produces a warning instead of an exception.
sub run_daemons {
    for my $name (@_) {
        my %spec     = %{ $daemons{$name} };
        my $launch   = join ' ', @spec{qw(path args port)}, '&';
        my $lockfile = __DIR__ . "/run/${name}_$spec{port}";

        my $status = system("flock -e -w 0.01 $lockfile -c '$launch'");
        next if $status == 0;

        warn "[" . __PACKAGE__ . "]: Port $spec{port} maybe unavailable! $?\n";
    }
}
# Send $input, followed by a newline, to the named daemon over TCP on
# 127.0.0.1 and return everything the daemon writes back until it closes
# the connection.
#
#   $daemon_name - key into %daemons; its "port" entry is the peer port
#   $input       - text to send
#
# Dies when the connection cannot be established.
sub call_daemon {
    my ($daemon_name, $input) = @_;
    my $port = $daemons{$daemon_name}{port};

    # Direct method call instead of the indirect-object form
    # "new IO::Socket::INET (...)", which is parsed heuristically.
    my $socket = IO::Socket::INET->new(
        PeerHost => '127.0.0.1',
        PeerPort => $port,
        Proto    => 'tcp',
    ) or die "ERROR in Socket Creation : $!\n";

    $socket->send("$input\n");

    my $result = "";
    # getline is a method call, so Perl does not add the implicit
    # defined() test it adds for <FH>; without it a last line of "0"
    # with no trailing newline would be dropped.
    while (defined(my $line = $socket->getline)) {
        $result .= $line;
    }
    $socket->close();
    return $result;
}
# Launching the converter daemon is a side effect of loading this module.
run_daemons(("wx2utf"));
# A module file has to end with a true value.
1;
# Ignore everything in this directory
*
# Except this file
!.gitignore
#!/bin/bash
# Install the bundled indic-wx-converter package for Python 2.
# Abort as soon as any command fails.
set -e
# -E keeps the caller's environment under sudo; -U upgrades an existing install.
sudo -E python2 -m pip install -U ./dependencies/indic-wx-converter
......@@ -134,7 +134,7 @@ if( lc($src) eq "wx" and lc($tgt) eq "utf")
foreach $field (@lex_root)
{
# if lcat is punc then don't convert it
if ($cat_root[0] ne "punc" and $lex_root[0] !~ /^\^?\@.*/)
if ($cat_root[0] ne "punc" and $lex_root[0] !~ /^\^\/?\@.*/)
{
$val_out = &wx2utf($field, $lang);
my @lex_arr=();
......
......@@ -689,89 +689,86 @@ sub iscii2unicode_kan {
my $string = $_[0];
my %hash = (
"\x{A2}"=>"\x{0C82}", #Vowel-modifier ANUSWAR
"\x{A3}"=>"\x{0C83}", #Vowel-modifier VISARG
"\x{A4}"=>"\x{0C85}", #Vowel A
"\x{A5}"=>"\x{0C86}", #Vowel AA
"\x{A6}"=>"\x{0C87}", #Vowel I
"\x{A7}"=>"\x{0C88}", #Vowel II
"\x{A8}"=>"\x{0C89}", #Vowel U
"\x{A9}"=>"\x{0C8A}", #Vowel UU
"\x{AA}"=>"\x{0C8B}", #Vowel RI
"\x{AB}"=>"\x{0C0E}", #Vowel E
"\x{AC}"=>"\x{0C0F}", #Vowel EY
"\x{AD}"=>"\x{0C10}", #Vowel AI
"\x{B2}"=>"\x{0C0D}", #Vowel AYE (Devanagari Script)
"\x{AF}"=>"\x{0C12}", #Vowel O
"\x{B0}"=>"\x{0C13}", #Vowel OW
"\x{B1}"=>"\x{0C14}", #Vowel AU
"\x{B2}"=>"\x{0C11}", #Vowel AWE
"\x{B3}"=>"\x{0C15}", #Consonant KA
"\x{B4}"=>"\x{0C16}", #Consonant KHA
"\x{B5}"=>"\x{0C17}", #Consonant GA
"\x{B6}"=>"\x{0C18}", #Consonant GHA
"\x{B7}"=>"\x{0C19}", #Consonant NGA
"\x{B8}"=>"\x{0C1A}", #Consonant CHA
"\x{B9}"=>"\x{0C1B}", #Consonant CHHA
"\x{BA}"=>"\x{0C1C}", #Consonant JA
"\x{BB}"=>"\x{0C1D}", #Consonant JHA
"\x{BC}"=>"\x{0C1E}", #Consonant JNA
"\x{BD}"=>"\x{0C1F}", #Consonant Hard TA
"\x{BE}"=>"\x{0C20}", #Consonant Hard THA
"\x{BF}"=>"\x{0C21}", #Consonant Hard DA
"\x{C0}"=>"\x{0C22}", #Consonant Hard DHA
"\x{C1}"=>"\x{0C23}", #Consonant Hard NA
"\x{C2}"=>"\x{0C24}", #Consonant Soft TA
"\x{C3}"=>"\x{0C25}", #Consonant Soft THA
"\x{C4}"=>"\x{0C26}", #Consonant Soft DA
"\x{C5}"=>"\x{0C27}", #Consonant Soft DHA
"\x{C6}"=>"\x{0C28}", #Consonant Soft NA
"\x{C7}"=>"\x{0C29}", #Consonant NA (Tamil)
"\x{C8}"=>"\x{0C2A}", #Consonant PA
"\x{C9}"=>"\x{0C2B}", #Consonant PHA
"\x{CA}"=>"\x{0C2C}", #Consonant BA
"\x{CB}"=>"\x{0C2D}", #Consonant BHA
"\x{CC}"=>"\x{0C2E}", #Consonant MA
"\x{CD}"=>"\x{0C2F}", #Consonant YA
"\x{CE}"=>"", #Consonant JYA (Bangla, Assamese & Orriya)
"\x{CF}"=>"\x{0C30}", #Consonant RA
"\x{D0}"=>"\x{0C31}", #Consonant Hard RA (Southern Script)
"\x{D1}"=>"\x{0C32}", #Consonant LA
"\x{D2}"=>"\x{0C33}", #Consonant Hard LA
"\x{D3}"=>"\x{0C34}", #Consonant ZHA (Tamil & Malyalam)
"\x{D4}"=>"\x{0C35}", #Consonant VA
"\x{D5}"=>"\x{0C36}", #Consonant SHA
"\x{D6}"=>"\x{0C37}", #Consonant Hard SHA
"\x{D7}"=>"\x{0C38}", #Consonant SA
"\x{D8}"=>"\x{0C39}", #Consonant HA
"\x{D9}"=>"", #Consonant INV
"\x{DA}"=>"\x{0C3E}", #Vowel Sign AA
"\x{DB}"=>"\x{0C3F}", #Vowel Sign I
"\x{DC}"=>"\x{0C40}", #Vowel Sign II
"\x{DD}"=>"\x{0C41}", #Vowel Sign U
"\x{DE}"=>"\x{0C42}", #Vowel Sign UU
"\x{DF}"=>"\x{0C43}", #Vowel Sign RI
"\x{E0}"=>"\x{0C46}", #Vowel Sign E (Southern Scripts)
"\x{E1}"=>"\x{0C47}", #Vowel Sign EY
"\x{E2}"=>"\x{0C48}", #Vowel Sign AI
"\x{E3}"=>"\x{0C45}", #Vowel Sign AYE (Devanagari Script)
"\x{E4}"=>"\x{0C4A}", #Vowel Sign O
"\x{E5}"=>"\x{0C4B}", #Vowel Sign OW
"\x{E6}"=>"\x{0C4C}", #Vowel Sign AU
"\x{E7}"=>"\x{0C49}", #Vowel Sign AWE (Devanagari Script)
"\x{E8}"=>"\x{0C4D}", #Vowel Omission Sign (Halant)
"\x{E9}"=>"\x{0C3C}", #Diacritic Sign (Nukta)
"\x{EA}"=>"\x{0C64}", #Full Stop (Viram, Northern Scripts)
"\x{F1}"=>"\x{0C66}", #Digit 0
"\x{F2}"=>"\x{0C67}", #Digit 1
"\x{F3}"=>"\x{0C68}", #Digit 2
"\x{F4}"=>"\x{0C69}", #Digit 3
"\x{F5}"=>"\x{0C6A}", #Digit 4
"\x{F6}"=>"\x{0C6B}", #Digit 5
"\x{F7}"=>"\x{0C6C}", #Digit 6
"\x{F8}"=>"\x{0C6D}", #Digit 7
"\x{F9}"=>"\x{0C6E}", #Digit 8
"\x{FA}"=>"\x{0C6F}", #Digit 9
"\x{A2}"=>"\x{0C82}",
"\x{A3}"=>"\x{0C83}",
"\x{A4}"=>"\x{0C85}",
"\x{A5}"=>"\x{0C86}",
"\x{A6}"=>"\x{0C87}",
"\x{A7}"=>"\x{0C88}",
"\x{A8}"=>"\x{0C89}",
"\x{A9}"=>"\x{0C8A}",
"\x{AA}"=>"\x{0C8B}",
"\x{AE}"=>"\x{0C8D}",
"\x{AB}"=>"\x{0C8E}",
"\x{AC}"=>"\x{0C8F}",
"\x{AD}"=>"\x{0C90}",
"\x{B2}"=>"\x{0C91}",
"\x{AF}"=>"\x{0C92}",
"\x{B0}"=>"\x{0C93}",
"\x{B1}"=>"\x{0C94}",
"\x{B3}"=>"\x{0C95}",
"\x{B4}"=>"\x{0C96}",
"\x{B5}"=>"\x{0C97}",
"\x{B6}"=>"\x{0C98}",
"\x{B7}"=>"\x{0C99}",
"\x{B8}"=>"\x{0C9A}",
"\x{B9}"=>"\x{0C9B}",
"\x{BA}"=>"\x{0C9C}",
"\x{BB}"=>"\x{0C9D}",
"\x{BC}"=>"\x{0C9E}",
"\x{BD}"=>"\x{0C9F}",
"\x{BE}"=>"\x{0CA0}",
"\x{BF}"=>"\x{0CA1}",
"\x{C0}"=>"\x{0CA2}",
"\x{C1}"=>"\x{0CA3}",
"\x{C2}"=>"\x{0CA4}",
"\x{C3}"=>"\x{0CA5}",
"\x{C4}"=>"\x{0CA6}",
"\x{C5}"=>"\x{0CA7}",
"\x{C6}"=>"\x{0CA8}",
"\x{C7}"=>"\x{0CA9}",
"\x{C8}"=>"\x{0CAA}",
"\x{C9}"=>"\x{0CAB}",
"\x{CA}"=>"\x{0CAC}",
"\x{CB}"=>"\x{0CAD}",
"\x{CC}"=>"\x{0CAE}",
"\x{CD}"=>"\x{0CAF}",
"\x{CF}"=>"\x{0CB0}",
"\x{D0}"=>"\x{0CB1}",
"\x{D1}"=>"\x{0CB2}",
"\x{D2}"=>"\x{0CB3}",
"\x{D3}"=>"\x{0CB4}",
"\x{D4}"=>"\x{0CB5}",
"\x{D5}"=>"\x{0CB6}",
"\x{D6}"=>"\x{0CB7}",
"\x{D7}"=>"\x{0CB8}",
"\x{D8}"=>"\x{0CB9}",
"\x{E9}"=>"\x{0CBC}",
"\x{DA}"=>"\x{0CBE}",
"\x{DB}"=>"\x{0CBF}",
"\x{DC}"=>"\x{0CC0}",
"\x{DD}"=>"\x{0CC1}",
"\x{DE}"=>"\x{0CC2}",
"\x{DF}"=>"\x{0CC3}",
"\x{E0}"=>"\x{0CC6}",
"\x{E1}"=>"\x{0CC7}",
"\x{E2}"=>"\x{0CC8}",
"\x{E7}"=>"\x{0CC9}",
"\x{E4}"=>"\x{0CCA}",
"\x{E5}"=>"\x{0CCB}",
"\x{E6}"=>"\x{0CCC}",
"\x{E8}"=>"\x{0CCD}",
"\x{EA}"=>".", #Full Stop (Viram, Northern Scripts)
"\x{F1}"=>"\x{0CE6}",
"\x{F2}"=>"\x{0CE7}",
"\x{F3}"=>"\x{0CE8}",
"\x{F4}"=>"\x{0CE9}",
"\x{F5}"=>"\x{0CEA}",
"\x{F6}"=>"\x{0CEB}",
"\x{F7}"=>"\x{0CEC}",
"\x{F8}"=>"\x{0CED}",
"\x{F9}"=>"\x{0CEE}",
"\x{FA}"=>"\x{0CEF}",
);
$string=~s/([\x{A1}-\x{FA}])/$hash{$1}/g;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment