#!/usr/bin/env python # -*- coding=utf-8 -*- import re import sys import os.path import argparse class tokenize_ind(): def __init__(self, lang='hin', split_sen=False): self.lang = lang self.split_sen = split_sen file_path = os.path.abspath(__file__).rpartition('/')[0] self.ben = lang in ["ben", "asm"] self.urd = lang in ['urd', 'kas'] self.dev = lang in ["hin", "mar", "nep", "bod", "kok"] self.tam = lang == 'tam' self.tel = lang == 'tel' self.mal = lang == 'mal' self.kan = lang == 'kan' self.guj = lang == 'guj' self.pan = lang == 'pan' self.ori = lang == 'ori' #load nonbreaking prefixes from file self.NBP = dict() with open('%s/data/NONBREAKING_PREFIXES' %file_path) as fp: for line in fp: if line.startswith('#'): continue if '#NUMERIC_ONLY#' in line: self.NBP[line.replace('#NUMERIC_ONLY#', '').split()[0]] = 2 else: self.NBP[line.strip()] = 1 #precompile regexes self.fit() def fit(self): # remove junk characters self.junk = re.compile('[\x00-\x1f]') # seperate out on Latin-1 supplementary characters self.latin = re.compile(u'([\xa1-\xbf\xd7\xf7])') # seperate out on general unicode punctituations except "’" self.upunct = re.compile(u'([\u2012-\u2018\u201a-\u206f])') # seperate out on unicode mathematical operators self.umathop = re.compile(u'([\u2200-\u2211\u2213-\u22ff])') # seperate out on unicode fractions self.ufrac = re.compile(u'([\u2150-\u2160])') # seperate out on unicode superscripts and subscripts self.usupsub = re.compile(u'([\u2070-\u209f])') # seperate out on unicode currency symbols self.ucurrency = re.compile(u'([\u20a0-\u20cf])') # seperate out all "other" ASCII special characters self.specascii = re.compile(r'([\\!@#$%^&*()_+={\[}\]|";:<>?`~/])') #self.specascii = re.compile(u"([^\u0080-\U0010ffffa-zA-Z0-9\s\.',-])") #keep multiple dots together self.multidot = re.compile(r'(\.\.+)([^\.])') if self.urd: #keep multiple dots (urdu-dots) together self.multidot_urd = re.compile(u'(\u06d4\u06d4+)([^\u06d4])') else: #keep multiple purna-viram together self.multiviram = re.compile(u'(\u0964\u0964+)([^\u0964])') #keep multiple purna deergh-viram together self.multidviram = re.compile(u'(\u0965\u0965+)([^\u0965])') #split contractions right (both "'" and "’") self.numcs = re.compile(u"([0-9\u0966-\u096f])(['\u2019])s") self.aca = re.compile(u"([a-zA-Z\u0080-\u024f])(['\u2019])([a-zA-Z\u0080-\u024f])") self.acna = re.compile(u"([a-zA-Z\u0080-\u024f])(['\u2019])([^a-zA-Z\u0080-\u024f])") self.nacna = re.compile(u"([^a-zA-Z\u0080-\u024f])(['\u2019])([^a-zA-Z\u0080-\u024f])") self.naca = re.compile(u"([^a-zA-Z0-9\u0966-\u096f\u0080-\u024f])(['\u2019])([a-zA-Z\u0080-\u024f])") #multiple hyphens self.multihyphen = re.compile('(-+)') #restore multi-dots self.restoredots = re.compile(r'(DOT)(\1*)MULTI') if self.urd: self.restoreudots = re.compile(r'(DOTU)(\1*)MULTI') else: self.restoreviram = re.compile(r'(PNVM)(\1*)MULTI') self.restoredviram = re.compile(r'(DGVM)(\1*)MULTI') #split sentences if self.urd: self.splitsenur1 = re.compile(u' ([.?\u06d4]) ([\u0617-\u061a\u0620-\u065f\u066e-\u06d3\u06d5\u06fa-\u06ff\u201d\u2019A-Z])') self.splitsenur2 = re.compile(u' ([.?\u06d4]) ([\)\}\]\'"\u2018\u201c> ]+) ') else: self.splitsenir1 = re.compile(u' ([|.?\u0964\u0965]) ([\u0900-\u0d7f\u201c\u2018A-Z])') self.splitsenir2 = re.compile(u' ([|.?\u0964\u0965]) ([\)\}\]\'"\u2019\u201d> ]+) ') def normalize(self,text): """ Performs some common normalization, which includes: - Byte order mark, word joiner, etc. removal - ZERO_WIDTH_NON_JOINER and ZERO_WIDTH_JOINER removal - ZERO_WIDTH_SPACE and NO_BREAK_SPACE replaced by spaces """ text=text.replace(u'\uFEFF', '') #BYTE_ORDER_MARK text=text.replace(u'\uFFFE', '') #BYTE_ORDER_MARK_2 text=text.replace(u'\u2060', '') #WORD_JOINER text=text.replace(u'\u00AD', '') #SOFT_HYPHEN text=text.replace(u'\u200B', ' ') #ZERO_WIDTH_SPACE text=text.replace(u'\u00A0', ' ') #NO_BREAK_SPACE text=text.replace(u'\u200D', '') #ZERO_WIDTH_JOINER text=text.replace(u'\u200C', '') #ZERO_WIDTH_NON_JOINER return text def tokenize(self, text): text = text.decode('utf-8', 'ignore') text = self.normalize(text) text = ' %s ' %(text) # remove junk characters text = self.junk.sub('', text) # seperate out on Latin-1 supplementary characters text = self.latin.sub(r' \1 ', text) # seperate out on general unicode punctituations except "’" text = self.upunct.sub(r' \1 ', text) # seperate out on unicode mathematical operators text = self.umathop.sub(r' \1 ', text) # seperate out on unicode fractions text = self.ufrac.sub(r' \1 ', text) # seperate out on unicode superscripts and subscripts text = self.usupsub.sub(r' \1 ', text) # seperate out on unicode currency symbols text = self.ucurrency.sub(r' \1 ', text) # seperate out all "other" ASCII special characters text = self.specascii.sub(r' \1 ', text) #keep multiple dots together text = self.multidot.sub(lambda m: r' %sMULTI %s' %('DOT'*len(m.group(1)), m.group(2)), text) if self.urd: #keep multiple dots (urdu-dots) together text = self.multidot_urd.sub(lambda m: r' %sMULTI %s' %('DOTU'*len(m.group(1)), m.group(2)), text) else: #keep multiple purna-viram together text = self.multiviram.sub(lambda m: r' %sMULTI %s' %('PNVM'*len(m.group(1)), m.group(2)), text) #keep multiple purna deergh-viram together text = self.multidviram.sub(lambda m: r' %sMULTI %s' %('DGVM'*len(m.group(1)), m.group(2)), text) #split contractions right (both "'" and "’") text = self.nacna.sub(r"\1 \2 \3", text) text = self.naca.sub(r"\1 \2 \3", text) text = self.acna.sub(r"\1 \2 \3", text) text = self.aca.sub(r"\1 \2\3", text) text = self.numcs.sub(r"\1 \2s", text) text = text.replace("''", " ' ' ") #handle non-breaking prefixes words = text.split() text_len = len(words) - 1 text = str() for i,word in enumerate(words): if word.endswith('.'): dotless = word[:-1] if dotless.isdigit(): word = dotless + ' .' elif ('.' in dotless and re.search('[a-zA-Z]', dotless)) or \ self.NBP.get(dotless, 0) == 1 or (i