# How to run the code:
# python3 tokenizer_for_indian_languages_on_files.py --input InputFolder --output OutputFolder --lang hi
# The --lang parameter is a two-letter ISO 639-1 language code, mapped
# internally to one of three sentence-end conventions:
# for ['hi', 'or', 'mn', 'as', 'bn', 'pa'], the purna viram '।' is the sentence-end marker (lang = 0)
# for Urdu ('ur'), '۔' is the sentence-end marker (lang = 1)
# for ['en', 'gu', 'mr', 'ml', 'kn', 'te', 'ta'], '.' is the sentence-end marker (lang = 2)
# Works at folder and file level.
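# Output format (as assembled by the write-out logic below): one token per
# line as "<index>\t<token>\tunk", CoNLL-style, with blank lines separating
# sentences.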
import re
import argparse
import os
# patterns for tokenization
token_specification = [
    # MM-DD-YYYY dates with '-', '/' or '.' separators
    ('datemonth',
     r'^(0?[1-9]|1[012])[-\/\.](0?[1-9]|[12][0-9]|3[01])[-\/\.](1|2)\d\d\d$'),
    # DD-MM-YYYY dates
    ('monthdate',
     r'^(0?[1-9]|[12][0-9]|3[01])[-\/\.](0?[1-9]|1[012])[-\/\.](1|2)\d\d\d$'),
    # YYYY-MM-DD dates
    ('yearmonth',
     r'^((1|2)\d\d\d)[-\/\.](0?[1-9]|1[012])[-\/\.](0?[1-9]|[12][0-9]|3[01])'),
    # e-mail addresses (limited TLD list)
    ('EMAIL1', r'([\w\.])+@(\w)+\.(com|org|co\.in)$'),
    # URLs; the original patterns carried JavaScript-style /.../i delimiters,
    # which Python treats as literal characters, so they could never match.
    # Rewritten with explicit A-Z ranges in place of the 'i' flag.
    ('url1', r'(www\.)([-a-zA-Z0-9]+\.)*[-a-zA-Z0-9]+(\/[-a-zA-Z0-9]+)*'),
    ('url', r'((?:https?:\/\/|www\.)(?:[-a-zA-Z0-9]+\.)*[-a-zA-Z0-9]+(?:\/[-a-zA-Z0-9]+)*)'),
    ('BRACKET', r'[\(\)\[\]\{\}]'),         # brackets
    ('NUMBER', r'^(\d+)([,\.]\d+)*(\w)*'),  # integer or decimal number
    ('ASSIGN', r'[~:]'),                    # assignment-like operators
    ('END', r'[;!_]'),                      # statement terminators
    ('EQUAL', r'='),                        # equals sign
    ('OP', r'[+*\/\-]'),                    # arithmetic operators
    ('QUOTES', r'[\"\'‘’]'),                # quotes
    ('Fullstop', r'(\.+)$'),                # word-final full stop(s)
    ('ellips', r'\.(\.)+'),                 # ellipsis
    ('HYPHEN', r'[-+\|+]'),                 # hyphens and pipes
    ('Slashes', r'[\\\/]'),                 # slashes
    ('COMMA12', r'[,%]'),                   # comma and percent
    ('hin_stop', r'।'),                     # Devanagari purna viram
    ('quotes_question', r'[”\?]'),          # closing quote and question mark
    ('hashtag', r'#')
]
# compile regular expressions
tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
get_token = re.compile(tok_regex)
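# Each alternative is wrapped in a named group, so a match reports which
# rule fired via .lastgroup. A quick sanity check on a hypothetical string
# (not part of the pipeline):
#   >>> get_token.search('abc,def').lastgroup
#   'COMMA12'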
def tokenize(list_s):
    """Split each whitespace-separated word into finer tokens."""
    tkns = []
    for wrds in list_s:
        wrds_len = len(wrds)
        initial_pos = 0
        while initial_pos <= (wrds_len - 1):
            mo = get_token.match(wrds, initial_pos)
            if mo is not None and len(mo.group(0)) == wrds_len:
                # the whole word is one token (date, e-mail, URL, number, ...)
                tkns.append(wrds)
                initial_pos = wrds_len
            else:
                match_out = get_token.search(wrds, initial_pos)
                if match_out is not None:
                    end_pos = match_out.end()
                    if match_out.lastgroup == "NUMBER":
                        # keep a number glued to whatever the pattern matched
                        aa = wrds[initial_pos:end_pos]
                    else:
                        # text before the matched symbol; the original sliced
                        # to end_pos - 1, which is only correct for
                        # single-character matches such as ',' or '।'
                        aa = wrds[initial_pos:match_out.start()]
                    if aa != '':
                        tkns.append(aa)
                    if match_out.lastgroup != "NUMBER":
                        tkns.append(match_out.group(0))
                    initial_pos = end_pos
                else:
                    # no pattern fires: keep the rest of the word intact
                    tkns.append(wrds[initial_pos:])
                    initial_pos = wrds_len
    return tkns
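# A quick illustration of the loop above on a hypothetical word:
#   tokenize(['Hello,world!']) -> ['Hello', ',', 'world', '!']
# ',' fires COMMA12 and '!' fires END, so the surrounding text is emitted
# as separate tokens.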
def read_file_and_tokenize(input_file, output_file, lang_type):
    """Read a file, tokenize it and mark sentence boundaries."""
    string_sentences = ''
    with open(input_file, 'r', encoding='utf-8') as file_read:
        # strip a leading BOM if present; the original removed the literal
        # string '0xff', which never occurs in decoded text
        text = file_read.read().strip().replace('\ufeff', '')
    if lang_type == 0:
        sentences = re.findall('.*?।|.*?\n', text + '\n', re.UNICODE)
        end_markers = ['?', '।', '!', '|']
    elif lang_type == 1:
        sentences = re.findall('.*?\n', text + '\n', re.UNICODE)
        end_markers = ['؟', '!', '|', '۔']
    else:
        sentences = re.findall('.*?\n', text + '\n', re.UNICODE)
        end_markers = ['?', '.', '!', '|']
    for sentence in sentences:
        if sentence.strip() != '':
            list_tokens = tokenize(sentence.split())
            # positions one past each sentence-end token; the original
            # hard-coded the union of every language's markers here instead
            # of using the language-specific list chosen above
            end_sentence_markers = [i + 1 for i, token in enumerate(list_tokens)
                                    if token in end_markers]
            if len(end_sentence_markers) > 0:
                if end_sentence_markers[-1] != len(list_tokens):
                    end_sentence_markers += [len(list_tokens)]
                boundaries = [0] + end_sentence_markers
                for start, end in zip(boundaries, boundaries[1:]):
                    individual_sentence = list_tokens[start:end]
                    string_sentences += '\n'
                    mapped_tokens = ['%d\t%s\tunk' % (i + 1, token.strip())
                                     for i, token in enumerate(individual_sentence)]
                    string_sentences += '\n'.join(mapped_tokens) + '\n\n\n'
            else:
                string_sentences += '\n'
                mapped_tokens = ['%d\t%s\tunk' % (i + 1, token.strip())
                                 for i, token in enumerate(list_tokens)]
                string_sentences += '\n'.join(mapped_tokens) + '\n\n\n'
    write_data_to_file(output_file, string_sentences)
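# Illustrative trace (hypothetical input file containing the single line
# "राम घर गया।", lang_type 0): findall yields one '।'-terminated chunk,
# tokenize splits the final word into 'गया' and '।', and the sentence is
# written out as
#   1	राम	unk
#   2	घर	unk
#   3	गया	unk
#   4	।	unk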
def write_data_to_file(output_file, data):
    """Write data to file."""
    with open(output_file, 'w', encoding='utf-8') as file_write:
        file_write.write(data + '\n')
def main():
    """Parse arguments and dispatch to the tokenizer."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input', dest='inp', help="enter the input file or folder path")
    parser.add_argument(
        '--output', dest='out', help="enter the output file or folder path")
    parser.add_argument(
        '--lang', dest='lang', help="enter the language: two-letter ISO 639-1 code")
    args = parser.parse_args()
    if os.path.isdir(args.inp) and not os.path.isdir(args.out):
        os.makedirs(args.out)
    if args.lang in ['hi', 'or', 'mn', 'as', 'bn', 'pa']:
        lang = 0
    elif args.lang == 'ur':
        lang = 1
    elif args.lang in ['en', 'gu', 'mr', 'ml', 'kn', 'te', 'ta']:
        lang = 2
    else:
        # unrecognized codes fall back to the purna viram convention
        lang = 0
    if os.path.isdir(args.inp):
        for root, _dirs, files in os.walk(args.inp):
            for fl in files:
                input_path = os.path.join(root, fl)
                output_path = os.path.join(args.out, fl)
                read_file_and_tokenize(input_path, output_path, lang)
    else:
        read_file_and_tokenize(args.inp, args.out, lang)
if __name__ == '__main__':
    main()