# How to run the code:
#     python3 tokenizer_for_indian_languages_on_files.py --input InputFolder --output OutputFolder --lang hi
# The --lang parameter is a two-letter ISO 639-1 language code; internally it is
# mapped to a sentence-end-marker mode:
#     ['hi', 'or', 'mn', 'as', 'bn', 'pa'] -> lang = 0, purna viram ('।') as sentence end marker
#     'ur'                                 -> lang = 1, '۔' as sentence end marker
#     ['en', 'gu', 'mr', 'ml', 'kn', 'te', 'ta'] -> lang = 2, '.' as sentence end marker
# Works at folder and file level. An illustrative usage example is given at the end of this file.
import re
import argparse
import os

# patterns for tokenization
token_specification = [
    ('datemonth', r'^(0?[1-9]|1[012])[-\/\.](0?[1-9]|[12][0-9]|3[01])[-\/\.](1|2)\d\d\d$'),
    ('monthdate', r'^(0?[1-9]|[12][0-9]|3[01])[-\/\.](0?[1-9]|1[012])[-\/\.](1|2)\d\d\d$'),
    ('yearmonth', r'^((1|2)\d\d\d)[-\/\.](0?[1-9]|1[012])[-\/\.](0?[1-9]|[12][0-9]|3[01])'),
    ('EMAIL1', r'([\w\.])+@(\w)+\.(com|org|co\.in)$'),
    ('url1', r'(www\.)([-a-z0-9]+\.)*([-a-z0-9]+.*)(\/[-a-z0-9]+)*/i'),
    ('url', r'/((?:https?\:\/\/|www\.)(?:[-a-z0-9]+\.)*[-a-z0-9]+.*)/i'),
    ('BRACKET', r'[\(\)\[\]\{\}]'),          # brackets
    ('NUMBER', r'^(\d+)([,\.]\d+)*(\w)*'),   # integer or decimal number
    ('ASSIGN', r'[~:]'),                     # assignment operator
    ('END', r'[;!_]'),                       # statement terminator
    ('EQUAL', r'='),                         # equals
    ('OP', r'[+*\/\-]'),                     # arithmetic operators
    ('QUOTES', r'[\"\'‘’]'),                 # quotes
    ('Fullstop', r'(\.+)$'),
    ('ellips', r'\.(\.)+'),
    ('HYPHEN', r'[-+\|+]'),
    ('Slashes', r'[\\\/]'),
    ('COMMA12', r'[,%]'),
    ('hin_stop', r'।'),
    ('quotes_question', r'[”\?]'),
    ('hashtag', r'#')
]

# compile the patterns into one alternation of named groups
tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
get_token = re.compile(tok_regex)


def tokenize(list_s):
    """Split a list of whitespace-separated words into finer-grained tokens."""
    tkns = []
    for wrds in list_s:
        wrds_len = len(wrds)
        initial_pos = 0
        end_pos = 0
        while initial_pos <= (wrds_len - 1):
            mo = get_token.match(wrds, initial_pos)
            if mo is not None and len(mo.group(0)) == wrds_len:
                # the pattern match spans the whole word (date, number, ...): keep it as one token
                tkns.append(wrds)
                initial_pos = wrds_len
            else:
                match_out = get_token.search(wrds, initial_pos)
                if match_out is not None:
                    # split the word at the next punctuation/pattern match
                    end_pos = match_out.end()
                    if match_out.lastgroup == "NUMBER":
                        aa = wrds[initial_pos:end_pos]
                    else:
                        aa = wrds[initial_pos:(end_pos - 1)]
                    if aa != '':
                        tkns.append(aa)
                    if match_out.lastgroup != "NUMBER":
                        tkns.append(match_out.group(0))
                    initial_pos = end_pos
                else:
                    # no pattern found: the rest of the word is a single token
                    tkns.append(wrds[initial_pos:])
                    initial_pos = wrds_len
    return tkns


def read_file_and_tokenize(input_file, output_file, lang_type):
    """Read a file, tokenize it and write the tokenized sentences out."""
    string_sentences = ''
    with open(input_file, 'r', encoding='utf-8') as file_read:
        text = file_read.read().strip().replace('0xff', '')
    if lang_type == 0:
        sentences = re.findall('.*?।|.*?\n', text + '\n', re.UNICODE)
        endMarkers = ['?', '।', '!', '|']
    elif lang_type == 1:
        sentences = re.findall('.*?\n', text + '\n', re.UNICODE)
        endMarkers = ['؟', '!', '|', '۔']
    else:
        sentences = re.findall('.*?\n', text + '\n', re.UNICODE)
        endMarkers = ['?', '.', '!', '|']
    count_sentence = 1
    for index, sentence in enumerate(sentences):
        if sentence.strip() != '':
            list_tokens = tokenize(sentence.split())
            end_sentence_markers = [index + 1 for index, token in enumerate(list_tokens)
                                    if token in ['?', '.', '۔', '؟', '।', '!', '|']]
            if len(end_sentence_markers) > 0:
                if end_sentence_markers[-1] != len(list_tokens):
                    end_sentence_markers += [len(list_tokens)]
                end_sentence_markers_with_sentence_end_positions = [0] + end_sentence_markers
                sentence_boundaries = list(zip(end_sentence_markers_with_sentence_end_positions,
                                               end_sentence_markers_with_sentence_end_positions[1:]))
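                # each (start, end) pair delimits one sentence within this input line;
                # e.g. tokens ['क', '।', 'ख', '।'] give boundaries [(0, 2), (2, 4)],
                # so every slice carries its own end marker as the final token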
                for start, end in sentence_boundaries:
                    individual_sentence = list_tokens[start:end]
                    string_sentences += '\n'
                    mapped_tokens = list(map(lambda token_index: str(token_index[0] + 1) + '\t' +
                                             token_index[1].strip() + '\tunk',
                                             list(enumerate(individual_sentence))))
                    string_sentences += '\n'.join(mapped_tokens) + '\n\n\n'
                    count_sentence += 1
            else:
                # no end marker in this line: emit all of its tokens as one sentence
                string_sentences += '\n'
                mapped_tokens = list(map(lambda token_index: str(token_index[0] + 1) + '\t' +
                                         token_index[1].strip() + '\tunk',
                                         list(enumerate(list_tokens))))
                string_sentences += '\n'.join(mapped_tokens) + '\n\n\n'
                count_sentence += 1
    write_data_to_file(output_file, string_sentences)


def write_data_to_file(output_file, data):
    """Write data to file."""
    with open(output_file, 'w', encoding='utf-8') as file_write:
        file_write.write(data + '\n')


def main():
    """Parse arguments and call the tokenizer."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='inp', help="enter the input file or folder path")
    parser.add_argument('--output', dest='out', help="enter the output file or folder path")
    parser.add_argument('--lang', dest='lang', help="enter the language: two-letter ISO 639-1 code")
    args = parser.parse_args()
    if os.path.isdir(args.inp) and not os.path.isdir(args.out):
        os.makedirs(args.out)
    if args.lang in ['hi', 'or', 'mn', 'as', 'bn', 'pa']:
        lang = 0
    elif args.lang == 'ur':
        lang = 1
    elif args.lang in ['en', 'gu', 'mr', 'ml', 'kn', 'te', 'ta']:
        lang = 2
    else:
        lang = 0
    if os.path.isdir(args.inp):
        for root, dirs, files in os.walk(args.inp):
            for fl in files:
                input_path = os.path.join(root, fl)
                output_path = os.path.join(args.out, fl)
                read_file_and_tokenize(input_path, output_path, lang)
    else:
        read_file_and_tokenize(args.inp, args.out, lang)


if __name__ == '__main__':
    main()
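# ---------------------------------------------------------------------------
# Illustrative usage (a sketch, not part of the original script; the file and
# folder names below are hypothetical examples):
#
#     # tokenize a single Hindi file
#     python3 tokenizer_for_indian_languages_on_files.py --input hindi.txt --output hindi.tok --lang hi
#
#     # tokenize every file inside a folder (the output folder is created if missing)
#     python3 tokenizer_for_indian_languages_on_files.py --input InputFolder --output OutputFolder --lang ur
#
# Calling tokenize() directly on a whitespace-split Hindi sentence:
#
#     >>> tokenize('राम ने कहा।'.split())
#     ['राम', 'ने', 'कहा', '।']
#
# The output file holds one token per line in the form "<index>\t<token>\tunk",
# with consecutive sentences separated by blank lines, e.g.:
#
#     1\tराम\tunk
#     2\tने\tunk
#     3\tकहा\tunk
#     4\t।\tunk
# ---------------------------------------------------------------------------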