# How to run the code:
# python tokenizer_for_file.py --input InputFileName --output OutputFileName
# Authors: Darshan and Pruthwik
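# The script reads a UTF-8 text file, splits it into sentences on
# sentence-ending punctuation (? ۔ ؟ । ! |), tokenizes every sentence with the
# regex-based tokenizer below and writes the result in an SSF-style format:
# each sentence is wrapped in <Sentence id='N'> ... </Sentence> and every
# token is printed as "index<TAB>token<TAB>unk".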
import re
import argparse


token_specification = [
    # month-day-year dates such as 12/31/1999
    ('monthdate',
     r'^(0?[1-9]|1[012])[-\/\.](0?[1-9]|[12][0-9]|3[01])[-\/\.](1|2)\d\d\d$'),
    # day-month-year dates such as 31/12/1999
    ('datemonth',
     r'^(0?[1-9]|[12][0-9]|3[01])[-\/\.](0?[1-9]|1[012])[-\/\.](1|2)\d\d\d$'),
    # year-month-day dates such as 1999/12/31
    ('yearmonth',
     r'^((1|2)\d\d\d)[-\/\.](0?[1-9]|1[012])[-\/\.](0?[1-9]|[12][0-9]|3[01])'),
    ('EMAIL1', r'([\w\.])+@(\w)+\.(com|org|co\.in)$'),
    ('url1', r'(?i:(www\.)([-a-z0-9]+\.)*([-a-z0-9]+.*)(\/[-a-z0-9]+)*)'),   # www URLs, case-insensitive
    ('url', r'(?i:((?:https?\:\/\/|www\.)(?:[-a-z0-9]+\.)*[-a-z0-9]+.*))'),  # http(s):// or www URLs, case-insensitive
    ('BRACKET', r'[\(\)\[\]\{\}]'),       # Brackets
    ('NUMBER', r'^(\d+)([,\.]\d+)*(\S+)*'),  # Integer or decimal number
    ('ASSIGN', r'[~:]'),          # tilde or colon
    ('END', r'[;!_]'),            # semicolon, exclamation mark, underscore
    ('EQUAL', r'='),   # Equals
    ('OP', r'[+*\/\-]'),    # Arithmetic operators
    ('QUOTES', r'[\"\'‘’“”]'),          # quotes
    ('Fullstop', r'(\.+)$'),
    ('ellips', r'\.(\.)+'),
    ('HYPHEN', r'[-+|]'),         # hyphen, plus, pipe
    ('Slashes', r'[\\\/]'),
    ('COMMA12', r'[,%]'),
    ('hin_stop', r'।'),
    ('quotes_question', r'[”\?]'),
    ('hashtag', r'#'),
    ('abbr', r'([\U00000900-\U0000097Fa-zA-Z]+\.)+')
]
tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
get_token = re.compile(tok_regex)
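# The named patterns are combined into one alternation; at every position the
# alternatives are tried left to right, so the earlier, more specific patterns
# (dates, e-mails, URLs) take precedence over the later single-character ones.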


def tokenize(list_s):
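    """Tokenize a list of whitespace-separated words into tokens.

    Each word is scanned with the combined token regex: a word that fully
    matches a single pattern is kept as one token, while embedded punctuation
    and symbols are split into separate tokens.
    """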
    tkns = []
    for wrds in list_s:
        wrds_len = len(wrds)
        initial_pos = 0
        while initial_pos <= (wrds_len - 1):
            # a match that spans the whole word is kept as a single token
            mo = get_token.match(wrds, initial_pos)
            if mo is not None and len(mo.group(0)) == wrds_len:
                tkns.append(wrds)
                initial_pos = wrds_len
            else:
                # otherwise look for the next symbol embedded in the word
                match_out = get_token.search(wrds, initial_pos)
                if match_out is not None:
                    end_pos = match_out.end()
                    if match_out.lastgroup == "NUMBER":
                        # keep a number together with anything attached to it
                        segment = wrds[initial_pos:end_pos]
                    elif match_out.lastgroup == "abbr":
                        # extend an abbreviation up to the last full stop in the word
                        if end_pos != len(wrds):
                            end_pos = wrds.rfind('.') + 1
                        segment = wrds[initial_pos:end_pos]
                    else:
                        # the text before the matched symbol becomes its own token
                        segment = wrds[initial_pos:match_out.start()]
                    if segment != '':
                        tkns.append(segment)
                    if match_out.lastgroup not in ["NUMBER", "abbr"]:
                        tkns.append(match_out.group(0))
                    initial_pos = end_pos
                else:
                    # no symbol found; the rest of the word is one token
                    tkns.append(wrds[initial_pos:])
                    initial_pos = wrds_len
    return tkns


def read_file_and_tokenize(input_file, output_file):
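    """Read a UTF-8 text file, split it into sentences, tokenize each sentence
    and write the result to output_file in an SSF-style <Sentence> format.
    """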
    string_sentences = ''
    with open(input_file, 'r', encoding='utf-8') as file_read:
        text = file_read.read().strip()
    end_sentence_punctuations = ['?', '۔', '؟', '।', '!', '|']
    # punctuation set, escaped so it can be embedded safely in a character class
    all_punctuations = re.escape('!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~“”')
    quotes = '\'"“”`'
    # a sentence runs up to sentence-ending punctuation (optionally followed by
    # a closing quote) or up to a newline
    sentences = re.findall(
        ".*?[" + ''.join(end_sentence_punctuations) + "]+[" + quotes + "]?|.*?\n", text + '\n')
    count_sentence = 1
    for index, sentence in enumerate(sentences):
        sentence = sentence.strip()
        if sentence == '':
            continue
        # skip segments that consist entirely of punctuation; they are
        # appended to the preceding sentence below
        punct_tokens = [token for token in re.findall('[' + all_punctuations + ']', sentence) if token.strip()]
        if punct_tokens and len(punct_tokens) == len(sentence):
            continue
        list_tokens = tokenize(sentence.split())
        string_sentences += '<Sentence id=\'' + str(count_sentence) + '\'>\n'
        mapped_tokens = [
            str(token_index + 1) + '\t' + token.strip() + '\tunk'
            for token_index, token in enumerate(list_tokens)
        ]
        # if the next segment consists only of punctuation (e.g. closing quotes),
        # append its characters to the current sentence
        if index + 1 <= len(sentences) - 1 and sentence[-1] in end_sentence_punctuations:
            next_sentence = sentences[index + 1].strip()
            next_punct = [token for token in re.findall('[' + all_punctuations + ']', next_sentence) if token.strip()]
            if next_punct and len(next_punct) == len(next_sentence):
                last_index = int(mapped_tokens[-1].split('\t')[0])
                trailing = [token.strip() for token in re.split('([' + all_punctuations + '])', next_sentence) if token.strip()]
                for index_tok, token in enumerate(trailing):
                    mapped_tokens.append(str(last_index + 1 + index_tok) + '\t' + token + '\tunk')
        string_sentences += '\n'.join(mapped_tokens) + '\n</Sentence>\n\n'
        count_sentence += 1
    write_data_to_file(output_file, string_sentences)


def write_data_to_file(output_file, data):
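    """Write the data string to output_file using UTF-8 encoding."""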
    with open(output_file, 'w', encoding='utf-8') as file_write:
        file_write.write(data + '\n')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input', dest='inp', required=True, help="enter the input file path")
    parser.add_argument(
        '--output', dest='out', required=True, help="enter the output file path")
    args = parser.parse_args()
    read_file_and_tokenize(args.inp, args.out)
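
# Illustrative example (assuming an input file containing the single line
# "राम ने कहा।"), the output would look like the following, where <TAB>
# marks a tab character:
#
# <Sentence id='1'>
# 1<TAB>राम<TAB>unk
# 2<TAB>ने<TAB>unk
# 3<TAB>कहा<TAB>unk
# 4<TAB>।<TAB>unk
# </Sentence>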