pruthwik mishra / Programs_For_Annotation / Commits

Commit 6400610e, authored Jun 18, 2022 by Pruthwik

    Programs for Annotation

Showing 3 changed files with 187 additions and 0 deletions:

    hindi_sentences_raw.txt          +1    -0
    hindi_sentences_tokenized.txt    +65   -0
    tokenizer_for_file.py            +121  -0
hindi_sentences_raw.txt (new file, mode 100644)
लाखों छात्रों का इंतजार खत्म हो गया है। उत्तर प्रदेश माध्यमिक शिक्षा परिषद की ओर से यूपी बोर्ड दसवीं और बारहवीं कक्षा के परिणाम की घोषणा आज 18 जून, 2022 को की जाएगी। इस परिणाम से जुड़े हर अपडेट को आप तक पहुंचाने के लिए हम लाए हैं यह लाइव ब्लॉग।
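(The paragraph is a Hindi news snippet about the UP Board class 10 and 12 results being announced on June 18, 2022.) This is the input that tokenizer_for_file.py (below) first splits into sentences at the Devanagari danda and similar end marks. A minimal, self-contained sketch of that split, using the same end-of-sentence characters as the script:

import re

# Paragraph from hindi_sentences_raw.txt, split at the script's
# end-of-sentence marks (?, Urdu stops, danda, !, |).
text = ('लाखों छात्रों का इंतजार खत्म हो गया है। उत्तर प्रदेश माध्यमिक शिक्षा '
        'परिषद की ओर से यूपी बोर्ड दसवीं और बारहवीं कक्षा के परिणाम की घोषणा '
        'आज 18 जून, 2022 को की जाएगी। इस परिणाम से जुड़े हर अपडेट को आप तक '
        'पहुंचाने के लिए हम लाए हैं यह लाइव ब्लॉग।')
ends = '?۔؟।!|'
sentences = [s.strip()
             for s in re.findall('.*?[' + ends + ']+|.*?\n', text + '\n')
             if s.strip()]
print(len(sentences))  # 3, matching the three <Sentence> blocks below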
hindi_sentences_tokenized.txt (new file, mode 100644)
<Sentence id='1'>
1 लाखों unk
2 छात्रों unk
3 का unk
4 इंतजार unk
5 खत्म unk
6 हो unk
7 गया unk
8 है unk
9 । unk
</Sentence>
<Sentence id='2'>
1 उत्तर unk
2 प्रदेश unk
3 माध्यमिक unk
4 शिक्षा unk
5 परिषद unk
6 की unk
7 ओर unk
8 से unk
9 यूपी unk
10 बोर्ड unk
11 दसवीं unk
12 और unk
13 बारहवीं unk
14 कक्षा unk
15 के unk
16 परिणाम unk
17 की unk
18 घोषणा unk
19 आज unk
20 18 unk
21 जून unk
22 , unk
23 2022 unk
24 को unk
25 की unk
26 जाएगी unk
27 । unk
</Sentence>
<Sentence id='3'>
1 इस unk
2 परिणाम unk
3 से unk
4 जुड़े unk
5 हर unk
6 अपडेट unk
7 को unk
8 आप unk
9 तक unk
10 पहुंचाने unk
11 के unk
12 लिए unk
13 हम unk
14 लाए unk
15 हैं unk
16 यह unk
17 लाइव unk
18 ब्लॉग unk
19 । unk
</Sentence>
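The tokenized file resembles the Shakti Standard Format (SSF): one <Sentence id='…'> block per sentence, one token per line as tab-separated index, token, and placeholder tag ('unk', to be filled in during annotation). A minimal sketch of a reader for this layout, assuming the tab-separated columns the script writes (the function name read_tokenized is mine, not part of the commit):

import re

def read_tokenized(path):
    # Collect {sentence id: [(index, token, tag), ...]} from the
    # <Sentence> blocks written by tokenizer_for_file.py.
    sentences = {}
    sent_id, tokens = None, []
    with open(path, 'r', encoding='utf-8') as fin:
        for line in fin:
            line = line.strip()
            header = re.match(r"<Sentence id='(\d+)'>", line)
            if header:
                sent_id, tokens = header.group(1), []
            elif line == '</Sentence>':
                sentences[sent_id] = tokens
            elif line:
                index, token, tag = line.split('\t')
                tokens.append((int(index), token, tag))
    return sentences

print(read_tokenized('hindi_sentences_tokenized.txt')['2'][19])
# (20, '18', 'unk')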
tokenizer_for_file.py (new file, mode 100644)
# how to run the code:
# python tokenizer_for_file.py --input InputFileName --output OutputFileName
# Authors: Darshan and Pruthwik
import re
import argparse

# Named token patterns, tried in this order; the first alternative that
# matches wins.
token_specification = [
    ('datemonth',
     r'^(0?[1-9]|1[012])[-\/\.](0?[1-9]|[12][0-9]|3[01])[-\/\.](1|2)\d\d\d$'),
    ('monthdate',
     r'^(0?[1-9]|[12][0-9]|3[01])[-\/\.](0?[1-9]|1[012])[-\/\.](1|2)\d\d\d$'),
    ('yearmonth',
     r'^((1|2)\d\d\d)[-\/\.](0?[1-9]|1[012])[-\/\.](0?[1-9]|[12][0-9]|3[01])'),
    ('EMAIL1', r'([\w\.])+@(\w)+\.(com|org|co\.in)$'),
    # URL patterns (note: the trailing /i is matched literally in Python;
    # it is not a case-insensitivity flag)
    ('url1', r'(www\.)([-a-z0-9]+\.)*([-a-z0-9]+.*)(\/[-a-z0-9]+)*/i'),
    ('url', r'/((?:https?\:\/\/|www\.)(?:[-a-z0-9]+\.)*[-a-z0-9]+.*)/i'),
    ('BRACKET', r'[\(\)\[\]\{\}]'),          # brackets
    ('NUMBER', r'^(\d+)([,\.]\d+)*(\S+)*'),  # integer or decimal number
    ('ASSIGN', r'[~:]'),                     # assignment operator
    ('END', r'[;!_]'),                       # statement terminator
    ('EQUAL', r'='),                         # equals
    ('OP', r'[+*\/\-]'),                     # arithmetic operators
    ('QUOTES', r'[\"\'‘’“”]'),               # quotes
    ('Fullstop', r'(\.+)$'),                 # word-final full stop(s)
    ('ellips', r'\.(\.)+'),                  # ellipsis
    ('HYPHEN', r'[-+\|+]'),                  # hyphen, plus, pipe
    ('Slashes', r'[\\\/]'),                  # slashes
    ('COMMA12', r'[,%]'),                    # comma and percent
    ('hin_stop', r'।'),                      # Devanagari danda
    ('quotes_question', r'[”\?]'),           # closing quote, question mark
    ('hashtag', r'#'),                       # hashtag
    ('abbr', r'([\U00000900-\U0000097Fa-zA-Z]+\.)+'),  # abbreviations
]
tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
get_token = re.compile(tok_regex)


def tokenize(list_s):
    """Split each whitespace-separated word into tokens using get_token."""
    tkns = []
    for wrds in list_s:
        wrds_len = len(wrds)
        initial_pos = 0
        while initial_pos <= wrds_len - 1:
            mo = get_token.match(wrds, initial_pos)
            if mo is not None and len(mo.group(0)) == wrds_len:
                # the whole word is a single token (number, date, URL, ...)
                tkns.append(wrds)
                initial_pos = wrds_len
            else:
                match_out = get_token.search(wrds, initial_pos)
                if match_out is not None:
                    end_pos = match_out.end()
                    if match_out.lastgroup == 'NUMBER':
                        # keep the number and anything glued to it together
                        aa = wrds[initial_pos:end_pos]
                    elif match_out.lastgroup == 'abbr':
                        if end_pos != len(wrds):
                            # cut the abbreviation after its last period
                            end_pos = wrds.rfind('.') + 1
                        aa = wrds[initial_pos:end_pos]
                    else:
                        # text before the matched punctuation (the original
                        # sliced to end_pos - 1, which is equivalent only
                        # for single-character matches)
                        aa = wrds[initial_pos:match_out.start()]
                    if aa != '':
                        tkns.append(aa)
                    if match_out.lastgroup not in ['NUMBER', 'abbr']:
                        # emit the matched punctuation as its own token
                        tkns.append(match_out.group(0))
                    initial_pos = end_pos
                else:
                    # nothing matches: keep the rest of the word as one token
                    tkns.append(wrds[initial_pos:])
                    initial_pos = wrds_len
    return tkns


def read_file_and_tokenize(input_file, output_file):
    """Split the input text into sentences, tokenize them, and format the
    result as <Sentence> blocks of indexed, tab-separated tokens."""
    string_sentences = ''
    with open(input_file, 'r', encoding='utf-8') as file_read:
        text = file_read.read().strip()
    # ?, Urdu full stop, Urdu question mark, Devanagari danda, !, |
    end_sentence_punctuations = ['?', '۔', '؟', '।', '!', '|']
    all_punctuations = r'!"#$%&\'\(\)*+,\-/:;<=>?@[\]^_`{|}~“”'
    quotes = '\'"“”`'
    # sentences = re.findall(
    #     ".*?[" + ''.join(end_sentence_punctuations) + "]+['\"']*|.*?\n",
    #     text + '\n', re.UNICODE)
    sentences = re.findall(
        '.*?[' + ''.join(end_sentence_punctuations) + ']+[' + quotes +
        ']?|.*?\n', text + '\n')
    count_sentence = 1
    for index, sentence in enumerate(sentences):
        sentence = sentence.strip()
        if sentence != '':
            punct_chars = [token.strip() for token in
                           re.findall('[' + all_punctuations + ']', sentence)
                           if token.strip()]
            if punct_chars and len(punct_chars) == len(sentence):
                # the "sentence" is nothing but punctuation: skip it
                continue
            list_tokens = tokenize(sentence.split())
            string_sentences += "<Sentence id='" + str(count_sentence) + "'>\n"
            mapped_tokens = [str(token_index + 1) + '\t' + token.strip() +
                             '\tunk'
                             for token_index, token in enumerate(list_tokens)]
            # if the next chunk is made up entirely of punctuation, attach
            # its characters to the current sentence (the original compared
            # its punctuation count against len(sentence) here, a typo)
            if (index + 1 <= len(sentences) - 1
                    and sentence[-1] in end_sentence_punctuations):
                next_chunk = sentences[index + 1].strip()
                next_punct = [token.strip() for token in
                              re.findall('[' + all_punctuations + ']',
                                         next_chunk)
                              if token.strip()]
                if next_punct and len(next_punct) == len(next_chunk):
                    lastIndex = int(mapped_tokens[-1].split('\t')[0])
                    for indexTok, token in enumerate(
                            [token.strip() for token in
                             re.split('([' + all_punctuations + '])',
                                      next_chunk)
                             if token.strip()]):
                        mapped_tokens.append(str(lastIndex + 1 + indexTok) +
                                             '\t' + token + '\tunk')
            string_sentences += '\n'.join(mapped_tokens) + '\n</Sentence>\n\n'
            count_sentence += 1
    write_data_to_file(output_file, string_sentences)


def write_data_to_file(output_file, data):
    """Write the accumulated output; the with block closes the file."""
    with open(output_file, 'w', encoding='utf-8') as file_write:
        file_write.write(data + '\n')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='inp',
                        help='enter the input file path')
    parser.add_argument('--output', dest='out',
                        help='enter the output file path')
    args = parser.parse_args()
    read_file_and_tokenize(args.inp, args.out)
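As documented in the script's header comment, it is run with --input and --output file paths. For a quick sanity check, tokenize can also be imported and exercised directly on words from the raw file; a small sketch, assuming the script is importable from the working directory:

from tokenizer_for_file import tokenize

# Sentence 1 of hindi_sentences_raw.txt: the danda is split off the final
# word, as in <Sentence id='1'> of hindi_sentences_tokenized.txt.
words = 'लाखों छात्रों का इंतजार खत्म हो गया है।'.split()
print(tokenize(words))
# ['लाखों', 'छात्रों', 'का', 'इंतजार', 'खत्म', 'हो', 'गया', 'है', '।']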