Commit 6400610e authored by Pruthwik

Programs for Annotation

लाखों छात्रों का इंतजार खत्म हो गया है। उत्तर प्रदेश माध्यमिक शिक्षा परिषद की ओर से यूपी बोर्ड दसवीं और बारहवीं कक्षा के परिणाम की घोषणा आज 18 जून, 2022 को की जाएगी। इस परिणाम से जुड़े हर अपडेट को आप तक पहुंचाने के लिए हम लाए हैं यह लाइव ब्लॉग।
<Sentence id='1'>
1 लाखों unk
2 छात्रों unk
3 का unk
4 इंतजार unk
5 खत्म unk
6 हो unk
7 गया unk
8 है unk
9 । unk
</Sentence>
<Sentence id='2'>
1 उत्तर unk
2 प्रदेश unk
3 माध्यमिक unk
4 शिक्षा unk
5 परिषद unk
6 की unk
7 ओर unk
8 से unk
9 यूपी unk
10 बोर्ड unk
11 दसवीं unk
12 और unk
13 बारहवीं unk
14 कक्षा unk
15 के unk
16 परिणाम unk
17 की unk
18 घोषणा unk
19 आज unk
20 18 unk
21 जून unk
22 , unk
23 2022 unk
24 को unk
25 की unk
26 जाएगी unk
27 । unk
</Sentence>
<Sentence id='3'>
1 इस unk
2 परिणाम unk
3 से unk
4 जुड़े unk
5 हर unk
6 अपडेट unk
7 को unk
8 आप unk
9 तक unk
10 पहुंचाने unk
11 के unk
12 लिए unk
13 हम unk
14 लाए unk
15 हैं unk
16 यह unk
17 लाइव unk
18 ब्लॉग unk
19 । unk
</Sentence>
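The three <Sentence> blocks above are the tokenizer's output for the paragraph: one line per token, giving its position, the token itself, and the placeholder tag unk (the fields are tab separated in the actual file). Below is a minimal sketch of reading such a file back into (id, token, tag) rows, assuming it is saved as, say, annotated.ssf; the file name and the read_ssf helper are illustrative and not part of this commit.

import re

def read_ssf(path):
    # Yield (sentence_id, token_rows) pairs from a simple SSF-style file.
    with open(path, 'r', encoding='utf-8') as handle:
        text = handle.read()
    for sent_id, body in re.findall(r"<Sentence id='(\d+)'>\n(.*?)\n</Sentence>", text, re.S):
        rows = [line.split('\t') for line in body.split('\n') if line.strip()]
        yield sent_id, rows  # each row is [index, token, tag]

for sent_id, rows in read_ssf('annotated.ssf'):
    print(sent_id, rows[0])  # e.g. 1 ['1', 'लाखों', 'unk']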
# How to run the code:
# python tokenizer_for_file.py --input InputFileName --output OutputFileName
# Authors: Darshan and Pruthwik
import re
import argparse
token_specification = [
    ('datemonth',  # dates in MM-DD-YYYY form (-, / or . separators)
     r'^(0?[1-9]|1[012])[-\/\.](0?[1-9]|[12][0-9]|3[01])[-\/\.](1|2)\d\d\d$'),
    ('monthdate',  # dates in DD-MM-YYYY form
     r'^(0?[1-9]|[12][0-9]|3[01])[-\/\.](0?[1-9]|1[012])[-\/\.](1|2)\d\d\d$'),
    ('yearmonth',  # dates in YYYY-MM-DD form
     r'^((1|2)\d\d\d)[-\/\.](0?[1-9]|1[012])[-\/\.](0?[1-9]|[12][0-9]|3[01])'),
    ('EMAIL1', r'([\w\.])+@(\w)+\.(com|org|co\.in)$'),  # email addresses
    ('url1', r'(www\.)([-a-z0-9]+\.)*([-a-z0-9]+.*)(\/[-a-z0-9]+)*/i'),  # web URLs
    ('url', r'/((?:https?\:\/\/|www\.)(?:[-a-z0-9]+\.)*[-a-z0-9]+.*)/i'),  # web URLs
    ('BRACKET', r'[\(\)\[\]\{\}]'),  # Brackets
    ('NUMBER', r'^(\d+)([,\.]\d+)*(\S+)*'),  # Integer or decimal number
    ('ASSIGN', r'[~:]'),  # Assignment operator
    ('END', r'[;!_]'),  # Statement terminator
    ('EQUAL', r'='),  # Equals
    ('OP', r'[+*\/\-]'),  # Arithmetic operators
    ('QUOTES', r'[\"\'‘’“”]'),  # quotes
    ('Fullstop', r'(\.+)$'),  # one or more full stops at the end of a token
    ('ellips', r'\.(\.)+'),  # ellipsis (two or more dots)
    ('HYPHEN', r'[-+\|+]'),  # hyphen, plus, pipe
    ('Slashes', r'[\\\/]'),  # forward and backward slashes
    ('COMMA12', r'[,%]'),  # comma and percent
    ('hin_stop', r'।'),  # Devanagari full stop (danda)
    ('quotes_question', r'[”\?]'),  # closing quote and question mark
    ('hashtag', r'#'),  # hashtag
    ('abbr', r'([\U00000900-\U0000097Fa-zA-Z]+\.)+')  # abbreviations of Devanagari or Latin letters
]
tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
get_token = re.compile(tok_regex)
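# Illustrative behaviour of the combined pattern (assumed examples, not part of
# the original script): get_token.match('18.06.2022') matches the whole token
# via the 'monthdate' alternative, get_token.match('2022') matches it via
# 'NUMBER', and get_token.search('ब्लॉग।') finds the Devanagari danda via
# 'hin_stop'.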
def tokenize(list_s):
    """Split each whitespace-separated word into finer tokens using the compiled regex."""
    tkns = []
    for wrds in list_s:
        wrds_len = len(wrds)
        initial_pos = 0
        end_pos = 0
        while initial_pos <= (wrds_len - 1):
            mo = get_token.match(wrds, initial_pos)
            if mo is not None and len(mo.group(0)) == wrds_len:
                # the whole word is one token (e.g. a number or a date)
                tkns.append(wrds)
                initial_pos = wrds_len
            else:
                match_out = get_token.search(wrds, initial_pos)
                if match_out is not None:
                    end_pos = match_out.end()
                    if match_out.lastgroup == "NUMBER":
                        aa = wrds[initial_pos:(end_pos)]
                    elif match_out.lastgroup == "abbr":
                        if end_pos == len(wrds):
                            pass
                        else:
                            end_pos = wrds.rfind('.') + 1
                        aa = wrds[initial_pos: end_pos]
                    else:
                        # text before the matched punctuation becomes a token
                        aa = wrds[initial_pos:(end_pos - 1)]
                    if aa != '':
                        tkns.append(aa)
                    if match_out.lastgroup not in ["NUMBER", "abbr"]:
                        # the matched punctuation itself becomes a separate token
                        tkns.append(match_out.group(0))
                    initial_pos = end_pos
                else:
                    # no further match: keep the rest of the word as a single token
                    tkns.append(wrds[initial_pos:])
                    initial_pos = wrds_len
    return tkns
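# Illustrative behaviour of tokenize() (assumed examples, not part of the
# original script):
#   tokenize(['है।'])   -> ['है', '।']
#   tokenize(['जून,'])  -> ['जून', ',']
#   tokenize(['18'])    -> ['18']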
def read_file_and_tokenize(input_file, output_file):
    """Read raw text, split it into sentences, tokenize them and write SSF-style output."""
    string_sentences = ''
    with open(input_file, 'r', encoding='utf-8') as file_read:
        text = file_read.read().strip()
    end_sentence_punctuations = ['?', '۔', '؟', '।', '!', '|']
    all_punctuations = '!"#$%&\'\(\)*+,\-/:;<=>?@[\\]^_`{|}~“”'
    quotes = '\'"“”`'
    # sentences = re.findall(
    #     ".*?[" + ''.join(end_sentence_punctuations) + "]+['\"']*|.*?\n", text + '\n', re.UNICODE)
    sentences = re.findall(
        ".*?[" + ''.join(end_sentence_punctuations) + "]+[" + quotes + "]?|.*?\n", text + '\n')
    count_sentence = 1
    for index, sentence in enumerate(sentences):
        sentence = sentence.strip()
        if sentence != '':
            # skip chunks that consist of punctuation characters only
            if (re.findall('[' + all_punctuations + ']', sentence) and
                    len([token.strip() for token in re.findall('[' + all_punctuations + ']', sentence) if token.strip()]) == len(sentence)):
                continue
            list_tokens = tokenize(sentence.split())
            string_sentences += '<Sentence id=\'' + \
                str(count_sentence) + '\'>\n'
            # number the tokens from 1 and mark each with the placeholder tag 'unk'
            mapped_tokens = list(map(lambda token_index: str(
                token_index[0] + 1) + '\t' + token_index[1].strip() + '\tunk', list(enumerate(list_tokens))))
            # if the next chunk is punctuation only, append its tokens to the current sentence
            if (index + 1 <= len(sentences) - 1 and sentence[-1] in end_sentence_punctuations and
                    re.findall('[' + all_punctuations + ']', sentences[index + 1]) and
                    len([token.strip() for token in re.findall('[' + all_punctuations + ']', sentences[index + 1]) if token.strip()]) == len(sentence)):
                lastIndex = int(mapped_tokens[-1].split('\t')[0])
                for indexTok, token in enumerate([token.strip() for token in re.split('([' + all_punctuations + '])', sentences[index + 1].strip()) if token.strip()]):
                    mapped_tokens.append(str(lastIndex + 1 + indexTok) + '\t' + token + '\tunk')
            string_sentences += '\n'.join(mapped_tokens) + \
                '\n</Sentence>\n\n'
            count_sentence += 1
    write_data_to_file(output_file, string_sentences)
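# Illustration (assumed): for the sample paragraph shown at the top of this
# commit, the sentence-splitting findall above yields three chunks, one per
# Devanagari danda '।', which correspond to the three <Sentence> blocks in the
# annotated output.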
def write_data_to_file(output_file, data):
    """Write the accumulated SSF output to the given file."""
    with open(output_file, 'w', encoding='utf-8') as file_write:
        file_write.write(data + '\n')
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input', dest='inp', help="enter the input file path")
    parser.add_argument(
        '--output', dest='out', help="enter the output file path")
    args = parser.parse_args()
    read_file_and_tokenize(args.inp, args.out)
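A minimal sketch of driving the tokenizer from Python instead of the command line, assuming the script above is saved as tokenizer_for_file.py in the working directory; the file names used here are only examples.

from tokenizer_for_file import read_file_and_tokenize

# write the sample paragraph from this commit to an example input file
sample = ('लाखों छात्रों का इंतजार खत्म हो गया है। '
          'उत्तर प्रदेश माध्यमिक शिक्षा परिषद की ओर से यूपी बोर्ड दसवीं और बारहवीं '
          'कक्षा के परिणाम की घोषणा आज 18 जून, 2022 को की जाएगी। '
          'इस परिणाम से जुड़े हर अपडेट को आप तक पहुंचाने के लिए हम लाए हैं यह लाइव ब्लॉग।')
with open('sample_input.txt', 'w', encoding='utf-8') as handle:
    handle.write(sample)

# produces numbered, 'unk'-tagged <Sentence> blocks like the ones shown above
read_file_and_tokenize('sample_input.txt', 'sample_output.ssf')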