Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
P
Programs_For_Annotation
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Metrics
Environments
Packages & Registries
Packages & Registries
Package Registry
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
pruthwik mishra
Programs_For_Annotation
Commits
6400610e
Commit
6400610e
authored
Jun 18, 2022
by
Pruthwik
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Programs for Annotation
parents
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
187 additions
and
0 deletions
+187
-0
hindi_sentences_raw.txt
hindi_sentences_raw.txt
+1
-0
hindi_sentences_tokenized.txt
hindi_sentences_tokenized.txt
+65
-0
tokenizer_for_file.py
tokenizer_for_file.py
+121
-0
No files found.
hindi_sentences_raw.txt
0 → 100644
View file @
6400610e
लाखों छात्रों का इंतजार खत्म हो गया है। उत्तर प्रदेश माध्यमिक शिक्षा परिषद की ओर से यूपी बोर्ड दसवीं और बारहवीं कक्षा के परिणाम की घोषणा आज 18 जून, 2022 को की जाएगी। इस परिणाम से जुड़े हर अपडेट को आप तक पहुंचाने के लिए हम लाए हैं यह लाइव ब्लॉग।
hindi_sentences_tokenized.txt
0 → 100644
View file @
6400610e
<Sentence id='1'>
1 लाखों unk
2 छात्रों unk
3 का unk
4 इंतजार unk
5 खत्म unk
6 हो unk
7 गया unk
8 है unk
9 । unk
</Sentence>
<Sentence id='2'>
1 उत्तर unk
2 प्रदेश unk
3 माध्यमिक unk
4 शिक्षा unk
5 परिषद unk
6 की unk
7 ओर unk
8 से unk
9 यूपी unk
10 बोर्ड unk
11 दसवीं unk
12 और unk
13 बारहवीं unk
14 कक्षा unk
15 के unk
16 परिणाम unk
17 की unk
18 घोषणा unk
19 आज unk
20 18 unk
21 जून unk
22 , unk
23 2022 unk
24 को unk
25 की unk
26 जाएगी unk
27 । unk
</Sentence>
<Sentence id='3'>
1 इस unk
2 परिणाम unk
3 से unk
4 जुड़े unk
5 हर unk
6 अपडेट unk
7 को unk
8 आप unk
9 तक unk
10 पहुंचाने unk
11 के unk
12 लिए unk
13 हम unk
14 लाए unk
15 हैं unk
16 यह unk
17 लाइव unk
18 ब्लॉग unk
19 । unk
</Sentence>
tokenizer_for_file.py
0 → 100644
View file @
6400610e
# How to run:
#   python tokenizer_for_file.py --input InputFileName --output OutputFileName
# Authors: Darshan and Pruthwik
import
re
import
argparse
# Ordered (name, pattern) pairs.  Order matters: the patterns are joined into
# a single alternation below, and at any position the regex engine takes the
# first alternative that matches.
token_specification = [
    # Whole-word dates.  '^' and '$' anchor to the real start/end of the word.
    # 'datemonth' is month, day, year; 'monthdate' is day, month, year;
    # 'yearmonth' is year, month, day.  The year is four digits starting 1 or 2.
    ('datemonth',
     r'^(0?[1-9]|1[012])[-\/\.](0?[1-9]|[12][0-9]|3[01])[-\/\.](1|2)\d\d\d$'),
    ('monthdate',
     r'^(0?[1-9]|[12][0-9]|3[01])[-\/\.](0?[1-9]|1[012])[-\/\.](1|2)\d\d\d$'),
    # The trailing '$' keeps this consistent with the two patterns above;
    # without it '2022-06-18' matched only '2022-06-1' and was split apart.
    ('yearmonth',
     r'^((1|2)\d\d\d)[-\/\.](0?[1-9]|1[012])[-\/\.](0?[1-9]|[12][0-9]|3[01])$'),
    # E-mail address ending the word, limited to .com / .org / .co.in.
    ('EMAIL1', r'([\w\.])+@(\w)+\.(com|org|co\.in)$'),
    # URLs.  These used to carry JavaScript-style '/.../i' delimiters, which
    # Python treats as literal characters, so they never matched a real URL.
    # Case-insensitivity is now scoped with (?i:...) so it applies to these
    # alternatives only and not to the rest of the combined regex.
    ('url1', r'(?i:(www\.)([-a-z0-9]+\.)*([-a-z0-9]+.*)(\/[-a-z0-9]+)*)'),
    ('url', r'(?i:((?:https?\:\/\/|www\.)(?:[-a-z0-9]+\.)*[-a-z0-9]+.*))'),
    # Any single round, square or curly bracket.
    ('BRACKET', r'[\(\)\[\]\{\}]'),
    # A word that starts with digits: optional ','/'.'-separated digit groups
    # followed by any trailing non-space characters.
    ('NUMBER', r'^(\d+)([,\.]\d+)*(\S+)*'),
    ('ASSIGN', r'[~:]'),             # '~' or ':'
    ('END', r'[;!_]'),               # ';', '!' or '_'
    ('EQUAL', r'='),                 # '='
    ('OP', r'[+*\/\-]'),             # '+', '*', '/' or '-'
    ('QUOTES', r'[\"\'‘’“”]'),       # straight and curly quotes
    ('Fullstop', r'(\.+)$'),         # one or more dots ending the word
    ('ellips', r'\.(\.)+'),          # two or more consecutive dots
    ('HYPHEN', r'[-+\|+]'),          # '-', '+' or '|'
    ('Slashes', r'[\\\/]'),          # backslash or slash
    ('COMMA12', r'[,%]'),            # ',' or '%'
    ('hin_stop', r'।'),              # Devanagari danda
    ('quotes_question', r'[”\?]'),   # closing curly quote or '?'
    ('hashtag', r'#'),
    # One or more runs of Devanagari-block (U+0900-U+097F) or ASCII letters,
    # each run followed by a dot.
    ('abbr', r'([\U00000900-\U0000097Fa-zA-Z]+\.)+')
]
# One named group per specification entry, tried in list order.
tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
get_token = re.compile(tok_regex)


def tokenize(list_s):
    """Split each word of *list_s* into tokens using ``get_token``.

    Args:
        list_s: iterable of strings; the caller in this file passes the
            result of ``str.split()``, i.e. words without whitespace.

    Returns:
        A flat list of token strings in input order.  A word that one pattern
        matches in full is kept intact; otherwise the word is scanned left to
        right and every pattern match becomes its own token, with the
        unmatched text before it emitted as a separate token.
    """
    tkns = []
    for wrds in list_s:
        wrds_len = len(wrds)
        initial_pos = 0
        while initial_pos < wrds_len:
            mo = get_token.match(wrds, initial_pos)
            if mo is not None and len(mo.group(0)) == wrds_len:
                # A single pattern covers the entire word.
                tkns.append(wrds)
                break
            match_out = get_token.search(wrds, initial_pos)
            if match_out is None:
                # Nothing recognisable is left: the remainder is one token.
                tkns.append(wrds[initial_pos:])
                break
            end_pos = match_out.end()
            kind = match_out.lastgroup
            if kind == "NUMBER":
                aa = wrds[initial_pos:end_pos]
            elif kind == "abbr":
                if end_pos != wrds_len:
                    # Extend the abbreviation up to the last dot in the word.
                    end_pos = wrds.rfind('.') + 1
                aa = wrds[initial_pos:end_pos]
            else:
                # Text before the match.  Slicing to ``end_pos - 1`` was only
                # right for one-character matches; for longer matches it
                # emitted part of the match twice.
                aa = wrds[initial_pos:match_out.start()]
            if aa != '':
                tkns.append(aa)
            if kind not in ("NUMBER", "abbr"):
                tkns.append(match_out.group(0))
            initial_pos = end_pos
    return tkns
def read_file_and_tokenize(input_file, output_file):
    """Sentence-split and tokenize *input_file*, writing the result to *output_file*.

    The input is read as UTF-8 and cut into sentences at runs of the
    characters in ``end_sentence_punctuations`` (optionally followed by one
    quote character) or at line ends.  Every sentence is written as::

        <Sentence id='N'>
        1<TAB>token<TAB>unk
        ...
        </Sentence>

    followed by a blank line.  A chunk made up only of punctuation is not
    written as a sentence of its own; when it directly follows a sentence
    that ends in a sentence terminator, its characters are appended to that
    sentence as extra tokens.

    Args:
        input_file: path of the UTF-8 text file to read.
        output_file: path handed to ``write_data_to_file``.
    """
    # The context manager closes the input even if reading fails.
    with open(input_file, 'r', encoding='utf-8') as file_read:
        text = file_read.read().strip()
    end_sentence_punctuations = ['?', '۔', '؟', '।', '!', '|']
    # Body of a regex character class.  Backslashes are doubled so the string
    # value is spelled out explicitly rather than relying on invalid escape
    # sequences such as '\(' being passed through.
    all_punctuations = '!"#$%&\'\\(\\)*+,\\-/:;<=>?@[\\]^_`{|}~“”'
    quotes = '\'"“”`'
    punctuation_char = re.compile('[' + all_punctuations + ']')
    punctuation_splitter = re.compile('([' + all_punctuations + '])')

    def is_only_punctuation(chunk):
        # True when chunk is non-empty and every character is punctuation.
        found = punctuation_char.findall(chunk)
        return bool(found) and len(found) == len(chunk)

    sentences = re.findall(
        ".*?[" + ''.join(end_sentence_punctuations) + "]+[" + quotes + "]?|.*?\n",
        text + '\n')
    output_parts = []  # joined once at the end instead of repeated '+='
    count_sentence = 1
    for index, sentence in enumerate(sentences):
        sentence = sentence.strip()
        if sentence == '' or is_only_punctuation(sentence):
            continue
        list_tokens = tokenize(sentence.split())
        mapped_tokens = [
            str(position + 1) + '\t' + token.strip() + '\tunk'
            for position, token in enumerate(list_tokens)
        ]
        if index + 1 < len(sentences) and sentence[-1] in end_sentence_punctuations:
            following = sentences[index + 1].strip()
            # The punctuation count must be compared with the length of the
            # FOLLOWING chunk; comparing with the current sentence's length
            # meant trailing punctuation was almost never attached and was
            # then dropped by the skip above.
            if is_only_punctuation(following):
                lastIndex = len(mapped_tokens)
                trailing = [piece.strip()
                            for piece in punctuation_splitter.split(following)
                            if piece.strip()]
                for indexTok, token in enumerate(trailing):
                    mapped_tokens.append(
                        str(lastIndex + 1 + indexTok) + '\t' + token + '\tunk')
        output_parts.append(
            '<Sentence id=\'' + str(count_sentence) + '\'>\n'
            + '\n'.join(mapped_tokens) + '\n</Sentence>\n\n')
        count_sentence += 1
    write_data_to_file(output_file, ''.join(output_parts))
def write_data_to_file(output_file, data):
    """Write *data* plus a trailing newline to *output_file* as UTF-8.

    The file is created or truncated.  The ``with`` block closes it, so no
    explicit ``close()`` call is needed.

    Args:
        output_file: path of the file to write.
        data: text to write.
    """
    with open(output_file, 'w', encoding='utf-8') as file_write:
        file_write.write(data + '\n')
if __name__ == '__main__':
    # Both paths are mandatory.  Marking them required makes argparse print a
    # usage error when one is missing, instead of the script failing later
    # inside open() with a None path.
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='inp', required=True,
                        help="enter the input file path")
    parser.add_argument('--output', dest='out', required=True,
                        help="enter the output file path")
    args = parser.parse_args()
    read_file_and_tokenize(args.inp, args.out)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment