tokenize.py 690 Bytes
Newer Older
priyank's avatar
priyank committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
import codecs
import os
import sys
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created by

@author: priyank
'''

def tokenizer(text, ind):
    """Split *text* on whitespace and wrap the tokens in an SSF sentence.

    Each token becomes a line ``<position>\\t<token>\\tunk`` (positions
    start at 1), and the token lines are enclosed in
    ``<Sentence id="ind + 1">`` / ``</Sentence>`` tags.

    Returns the sentence as a single newline-joined string.
    """
    sentence_lines = ['<Sentence id="' + str(ind + 1) + '">']
    position = 1
    for token in text.split():
        sentence_lines.append(str(position) + '\t' + token + '\tunk')
        position += 1
    sentence_lines.append('</Sentence>')
    return '\n'.join(sentence_lines)

# Script entry: read the UTF-8 file named on the command line and print
# one SSF <Sentence> block per non-empty line.
#
# `with` guarantees the handle is closed even if decoding/reading fails
# (the original open/close pair leaked the handle on error).
with codecs.open(sys.argv[1], "rb", "utf-8") as f:
    lines = f.readlines()

# Accumulate chunks and join once at the end: repeated `s = s + ...`
# inside the loop is quadratic in the number of sentences.
pieces = []
ii = 0  # 0-based sentence index; tokenizer emits ind + 1 as the id
for line in lines:
    line = line.strip()
    if line:  # blank lines are skipped and consume no sentence id
        pieces.append(tokenizer(line, ii) + "\n")
        ii = ii + 1
finalOutput = "".join(pieces)
# Print the text itself: `print(finalOutput.encode('utf-8'))` was a
# Python 2 idiom -- under Python 3 it prints the bytes repr (b'...').
# Python 3's print encodes via sys.stdout automatically.
print(finalOutput)