#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Whitespace tokenizer that prints one token per line in a tab-separated layout.

Reads the UTF-8 text file named by the first command-line argument, splits
every non-blank line on whitespace and writes the numbered tokens to
standard output as UTF-8.

Created by
@author: priyank
"""
import codecs
import os
import sys


def tokenizer(text, ind):
    """Tokenize the text only on whitespace and format one token per line.

    Args:
        text: The sentence to split. Leading/trailing whitespace is ignored
            and runs of whitespace count as a single separator.
        ind: Index of the sentence. It is accepted for interface
            compatibility but is not used in the generated output.

    Returns:
        A string holding one line per token, each of the form
        ``<1-based position> TAB <token> TAB unk``. The block is preceded by
        an empty line and terminated by a newline; an empty ``text`` yields
        a single newline.
    """
    token_rows = [
        "%d\t%s\tunk" % (position, token)
        for position, token in enumerate(text.split(), 1)
    ]
    # The empty first and last entries make the joined block start with a
    # blank line and end with a newline, exactly as the original layout did.
    return "\n".join([""] + token_rows + [""])


def ssf_from_lines(lines):
    """Return the concatenated token blocks for every non-blank line.

    Args:
        lines: Iterable of text lines (line terminators are stripped here).

    Returns:
        The blocks produced by ``tokenizer`` for each non-blank line, each
        followed by a newline, joined in input order. Blank lines are
        skipped and do not consume a sentence index.
    """
    stripped = (line.strip() for line in lines)
    sentences = [sentence for sentence in stripped if sentence]
    # A single join avoids the quadratic cost of growing a string with "+".
    return "".join(
        tokenizer(sentence, index) + "\n"
        for index, sentence in enumerate(sentences)
    )


def main(argv=None):
    """Tokenize the file named in ``argv[1]`` and write the result to stdout.

    Args:
        argv: Argument vector; defaults to ``sys.argv``.

    Raises:
        SystemExit: If no input file was given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        prog = os.path.basename(argv[0]) if argv else "tokenizer"
        sys.exit("usage: %s INPUT_FILE" % prog)

    # The context manager closes the file even if reading or decoding fails.
    with codecs.open(argv[1], "rb", "utf-8") as handle:
        lines = handle.readlines()

    # The original used print(), which appends one newline; keep it.
    payload = (ssf_from_lines(lines) + "\n").encode("utf-8")

    # Write the encoded bytes directly. print(bytes) would show the b'...'
    # repr on Python 3; sys.stdout.buffer is the binary layer there, and on
    # interpreters without it sys.stdout itself accepts the byte string.
    stream = getattr(sys.stdout, "buffer", sys.stdout)
    stream.write(payload)
    stream.flush()


if __name__ == "__main__":
    main()