I am trying to take a text file with messages and iterate each word through NLTK wordnet synset function. I want to do this because I want to create a list of mispelled words. For example if I do:
wn.synsets('dog')
I get output:
[Synset('dog.n.01'),
Synset('frump.n.01'),
Synset('dog.n.03'),
Synset('cad.n.01'),
Synset('frank.n.02'),
Synset('pawl.n.01'),
Synset('andiron.n.01'),
Synset('chase.v.01')]
now if the word is mispelled like so:
wn.synsets('doeg')
I get output:
[]
If I am returned an empty list I want to save the misspelled word in another list like so and while continuing to iterate through rest of the file:
mispelled_words = ['doeg']
I am at a loss how to do this, here is my code below, I would need to do the iterating after variable "chat_message_tokenize". The name path is words I want to drop:
import nltk
import csv
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn
from nltk.stem.snowball import SnowballStemmer
def text_function():
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
# Read in chat messages and names files
chat_path = 'filepath.csv'
try:
with open(chat_path) as infile:
chat_messages = infile.read()
except Exception as error:
print(error)
return
name_path = 'filepath.txt'
try:
with open(names_path) as infile:
names = infile.read()
except Exception as error:
print(error)
return
chat_messages = chat_messages.split('Chats:')[1].strip()
names = names.split('Name:')[1].strip().lower()
chat_messages_tokenized = nltk.word_tokenize(chat_messages)
names_tokenized = nltk.word_tokenize(names)
# adding part of speech(pos) tag and dropping proper nouns
pos_drop = pos_tag(chat_messages_tokenized)
chat_messages_tokenized = [SnowballStemmer('english').stem(word.lower()) for word, pos in pos_drop if pos != 'NNP' and word not in names_tokenized]
for chat_messages_tokenized
if not wn.synset(chat_messages_tokenized):
print('empty list')
if __name__ == '__main__':
text_function()
# for s in wn.synsets('dog'):
# lemmas = s.lemmas()
# for l in lemmas:
# if l.name() == stemmer:
# print (l.synset())
csv_path ='OutputFilePath.csv'
try:
with open(csv_path, 'w') as outfile:
writer = csv.writer(outfile)
for word in chat_messages_tokenized:
writer.writerow([word])
except Exception as error:
print(error)
return
if __name__ == '__main__':
text_function()
Thank you in advance.