In [ ]:
import re

def tokenizer(text):
    """Split lines of text into lowercase, purely alphabetic tokens."""
    tokens = []
    for line in text:
        words = line.split(' ')
        for word in words:
            # Strip every non-alphabetic character from the word
            word = re.sub("[^A-Za-z]", "", word)
            # Keep only words longer than one character, lowercased
            if len(word) > 1:
                tokens.append(word.lower())
    return tokens

def remove_stopwords(input_words):
    """Filter out tokens that appear in the stop-word list."""
    # read_txt (defined elsewhere in this notebook) is assumed to return the file's lines
    stopword_list = tokenizer(read_txt('data/stop_words.txt'))
    output_words = [word for word in input_words if word not in stopword_list]
    return output_words
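A minimal usage sketch of the two helpers above, assuming `read_txt` returns the file contents as a list of lines and that a corpus file exists at the hypothetical path `data/corpus.txt`:

In [ ]:
# Hypothetical example: tokenize a corpus file and drop stop words.
# 'data/corpus.txt' is an assumed path; read_txt is assumed to yield lines of text.
raw_lines = read_txt('data/corpus.txt')
tokens = tokenizer(raw_lines)
clean_tokens = remove_stopwords(tokens)
print(clean_tokens[:20])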