In [28]:
# Load the book catalogue. Its 'authors' entry is iterated below as
# author -> collection of book names.
text_info = read_yaml('data/info.yaml')
authors = text_info['authors']

# Build one corpus per author. Each corpus is a defaultdict(list) holding:
#   'tokens'               - tokenizer output of every book, concatenated
#   'tokens_wosw'          - the same after remove_stopwords, concatenated
#   'tokens_per_book'      - one tokenizer output per book
#   'tokens_wosw_per_book' - one remove_stopwords output per book
author_corpus = defaultdict()
for author, book_names in authors.items():
    # Register the author up front so an author without books still gets an entry
    author_corpus[author] = defaultdict(list)
    for book_name in book_names:
        # Books are stored as data/corpora/<author>/<book_name>.txt
        file_path = os.path.join('data', 'corpora', author, book_name + '.txt')
        text = read_txt(file_path=file_path)
        tokens = tokenizer(text)
        tokens_wosw = remove_stopwords(tokens)
        # Per-book views keep each book separate ...
        author_corpus[author]['tokens_per_book'].append(tokens)
        author_corpus[author]['tokens_wosw_per_book'].append(tokens_wosw)
        # ... while the flat views pool every book of the author
        author_corpus[author]['tokens'].extend(tokens)
        author_corpus[author]['tokens_wosw'].extend(tokens_wosw)
# Zipf's Law curve (linear scale): word frequency against frequency rank,
# one line per author, computed from each author's pooled 'tokens_wosw' list.
plt.figure(figsize=(5, 3))
for author, corpus in author_corpus.items():
    word_freq = Counter(corpus['tokens_wosw'])
    # most_common() lists (word, count) pairs from most to least frequent,
    # so position in the list is the word's rank.
    zipf_curve = np.array([count for _, count in word_freq.most_common()])
    # Ranks are 1-based
    x = np.arange(1, len(zipf_curve) + 1)
    plt.plot(x, zipf_curve, label=author.title())
plt.title("Zipf's Law curve (linear scale)")
plt.xlabel('Rank')
plt.ylabel('Frequency')
plt.grid()
plt.legend()
plt.show()
In [26]:
# Zipf's Law curve (log scale): one panel per author, one scatter series per
# book, plotting log(frequency) against log(rank) from 'tokens_per_book'.
# Relies on `author_corpus` and `colors` being defined before this cell runs.

# Size the grid from the data instead of a fixed 3 columns, so the cell keeps
# working when the number of authors changes. squeeze=False always returns a
# 2-D array of axes (even for a single author); ravel() flattens it for indexing.
n_authors = len(author_corpus)
f, axes = plt.subplots(1, n_authors, figsize=(14 * n_authors / 3, 3.5), squeeze=False)
axes = axes.ravel()
f.suptitle("Zipf's Law curve (log scale)")

for i, author in enumerate(author_corpus):
    ax = axes[i]
    # title() matches the author labels used in the linear-scale plot
    ax.set_title(author.title())
    tokens_per_book = author_corpus[author]['tokens_per_book']
    for k, tokens in enumerate(tokens_per_book):
        word_freq = Counter(tokens)
        # Counts from most to least frequent; list position is the rank
        zipf_curve = np.array([count for _, count in word_freq.most_common()])
        # Ranks are 1-based, which also keeps log(rank) finite
        x = np.log(np.arange(1, len(zipf_curve) + 1))
        y = np.log(zipf_curve)
        # NOTE(review): assumes `colors` has at least as many entries as the
        # largest number of books per author - confirm where it is defined.
        ax.scatter(x, y, s=1, label='Book %d' % (k + 1), color=colors[k])
    ax.legend()
    ax.set_xlabel('log(rank)')
    ax.set_ylabel('log(frequency)')
    ax.grid()
plt.tight_layout()