In [28]:
# Read book info; the 'authors' entry is iterated below as
# author -> collection of book names.
text_info = read_yaml('data/info.yaml')
authors = text_info['authors']

# Per-author corpus. A plain dict is used instead of a factory-less
# `defaultdict()`, which never creates defaults and behaves exactly like a dict.
author_corpus = {}

# Create author corpus
for author in authors:
    # Create corpora for each author; every key starts out as an empty list
    author_corpus[author] = defaultdict(list)

    # Get each book from an author
    for book_name in authors[author]:
        # Read text from book: data/corpora/<author>/<book_name>.txt
        file_path = os.path.join('data', 'corpora', author, book_name + '.txt')
        text = read_txt(file_path=file_path)
        tokens = tokenizer(text)
        # Result of remove_stopwords(); presumably "tokens without stop words"
        tokens_wosw = remove_stopwords(tokens)

        # Accumulate author corpora:
        #   'tokens' / 'tokens_wosw'        -> all books concatenated into one flat list
        #   '*_per_book'                    -> one entry (the book's token list) per book
        author_corpus[author]['tokens'].extend(tokens)
        author_corpus[author]['tokens_wosw'].extend(tokens_wosw)
        author_corpus[author]['tokens_per_book'].append(tokens)
        author_corpus[author]['tokens_wosw_per_book'].append(tokens_wosw)


# Zipf's Law curve (linear scale): word frequency against frequency rank,
# one line per author, computed from each author's 'tokens_wosw' list.
plt.figure(figsize=(5, 3))
for author in author_corpus:
    word_freq = Counter(author_corpus[author]['tokens_wosw'])
    # most_common() yields (word, count) pairs sorted by descending count
    word_freq_ranked = dict(word_freq.most_common())
    zipf_curve = np.array(list(word_freq_ranked.values()))
    x = np.arange(len(zipf_curve)) + 1  # ranks start at 1

    plt.plot(x, zipf_curve, label=author.title())

# Decorate the axes once, after every curve has been drawn. A bare
# `plt.grid()` toggles grid visibility, so calling it once per author inside
# the loop switched the grid off again for an even number of authors.
plt.legend()
plt.xlabel('Rank')
plt.ylabel('Frequency')
plt.grid(True)

plt.title("Zipf's Law curve (linear scale)")
plt.show()
No description has been provided for this image
In [26]:
# Zipf's Law curve (log scale): one panel per author, one scatter series per
# book, plotting log(frequency) against log(rank).
# The panel count follows the number of authors instead of a hard-coded 3
# (at least one panel is always created so plt.subplots() gets a valid grid).
n_panels = max(len(author_corpus), 1)
# squeeze=False always returns a 2-D grid; row 0 gives a 1-D sequence of axes
# even when there is a single panel.
f, axes_grid = plt.subplots(1, n_panels, squeeze=False, figsize=(14, 3.5))
axes = axes_grid[0]
f.suptitle("Zipf's Law curve (log scale)")

for i, author in enumerate(author_corpus):
    ax = axes[i]
    tokens_per_book = author_corpus[author]['tokens_per_book']
    # title() matches the author label used in the linear-scale plot's legend
    ax.set_title(author.title())

    for k, tokens in enumerate(tokens_per_book):
        word_freq = Counter(tokens)
        # most_common() yields (word, count) pairs sorted by descending count
        word_freq_ranked = dict(word_freq.most_common())
        zipf_curve = np.array(list(word_freq_ranked.values()))
        x = np.log(np.arange(len(zipf_curve)) + 1)  # ranks start at 1, so log is finite
        y = np.log(zipf_curve)

        # `colors` is defined elsewhere in the notebook; it is indexed by book
        # position, so it presumably holds at least as many entries as the
        # largest number of books per author -- TODO confirm.
        ax.scatter(x, y, s=1, label='Book %d' % (k+1), color=colors[k])

    # Decorate each panel once, after all of its books are drawn. A bare
    # `ax.grid()` toggles visibility, so calling it once per book hid the grid
    # again whenever an author had an even number of books.
    ax.legend()
    ax.set_xlabel('log(rank)')
    ax.set_ylabel('log(frequency)')
    ax.grid(True)

plt.tight_layout()
No description has been provided for this image