In [33]:
# Type-token relation (TTR)
# For every author, record how many distinct tokens (types) have been seen
# after each successive token of that author's corpus.
for author in author_corpus:
    tokens = author_corpus[author]['tokens']
    # type_size[i] = number of distinct tokens among tokens[0..i] (float array)
    type_size = np.zeros(len(tokens))
    types = []       # unique tokens in first-seen order (kept as a notebook-level name)
    seen = set()     # same content as `types`; O(1) membership instead of a list scan
    n_type = 0
    # NOTE(review): assumes tokens are hashable (e.g. str) -- confirm against the tokenizer
    for i, token in enumerate(tokens):
        # Get unique tokens (types)
        if token not in seen:
            seen.add(token)
            types.append(token)
            n_type += 1
        type_size[i] = n_type
    # Add type size info to corresponding author corpora
    author_corpus[author]['type_size'] = type_size
# Draw each author's cumulative type count against token position:
# left panel on linear axes, right panel on log-log axes.
f, axes = plt.subplots(1, 2, figsize=(10, 3))
linear_ax, log_ax = axes

# Linear scale: x is the implicit 0-based index of the token in the corpus
for idx, name in enumerate(author_corpus):
    counts = author_corpus[name]['type_size']
    linear_ax.plot(counts, label=name.title(), color=colors[idx])
linear_ax.legend()
linear_ax.set(
    title='TTR (linear scale)',
    xlabel='Token size',
    ylabel='Type size',
)
linear_ax.grid()

# Log scale: token positions are shifted to 1..N so that log() is defined
for idx, name in enumerate(author_corpus):
    counts = author_corpus[name]['type_size']
    log_positions = np.log(np.arange(1, len(counts) + 1))
    log_ax.plot(log_positions, np.log(counts), label=name.title(), color=colors[idx])
log_ax.legend()
log_ax.set(
    title='TTR (log scale)',
    xlabel='log(token size)',
    ylabel='log(type size)',
)
log_ax.grid()
In [40]:
# Type-token relation for each book
# For every author and every book, record how many distinct tokens (types)
# have been seen after each successive token of that book.
for author in author_corpus:
    tokens_per_book = author_corpus[author]['tokens_per_book']
    # Pre-allocation for type size for each book
    author_corpus[author]['type_size_per_book'] = [None]*len(tokens_per_book)
    for book_idx, tokens in enumerate(tokens_per_book):
        # type_size[i] = number of distinct tokens among tokens[0..i] (float array)
        type_size = np.zeros(len(tokens))
        types = []       # unique tokens in first-seen order (kept as a notebook-level name)
        seen = set()     # same content as `types`; O(1) membership instead of a list scan
        # Running count of types, restarted for every book
        n_type = 0
        # NOTE(review): assumes tokens are hashable (e.g. str) -- confirm against the tokenizer
        for i, token in enumerate(tokens):
            if token not in seen:
                seen.add(token)
                types.append(token)
                n_type += 1
            type_size[i] = n_type
        # Add type size info to corresponding author corpora
        author_corpus[author]['type_size_per_book'][book_idx] = type_size
# Plot type-token size relationship per book (log scale), one panel per author.
# The number of panels follows the number of authors instead of being fixed at
# three; max(..., 1) keeps an empty corpus from requesting zero columns.
n_panels = max(len(author_corpus), 1)
# squeeze=False always returns a 2-D array of Axes; taking row 0 gives a 1-D
# sequence that can be indexed by author even when there is only one panel.
f, axes = plt.subplots(1, n_panels, figsize=(16, 3), squeeze=False)
axes = axes[0]
for i, author in enumerate(author_corpus):
    type_sizes = author_corpus[author]['type_size_per_book']
    # .title() matches the author labels used in the per-author TTR plot
    axes[i].set_title(author.title())
    for k, type_size in enumerate(type_sizes):
        # Token positions shifted to 1..N so that log() is defined
        x1 = np.log(np.arange(len(type_size))+1)
        # NOTE(review): colors[k] assumes `colors` has at least as many entries
        # as the author with the most books -- confirm where `colors` is defined
        axes[i].plot(x1, np.log(type_size), label='Book %d' % (k+1), color=colors[k])
    axes[i].legend()
    axes[i].set_xlabel('log(token size)')
    axes[i].set_ylabel('log(type size)')
    axes[i].grid()