In [33]:
# Type-token relation (TTR): for every author, record how many distinct
# tokens (types) have been seen after each token position in the corpus.
for author in author_corpus:
    tokens = author_corpus[author]['tokens']
    type_size = np.zeros(len(tokens))

    # A set gives constant-time membership tests; checking against a growing
    # list made this pass quadratic in the vocabulary size.
    # NOTE(review): assumes tokens are hashable (e.g. str) - confirm.
    seen_types = set()

    for i, token in enumerate(tokens):
        seen_types.add(token)
        # Cumulative number of unique tokens up to and including position i
        type_size[i] = len(seen_types)

    # Add type size info to corresponding author corpora
    author_corpus[author]['type_size'] = type_size


# Plot type-token size relationship in linear and log scale.
# Left panel: type count vs. token count; right panel: the same curve with
# the natural log applied to both axes.
f, axes = plt.subplots(1,2, figsize=(10,3))

for i, author in enumerate(author_corpus):
    type_size = author_corpus[author]['type_size']
    # Entry k of type_size is the type count after k+1 tokens, so the token
    # axis is 1-based. Using the same x for both panels keeps them consistent
    # (the linear panel previously fell back to the implicit 0-based index).
    token_size = np.arange(1, len(type_size) + 1)

    axes[0].plot(token_size, type_size, label=author.title(), color=colors[i])
    axes[1].plot(np.log(token_size), np.log(type_size),
                 label=author.title(), color=colors[i])

# Decorate both panels
panel_text = [
    ('TTR (linear scale)', 'Token size', 'Type size'),
    ('TTR (log scale)', 'log(token size)', 'log(type size)'),
]
for ax, (title, xlabel, ylabel) in zip(axes, panel_text):
    ax.legend()
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid()
No description has been provided for this image
In [40]:
# Type-token relation for each book: for every author, build one array per
# book holding the cumulative number of distinct tokens (types) seen after
# each token position in that book.
for author in author_corpus:
    tokens_per_book = author_corpus[author]['tokens_per_book']
    type_size_per_book = []

    for tokens in tokens_per_book:
        type_size = np.zeros(len(tokens))

        # Types are tracked per book, so the set starts empty for each one.
        # A set gives constant-time membership tests; checking against a
        # growing list made this pass quadratic in the vocabulary size.
        # NOTE(review): assumes tokens are hashable (e.g. str) - confirm.
        seen_types = set()

        # Cumulative (not windowed) count of unique tokens
        for i, token in enumerate(tokens):
            seen_types.add(token)
            type_size[i] = len(seen_types)

        type_size_per_book.append(type_size)

    # Add type size info to corresponding author corpora
    author_corpus[author]['type_size_per_book'] = type_size_per_book


# Plot type-token size relationship per book (log scale), one panel per author.
# The panel count follows the number of authors instead of being fixed at 3,
# so adding or removing an author neither overruns nor leaves blank panels.
# max(..., 1) keeps subplots() valid when the corpus is empty.
n_panels = max(len(author_corpus), 1)
# squeeze=False always returns a 2-D array of Axes; flatten it so panels can
# be indexed by author position regardless of how many there are.
f, axes = plt.subplots(1, n_panels, figsize=(16,3), squeeze=False)
axes = axes.ravel()

for i, author in enumerate(author_corpus):
    type_sizes = author_corpus[author]['type_size_per_book']
    ax = axes[i]
    ax.set_title(author.capitalize())

    for k, type_size in enumerate(type_sizes):
        # 1-based token counts: entry j of type_size is the type count
        # after j+1 tokens
        log_token_size = np.log(np.arange(len(type_size))+1)
        ax.plot(log_token_size, np.log(type_size), label='Book %d' % (k+1), color=colors[k])

    ax.legend()
    ax.set_xlabel('log(token size)')
    ax.set_ylabel('log(type size)')
    ax.grid()
No description has been provided for this image