Source code for enlp.visualisation.freq_distribution

"""
Contains functions for visualisation of text
"""

import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud


def wordcloud_plot(token_list, colormap='rainbow', ax=None):
    """
    Draw a word cloud from a list of tokens

    Parameters
    ----------
    token_list : :obj:`list`
        tokens to display, e.g. the output of nlp_tools.to_list(); they are
        joined with single spaces before being passed to the word cloud
        generator
    colormap : :obj:`str`
        name of a matplotlib colormap, see the predefined colormaps at
        https://matplotlib.org/examples/color/colormaps_reference.html
        [optional, default = 'rainbow']
    ax : :obj:`matplotlib.axes.Axes`
        axes to draw on; when omitted a new figure of size (10, 15) is created
        [optional, default = None]

    Returns
    -------
    fig : :obj:`matplotlib.figure.Figure`
        the newly created figure, or None when ``ax`` was supplied
    ax : :obj:`matplotlib.axes.Axes`
        axes containing the word cloud image
    """
    # Only own a figure when the caller did not hand us an axes.
    fig = None
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(10, 15))

    text = ' '.join(token_list)
    generator = WordCloud(
        width=1500,
        height=1200,
        margin=0,
        colormap=colormap,
        collocations=False,
    )
    cloud = generator.generate(text)

    ax.imshow(cloud, interpolation='bilinear')
    ax.axis("off")

    return fig, ax
def dist_plot_detailed(word_counts, log=False, ax=None):
    """
    Line plot of token frequencies for a SMALL corpus (<50 unique tokens),
    with every word written out as a tick label on the x-axis

    Parameters
    ----------
    word_counts : :obj:`list`
        sorted list of (word, frequency) entries, e.g. the output of
        nlp_distributions.count()
    log : :obj:`bool`
        draw the y axis on a logarithmic scale
        [optional, default = False]
    ax : :obj:`matplotlib.axes.Axes`
        axes to draw on; when omitted a new figure of size (10, 15) is created
        [optional, default = None]

    Returns
    -------
    fig : :obj:`matplotlib.figure.Figure`
        the newly created figure, or None when ``ax`` was supplied
    ax : :obj:`matplotlib.axes.Axes`
        axes containing the frequency line plot

    Notes
    -----
    One tick label is drawn per word, so with more than 50 unique tokens the
    plot becomes too busy and may be very slow to render
    """
    fig = None
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(10, 15))

    labels = [str(entry[0]) for entry in word_counts]
    frequencies = np.asarray([entry[1] for entry in word_counts])
    positions = range(len(labels))

    # Pick the drawing routine and its captions together so they cannot drift.
    if log:
        draw = ax.semilogy
        title, y_caption = 'Log. Word Frequencies', 'Log. word count'
    else:
        draw = ax.plot
        title, y_caption = 'Word Frequencies', 'Word count'

    draw(positions, frequencies)
    ax.set_title(title)
    ax.set_ylabel(y_caption)

    ax.grid(True)
    # Ticks must be fixed before labels so each word lines up with its point.
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=90)
    ax.set_xlabel('Words')

    return fig, ax
def _shade_index_span(ax, mask, color, label):
    """Shade the x-range covered by the True entries of ``mask``.

    Returns True when a span was drawn and False when ``mask`` has no True
    entry (in which case nothing is drawn).
    """
    hits = np.flatnonzero(mask)
    if hits.size == 0:
        return False
    # flatnonzero is ascending, so first/last are the min/max matching indices.
    # Plain ints are passed because axvspan expects scalar bounds.
    ax.axvspan(int(hits[0]), int(hits[-1]), alpha=0.5, color=color, label=label)
    return True


def dist_plot(word_counts, log=False, shade_singles=True, shade_top25=True, ax=None):
    """
    Line plot of token frequencies against the index of each token

    Parameters
    ----------
    word_counts : :obj:`list`
        sorted list of (word, frequency) entries, e.g. the output of
        nlp_distributions.count()
    log : :obj:`bool`
        draw the y axis on a logarithmic scale
        [optional, default = False]
    shade_singles : :obj:`bool`
        shade the index range between the first and last token whose count
        is exactly 1; skipped when no token has count 1
        [optional, default = True]
    shade_top25 : :obj:`bool`
        shade the index range of tokens whose cumulative count stays within
        a quarter of all occurrences; skipped when the first token alone
        already exceeds that quarter
        [optional, default = True]
    ax : :obj:`matplotlib.axes.Axes`
        axes to draw on; when omitted a new figure of size (10, 15) is created
        [optional, default = None]

    Returns
    -------
    fig : :obj:`matplotlib.figure.Figure`
        the newly created figure, or None when ``ax`` was supplied
    ax : :obj:`matplotlib.axes.Axes`
        axes containing the frequency line plot
    """
    fig = None
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(10, 15))

    counts = np.asarray([w[1] for w in word_counts])

    ax.set_xlabel('Index of word')
    ax.grid(True)

    any_shaded = False

    if shade_singles:
        drawn = _shade_index_span(ax, counts == 1., 'lightblue', 'single-occurance')
        any_shaded = any_shaded or drawn

    if shade_top25:
        quarter_of_total = counts.sum() / 4.
        running_total = np.cumsum(counts)
        drawn = _shade_index_span(ax, running_total <= quarter_of_total,
                                  'red', '25% of occurances')
        any_shaded = any_shaded or drawn

    # Only build a legend when there is a labelled span to describe; calling
    # legend() with no labelled artists just produces a warning.
    if any_shaded:
        ax.legend()

    if log:
        ax.semilogy(counts, color='k')
        ax.set_title('Log. Word Frequencies')
        ax.set_ylabel('Log. word count')
    else:
        ax.plot(counts, color='k')
        ax.set_title('Word Frequencies')
        ax.set_ylabel('Word count')

    return fig, ax