Source code for enlp.understanding.distributions

"""
Contains functions for visualisation of text
"""

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd


def freq_dist(tokens):
    """ Count frequency of tokens

    Parameters
    ----------
    tokens : :obj:`list`
        list of tokens to be analysed, note these may include punctuation

    Returns
    -------
    count : :obj:`list`
        sorted list of words and their respective frequency, i.e. list of
        tuples (word, count)

    Notes
    -----
    If words are originally in string format use stdtools.tokenise() to
    convert to input format

    Examples
    --------
    >>> words = ['aa', 'sd', 're', 'aa', 'er', 'hg', 'sd', 'le', 'ot', 'tr', 'tr']
    >>> print(freq_dist(words)[:5])  # top 5 words
    [('aa', 2), ('sd', 2), ('tr', 2), ('re', 1), ('er', 1)]
    """
    fdist = nltk.FreqDist(tokens)
    count = fdist.most_common(len(tokens))
    return count
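A quick end-to-end sketch starting from a raw string rather than a token list (``nltk.word_tokenize`` is used here as a stand-in tokeniser, since ``stdtools.tokenise`` is not shown on this page):

>>> # nltk.download('punkt') may be needed once before word_tokenize
>>> tokens = nltk.word_tokenize("the cat sat on the mat")
>>> freq_dist(tokens)[:2]
[('the', 2), ('cat', 1)]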
def compute_tfidf(text_list, doc_ids=None):
    """ Compute tfidf

    Parameters
    ----------
    text_list : :obj:`list`
        list of texts (documents)
    doc_ids : :obj:`list`
        list of document ids for indexing results

    Returns
    -------
    scores : :obj:`pandas.DataFrame`
        pandas dataframe where every word is a feature and every document is
        an observation

    Notes
    -----
    For a large corpus or a large number of documents it is better to use the
    scikit-learn transformer directly to take advantage of the sparse matrix
    procedures
    """
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(text_list)
    # get_feature_names_out() replaces the deprecated get_feature_names()
    words = vectorizer.get_feature_names_out()

    # densify each document row and map every vocabulary word to its score
    doc_dicts = []
    for doc_scores in X:
        doc_dict = dict()
        for w_i, w_v in enumerate(doc_scores.toarray()[0]):
            doc_dict.update({words[w_i]: w_v})
        doc_dicts.append(doc_dict)

    scores = pd.DataFrame(doc_dicts)
    if doc_ids:
        scores.index = doc_ids
    return scores
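A minimal usage sketch (the texts and ids below are invented for illustration; the vocabulary size follows from scikit-learn's default tokenisation):

>>> docs = ["the cat sat on the mat", "the dog ate my homework"]
>>> scores = compute_tfidf(docs, doc_ids=['d1', 'd2'])
>>> scores.shape  # one row per document, one column per vocabulary word
(2, 9)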
def important_words_per_corpus(scores, n=10):
    """ Based on tfidf scores, return most important words per corpus

    Parameters
    ----------
    scores : :obj:`pandas.DataFrame`
        pandas dataframe where every word is a feature and every document is
        an observation, computed by compute_tfidf method
    n : :obj:`int`
        number of important words to return

    Returns
    -------
    imp_words : :obj:`pandas.Series`
        the n most important words and their average tfidf score across the
        corpus, sorted in descending order
    """
    # average each word's tfidf over all documents and keep the n highest
    imp_words = scores.mean().sort_values(ascending=False)[:n]
    return imp_words
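Continuing the toy example above: with a corpus of only two documents the idf weighting barely penalises the shared word 'the', so it tops the corpus-wide ranking:

>>> top = important_words_per_corpus(scores, n=3)
>>> top.index[0]
'the'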
def important_words_per_doc(scores, doc_id=None, n=5):
    """ Based on tfidf scores, return most important words per document

    Parameters
    ----------
    scores : :obj:`pandas.DataFrame`
        pandas dataframe where every word is a feature and every document is
        an observation, computed by compute_tfidf method
    doc_id : :obj:`list`
        list of document ids for indexing results, default is to compute for
        all documents
    n : :obj:`int`
        number of important words to return per document

    Returns
    -------
    imp_words : :obj:`list`
        list of doc lists where doc list contains tuples of important word
        and its score in the document
    """
    if doc_id is None:
        doc_id = list(scores.index)

    imp_words = []
    for d_id in doc_id:
        # sort the document's word scores and keep the n highest
        doc_scores = scores.loc[d_id].sort_values(ascending=False)
        d_imp_words = []
        for k, v in doc_scores[:n].items():
            d_imp_words.append((k, v))
        imp_words.append(d_imp_words)
    return imp_words
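And per document, continuing the same toy example (exact scores are omitted since the floats depend on scikit-learn's default tf-idf settings):

>>> imp = important_words_per_doc(scores, doc_id=['d1'], n=2)
>>> imp[0][0][0]  # highest-scoring word in document 'd1'
'the'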