"""
Contains functions for natural language processing
"""
import string
from nltk.stem.snowball import SnowballStemmer
def get_stopwords():
"""Get list of Norwegian and English stopwords.
Get stopwords list from Spacy for Norwegian and English, returning each list separately as well as a combined list
Returns
-------
stopwords : :obj:`list`
lists of combination of Norwegian and English stopwords, Norwegian stopwords, English stopwords
stops_nb : :obj:`list`
lists of Norwegian stopwords
stops_en : :obj:`list`
lists of English stopwords
Notes
-----
This has assumed both norwegian and english language models have been downloaded
Examples
--------
>>> stopwords_all, stopwords_norwegian, stopwords_english = get_stopwords()
>>> print(stopwords[:5])
['therein', 'neither', 'indeed', 'whereby', 'yourself']
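    >>> # Membership checks work on each list; 'og' ("and") should appear in
    >>> # spaCy's bundled Norwegian list (an assumption about that list):
    >>> 'og' in stopwords_norwegian
    True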
"""
    # Import the two stopword sets under distinct names so neither shadows
    # the other
    from spacy.lang.en.stop_words import STOP_WORDS as STOP_WORDS_EN
    from spacy.lang.nb.stop_words import STOP_WORDS as STOP_WORDS_NB
    stops_en = list(STOP_WORDS_EN)
    stops_nb = list(STOP_WORDS_NB)
    stopwords = stops_en + stops_nb
    stopwords = [str(i) for i in stopwords]
    return stopwords, stops_nb, stops_en

def rm_stopwords(model, text, stopwords):
"""Remove stopwords from string.
Parameters
----------
model : :obj:`spacy.lang`
SpaCy language model
text : :obj:`str`
text string on which to remove stopwords
stopwords : :obj:`list`
list of stopwords to remove
Returns
-------
updated_text : :obj:`str`
Updated version of input string with stopwords (and possibly punctuation) removed
Notes
-----
String output is to allow piping between functions to return words as a list use: tokenise(rm_stopwords(...))
Examples
--------
>>> import spacy
>>> lang_mod = spacy.load('nb_dep_ud_sm')
>>> text = 'Den raske brune reven hoppet over den late hunden.'
>>> stopwords_all, stopwords_norwegian, stopwords_english = get_stopwords()
>>> print (rm_stopwords(lang_mod, text, stopwords_all))
raske brune reven hoppet late hunden.
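    >>> # Piping through tokenise() returns the words as a list instead of a
    >>> # string (token boundaries depend on the loaded model):
    >>> tokenise(lang_mod, rm_stopwords(lang_mod, text, stopwords_all))
    ['raske', 'brune', 'reven', 'hoppet', 'late', 'hunden', '.']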
"""
    # Remove stopwords (comparison is case-insensitive)
    tokens = [t.text for t in model(text) if t.text.lower() not in stopwords]
    # Join the tokens back into a single string
    updated_text = ' '.join(tokens)
    # If punctuation is not being removed, retain the original spacing around
    # it, i.e. 'end.' not 'end .'
    # updated_text = ' '.join(updated_text.split())
    updated_text = retain_spaces(updated_text)
    return updated_text

def rm_punctuation(model, text):
"""Return string free of punctuation
Parameters
----------
model : :obj:`spacy.lang`
SpaCy language model
text : :obj:`str`
text string on which to remove stopwords
Returns
-------
updated_text : :obj:`str`
Updated version of input string where punctuation has been removed
Notes
-----
String output is to allow piping between functions to return words as a list use: to_list(remove_punctuation(...))
Examples
--------
>>> import spacy
>>> lang_mod = spacy.load('en_core_web_md')
>>> text = 'I better have passed that test - it is 90 percent of the class grade.'
>>> print (rm_punctuation(lang_mod, text))
I better have passed that test it is 90 percent of the class grade
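    >>> # Piping through tokenise() gives the words as a list (exact output
    >>> # depends on the loaded model's tokenisation):
    >>> tokenise(lang_mod, rm_punctuation(lang_mod, text))
    ['I', 'better', 'have', 'passed', 'that', 'test', 'it', 'is', '90', 'percent', 'of', 'the', 'class', 'grade']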
"""
    # Keep only tokens that are not punctuation marks
    tokens = [t.text for t in model(text) if t.text not in string.punctuation]
    updated_text = ' '.join(tokens)
    return updated_text

def spacy_lemmatize(model, text):
"""Return string of lemmatized text
Lemmatization is the process of reducing the different forms of a word to one single form, for example,
reducing "builds", "building", or "built" to the lemma "build"
Parameters
----------
model : :obj:`spacy.lang`
SpaCy language model
text : :obj:`str`
text string on which to remove stopwords
Returns
-------
updated_text : :obj:`str`
Updated version of input string where words have been lemmatized
Notes
-----
String output is to allow piping between functions to return words as a list use: to_list(lemmatize(...))
Examples
--------
>>> import spacy
>>> lang_mod = spacy.load('nb_dep_ud_sm')
>>> text = 'Den raske brune reven hoppet over den late hunden.'
>>> print (spacy_lemmatize(lang_mod,text))
'den rask brun rev hoppe over den lat hund.'
"""
    # Keep the original text for pronouns, which spaCy lemmatizes to the
    # placeholder '-PRON-'; use the lemma for everything else
    lemma_tx = [t.text if t.lemma_ == '-PRON-' else t.lemma_ for t in model(text)]
    updated_text = ' '.join(lemma_tx)
    updated_text = retain_spaces(updated_text)
    return updated_text

def nltk_stem_no(model, text):
"""Return string of stemmed text using NLTK's Norwegian snowball stemmer
Stemming is a technique to remove affixes from a word, ending up with the stem.
For example, the stem of cooking is cook.
Parameters
----------
model : :obj:`spacy.lang`
SpaCy language model
text : :obj:`str`
text string on which to remove stopwords
Returns
-------
updated_text : :obj:`str`
Updated version of input string where words have been stemmed
Notes
-----
String output is to allow piping between functions to return words as a list use: to_list(stem_norwegian(...))
Examples
--------
>>> import spacy
>>> lang_mod = spacy.load('nb_dep_ud_sm')
>>> text = 'Den raske brune reven hoppet over den late hunden.'
>>> print (nltk_stem_no(lang_mod,text))
den rask brun rev hopp over den lat hund.
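    >>> # The stemmer can also be applied to single words; as in the sentence
    >>> # above, 'hunden' stems to 'hund':
    >>> SnowballStemmer("norwegian").stem('hunden')
    'hund'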
"""
    stemmer = SnowballStemmer("norwegian")
    # Tokenise with spaCy, then stem each token
    stemmed_words = [stemmer.stem(t.text) for t in model(text)]
    updated_text = ' '.join(stemmed_words)
    updated_text = retain_spaces(updated_text)
    return updated_text

def tokenise(model, text):
"""Return list of tokens for a piece of text.
A token is a string of contiguous characters between two spaces, or between a space and punctuation marks.
A token can also be an integer, real, or a number with a colon (time, for example: 2:00).
All other symbols are tokens themselves except apostrophes and quotation marks in a word (with no space),
which in many cases symbolize acronyms or citations.
Parameters
----------
model : :obj:`spacy.lang`
SpaCy language model
text : :obj:`str`
text string on which to remove stopwords
Returns
-------
tokens : :obj:`list`
List of tokens, list is ordered as tokens appear in sentence.
Examples
--------
>>> import spacy
>>> lang_mod = spacy.load('nb_dep_ud_sm')
>>> text = 'Den raske brune reven hoppet over den late hunden.'
>>> print (tokenise(lang_mod,text))
['Den', 'raske', 'brune', 'reven', 'hoppet', 'over', 'den', 'late', 'hunden', '.']
"""
    # Let the spaCy model do the tokenisation and collect the token texts
    tokens = [t.text for t in model(text)]
    return tokens

def retain_spaces(processed):
"""Retaining spaces around punctuation at the end of a sentence
Function for use when joining tokens and wishing to retain original spacing around punctuation.
without function - lemma = 'the quick brown fox jump over the lazy dog .'
with function - lemma = 'the quick brown fox jump over the lazy dog.'
Parameters
----------
processed : :obj:`str`
processed text string
Returns
-------
updated_text : :obj:`str`
updated processed sentence to ensure same spacing around symbols as in original
Notes
-----
Have only accounted for punctuation at the end of a sentence and not others, for example % or $ or # etc.
Examples
--------
>>> tokens = ['Den', 'raske', 'brune', 'reven', 'hoppet', 'over', 'den', 'late', 'hunden', '.']
>>> joined_tokens = ' '.join(tokens)
>>> print ('Original: ', joined_tokens)
>>> print ('Fixed spaces: ', retain_spaces(joined_tokens))
Original: Den raske brune reven hoppet over den late hunden .
Fixed spaces: Den raske brune reven hoppet over den late hunden.
"""
    # Collect the indices of spaces that sit directly before
    # sentence-ending punctuation
    i_to_rm = []
    char_list = list(processed)
    for i, char in enumerate(processed):
        if i > 0 and char in ['.', '?', '!'] and processed[i - 1] == ' ':
            i_to_rm.append(i - 1)
    # Delete from the end so the remaining indices stay valid
    for index in sorted(i_to_rm, reverse=True):
        del char_list[index]
    # Rebuild the string without the unwanted spaces
    updated_text = ''.join(char_list)
    return updated_text
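

if __name__ == '__main__':
    # Minimal end-to-end sketch of how the helpers compose. This assumes the
    # Norwegian model 'nb_dep_ud_sm' used in the examples above is installed;
    # any installed spaCy model can be swapped in.
    import spacy

    lang_mod = spacy.load('nb_dep_ud_sm')
    sample = 'Den raske brune reven hoppet over den late hunden.'
    stopwords_all, _, _ = get_stopwords()

    # Strip stopwords, lemmatize what remains, then split into tokens
    cleaned = rm_stopwords(lang_mod, sample, stopwords_all)
    lemmatized = spacy_lemmatize(lang_mod, cleaned)
    print(tokenise(lang_mod, lemmatized))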