#!/usr/bin/env python
## Accessory functions for NLP
import pickle
import pprint
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

### Set these according to your setup ###
nltk_path = 'data/'
google_vec_file = nltk_path + 'GoogleNews-vectors-negative300.bin.gz'

### Download the NLTK corpora used below ###
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

def get_wordnet_pos(treebank_tag):
    """Convert the part-of-speech naming scheme
    from the nltk default to that which is
    recognized by the WordNet lemmatizer."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def preprocess_series_text(data):
    """Perform complete preprocessing on a pandas Series,
    including removal of tokens containing digits, normalization
    (lowercasing), punctuation removal, tokenization, stop word
    removal, and lemmatization."""
    # setup nltk path (disabled)
    # if nltk_path:
    #     nltk.data.path.insert(0, nltk_path)
    # remove words containing digits and make lowercase
    alphanum_re = re.compile(r"""\w*\d\w*""")
    alphanum_lambda = lambda x: alphanum_re.sub('', x.strip().lower())
    data = data.map(alphanum_lambda)
    # remove punctuation
    punc_re = re.compile('[%s]' % re.escape(string.punctuation))
    punc_lambda = lambda x: punc_re.sub(' ', x)
    data = data.map(punc_lambda)
    # tokenize words
    data = data.map(word_tokenize)
    # remove stop words
    sw = stopwords.words('english')
    sw_lambda = lambda x: list(filter(lambda y: y not in sw, x))
    data = data.map(sw_lambda)
    # part-of-speech tagging--must convert tags to the format used by the lemmatizer
    data = data.map(nltk.pos_tag)
    pos_lambda = lambda x: [(y[0], get_wordnet_pos(y[1])) for y in x]
    data = data.map(pos_lambda)
    # lemmatization
    lemmatizer = WordNetLemmatizer()
    lem_lambda = lambda x: [lemmatizer.lemmatize(*y) for y in x]
    data = data.map(lem_lambda)
    return data.map(' '.join)

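# Example (sketch): on a toy Series such as
#     pd.Series(["The 2 cats are running fast!"])
# the pipeline above yields roughly "cat run fast" -- digits, punctuation, and
# stop words are stripped, and the remaining tokens are lemmatized according
# to their part of speech. The sample sentence is invented for illustration
# and is not part of any project dataset.
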
def read_pickle_file(fileName):
    """
    @fileName: path to a pickle file saved on local disk
    @return: the deserialized object
    """
    with open(fileName, 'rb') as pkl_file:
        data = pickle.load(pkl_file)
    return data

def get_score(lda_model, doc2vec):
    """
    @lda_model: trained LDA/LSA topic model
    @doc2vec: vector representation of a document accepted by the model
    """
    for index, score in sorted(lda_model[doc2vec], key=lambda tup: -1 * tup[1]):
        print("\nScore: {}\nTopic: {}\nWords: {}".format(score, index, lda_model.print_topic(index, 10)))

def write_pickle_file(data, fileName):
    """
    @data: object to serialize
    @fileName: pickle file name to write to
    """
    with open(fileName, 'wb') as output:
        pickle.dump(data, output)
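
# Minimal smoke test (a sketch; 'example.pkl' is an arbitrary file name chosen
# for illustration and is not referenced elsewhere in the project). It
# round-trips a small object through the pickle helpers when the module is
# run directly rather than imported.
if __name__ == '__main__':
    write_pickle_file({'answer': 42}, 'example.pkl')
    print(read_pickle_file('example.pkl'))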