# An Introduction to Natural Language Processing using NLTK

Import NLTK and download required resources.

In [None]:
!pip install -qq svgling
import nltk

nltk.download("book", quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('maxent_ne_chunker_tab', quiet=True)


## Misc

### Stop Words

In [None]:
# Keep the stop words in a set for fast membership tests.
stop_words = set(nltk.corpus.stopwords.words('english'))
# Print them sorted: the iteration order of a set of strings changes between
# kernel sessions, which would make the cell output differ on every run.
print(sorted(stop_words))

### Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

# Pass the part of speech explicitly: "better" is looked up once as an
# adjective and once as an adverb so the two results can be compared.
print("went :", lemmatizer.lemmatize("went", pos=wordnet.VERB))
print("better (adjective):", lemmatizer.lemmatize("better", pos=wordnet.ADJ))
print("better (adverb):", lemmatizer.lemmatize("better", pos=wordnet.ADV))
# No part of speech given here, so the lemmatizer's own default applies.
print("corpora :", lemmatizer.lemmatize("corpora"))

def lemmatize_text(text: str):
  """Yield the lemma of every token in ``text``, in order.

  The text is tokenized with ``nltk.word_tokenize`` and tagged with
  ``nltk.pos_tag``; each tag is mapped to a WordNet part of speech which is
  then handed to the module-level ``lemmatizer``.
  """
  # First letter of the tag produced by nltk.pos_tag -> WordNet POS constant.
  wordnet_pos_by_initial = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'N': wordnet.NOUN,
    'R': wordnet.ADV,
  }
  tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
  for token, tag in tagged_tokens:
    # Any other tag is treated as a noun so lemmatize() always receives a valid POS.
    wordnet_pos = wordnet_pos_by_initial.get(tag[:1], wordnet.NOUN)
    yield lemmatizer.lemmatize(token, pos=wordnet_pos)

# Lemmatize two sentences that both contain "better" and print the lemmas
# joined by single spaces.
for demo_sentence in ("He sings better than before.", "They better leave now."):
  print(" ".join(lemmatize_text(demo_sentence)))

## Tokenization, POS, Entities
Take a sentence and tokenize it into words. Then apply a part-of-speech tagger and a named-entity chunker.

In [None]:
sentence = """At eight o'clock on Thursday morning Arthur Mills didn't feel very good."""

# Split the raw string into word-level tokens.
tokens = nltk.word_tokenize(sentence)

print(tokens)

# Attach a part-of-speech tag to every token.
tagged = nltk.pos_tag(tokens)

# Display it in a nice tree
# (the chunker result is the last expression of the cell, so the notebook renders it)
nltk.chunk.ne_chunk(tagged)


## Concordance

In [None]:
%matplotlib inline

# The cells below use text1 ... text5 without defining them, so those names
# presumably come from this star import — verify against nltk.book.
from nltk.book import *

Generate a key-word in context concordance

In [None]:
# Show the occurrences of "monstrous" in text1 together with their surrounding context.
text1.concordance("monstrous")

Find words with similar concordance to a given word

In [None]:
# For each of the two texts: print its identity, then the words that occur in
# contexts similar to those of "monstrous".
for book_text in (text1, text2):
  print(book_text)
  book_text.similar("monstrous")


Find contexts which are similar for the given words

In [None]:
# Look up the contexts in text2 that "monstrous" and "very" have in common.
text2.common_contexts(["monstrous", "very"])

Plot where in the text certain words appear

In [None]:
# Plot where in text4 each of the listed words occurs.
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America", "and"])

Print the identity of a text, the length of the text and its vocabulary

In [None]:
print(text3)  # identity of the text
print(len(text3))  # length of the text
print(sorted(set(text3)))  # vocabulary: the distinct items of the text, sorted

Print some statistics of word occurrence in the text

In [None]:
def lexical_diversity(text):
  """Return the ratio of distinct items to total items in ``text``.

  ``text`` must support ``len()`` and iteration. An empty text yields 0.0
  instead of raising ZeroDivisionError.
  """
  total_tokens = len(text)
  if total_tokens == 0:
    # Nothing to compare: report 0.0 rather than dividing by zero.
    return 0.0
  return len(set(text)) / total_tokens
def percentage(count, total):
  """Return ``count`` expressed as a percentage of ``total``."""
  scaled_count = 100 * count
  return scaled_count / total

# Ratio of distinct items to total items for two of the texts.
print(lexical_diversity(text3))
print(lexical_diversity(text5))
# Share of text4, in percent, that is made up of the token 'a'.
print(percentage(text4.count('a'), len(text4)))


# NLTK in Action

In the following, we will create a classifier that should tell us whether a movie review is positive or negative ("sentiment analysis").

## Preparing the Dataset

To do so, we will be using the IMDB movie review corpus, which we can fetch through `nltk.corpus.movie_reviews`:

In [None]:
import nltk
from nltk.corpus import movie_reviews
import random
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Make sure the corpus is available locally before reading from it.
nltk.download('movie_reviews', quiet=True)

# Corpus overview: the label set, the total number of documents (summed over
# all labels) and the number of documents per label.
print(f"Classes represented in the movie reviews: {movie_reviews.categories()}")
print(f"#Documents: {sum(len(movie_reviews.fileids(cat)) for cat in movie_reviews.categories())}")
for cat in movie_reviews.categories():
  print(f"#Documents that are {cat}: {len(movie_reviews.fileids(cat))}")

# Write the dataset into a list[tuple[list[str], str]], where the first element
# of the tuple is the document's text and the second element is its label (pos
# or neg)
documents = [(movie_reviews.words(fileid), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
# Shuffle with a dedicated, seeded generator: an unseeded shuffle would make
# the order of `documents` (and with it the train/test split derived from it)
# different on every run. A private Random instance leaves the global random
# state untouched.
random.Random(42).shuffle(documents)

# len(w) is the number of words of one review. Each average is the summed
# length of the selected reviews divided by the number of reviews in that group
# (all documents, or the file ids listed for the 'pos' / 'neg' category).
print(f"Average number of words per review: {(sum(len(w) for w,_ in documents))/len(documents)}")
print(f"Average number of words per positive review: {(sum(len(w) for w,c in documents if c=='pos'))/len(movie_reviews.fileids('pos'))}")
print(f"Average number of words per negative review: {(sum(len(w) for w,c in documents if c=='neg'))/len(movie_reviews.fileids('neg'))}")
## Draw histograms
def render(dist: list, label: str, ax):
  """Draw a histogram of ``dist`` and a density curve for it on ``ax``.

  dist:  iterable of numbers; it is consumed once and copied into a list, so
         generators are accepted as well.
  label: name given to the data column, so that pandas labels the plotted
         artists with it instead of the default column name ``0``.
  ax:    axes to draw on; both plots are added to the same axes.
  """
  # Materialize the values first so that one-shot iterables are handled
  # explicitly, and use `label` as the column name.
  df = pd.DataFrame({label: list(dist)})
  # Histogram with density=True so it shares a y-scale with the density curve.
  df.plot.hist(bins=25, density=True, edgecolor='w', linewidth=0.5, ax=ax, alpha=0.4)
  df.plot.density(color='k', alpha=0.5, ax=ax)

# One figure with a single axes; both label groups are drawn onto it.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 4))
# Set the x-range to 0-4000 before anything is plotted.
ax.set_xlim((0, 4000))
# The generators yield the word count of every review with the given label.
render((len(w) for w,c in documents if c=='pos'), "positive", ax)
render((len(w) for w,c in documents if c=='neg'), "negative", ax)
plt.ylabel("Density")
plt.xticks(np.arange(0, 4000, step=500))
plt.xlabel("Wordcount")
# NOTE(review): only labels are passed here, so they are matched to the plotted
# artists purely by position (the empty strings are presumably meant for the
# two density curves) — confirm the legend entries line up as intended.
ax.legend(labels=['Positive', '', 'Negative', ''], frameon=False)
# Writes the figure to 'score-density.pdf' relative to the current working directory.
plt.savefig('score-density.pdf')
plt.show()
###


# Features
# There are many ways how we could represent the text. For now, we will choose
# to represent it using a BOW model of the 2000 most common words in the dataset
# NOTE(review): stop_words is rebuilt here but not referenced again in the
# cells visible in this notebook, so stop words are not removed from the
# vocabulary built below — confirm this is intended.
stop_words = set(nltk.corpus.stopwords.words('english'))

def normalize_words(words):
  """Lazily yield the lower-cased form of every item in ``words``."""
  yield from (token.lower() for token in words)

# Frequency of every lower-cased token across the whole corpus.
all_words = nltk.FreqDist(normalize_words(movie_reviews.words()))
# Keep only the words of the 2000 most frequent entries, most frequent first.
# A comprehension is used instead of `common_words, _ = zip(*...)` so that the
# notebook's special `_` (last output) variable is not overwritten by the counts.
common_words = tuple(word for word, _count in all_words.most_common(2000))
print(f"The most common words are: {' '.join(common_words[:10])} ...")

# Now that we have the most common words, we represent a document as a 2000
# dimensional boolean vector.
def doc_features(document: list[str], vocabulary=None) -> np.ndarray:
    """Encode ``document`` as a bag-of-words presence vector.

    document:   the words of one document.
    vocabulary: sequence of words that defines the vector positions; when
                omitted, the module-level ``common_words`` is used.

    Returns a 1-D float32 array with one entry per vocabulary word: 1.0 if
    the word occurs in the document, 0.0 otherwise.
    """
    if vocabulary is None:
        vocabulary = common_words
    # A set makes each of the per-word membership tests a constant-time lookup.
    docwords = set(document)
    return np.fromiter(((word in docwords) for word in vocabulary), dtype=np.float32)

# Turn every (words, label) pair into a (feature vector, label) pair.
dataset = [(doc_features(d), c) for (d, c) in documents]
print(dataset[0])

# Finally, we can split our dataset into training and testing splits
# (25% of the pairs go to the test split; random_state fixes the split itself)
train_set, test_set = train_test_split(dataset, test_size=0.25, random_state=42)

# Unzip the pairs: X_* are tuples of feature vectors, y_* tuples of labels.
X_train, y_train = zip(*train_set)
X_test, y_test = zip(*test_set)

## Training Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train logistic regression
# (fit on the training split only; the test split is kept for the evaluation below)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Evaluate
# Predict labels for the held-out reviews and compare them with the true labels.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
# One learned weight per feature; the features were built in the order of
# common_words, so coefs[i] belongs to common_words[i].
coefs = clf.coef_[0]  # for binary classification

def visualize_word_contributions(text_words):
    """Plot the vocabulary words of a text that weigh most in the prediction.

    text_words: the (already normalized) words of one review.

    The text is encoded with ``doc_features`` and classified with the
    module-level ``clf``. Of all words in ``common_words`` that occur in the
    text, the 20 with the largest absolute coefficient are drawn as a
    horizontal bar chart (green for a positive weight, red otherwise); the
    title shows the predicted label and the probability at index 1 of
    ``predict_proba``. If no vocabulary word occurs in the text, a message is
    printed and nothing is plotted.
    """
    # Create features for the input
    X = doc_features(text_words).reshape(1, -1)

    # Get prediction
    prob = clf.predict_proba(X)[0]
    pred = clf.predict(X)[0]

    # Read the weights from the classifier itself so they can never be out of
    # sync with the model used for the prediction above.
    weights_by_index = clf.coef_[0]  # for binary classification

    # A set turns the per-vocabulary-word membership test into a constant-time
    # lookup instead of a scan over the whole text.
    present_words = set(text_words)
    contributions = [(word, weights_by_index[i])
                     for i, word in enumerate(common_words)
                     if word in present_words]

    if not contributions:
        # Without this guard the unpacking below fails on an empty list.
        print(f"Prediction: {pred} (prob pos: {prob[1]:.2f}) - no vocabulary words found in the text")
        return

    # Sort by magnitude and keep the 20 strongest contributions
    contributions.sort(key=lambda x: abs(x[1]), reverse=True)
    contributions = contributions[:20]

    # Plot
    words, weights = zip(*contributions)
    colors = ['green' if w > 0 else 'red' for w in weights]

    plt.figure(figsize=(10, 5))
    plt.barh(words, weights, color=colors)
    plt.xlabel("Contribution to Positive Sentiment")
    # NOTE(review): prob[1] is reported as the 'pos' probability, which assumes
    # 'pos' is the second entry of clf.classes_ — confirm.
    plt.title(f"Prediction: {pred} (prob pos: {prob[1]:.2f})")
    # Largest contribution at the top of the chart.
    plt.gca().invert_yaxis()
    plt.show()

# Example usage
# Take the first review listed under 'pos' as a worked example.
# NOTE(review): the dataset was built from all reviews before splitting, so
# this review may be part of the training split — confirm if that matters here.
example_text = list(movie_reviews.words(movie_reviews.fileids('pos')[0]))
print(" ".join(example_text))
visualize_word_contributions(example_text)


In [None]:
# Hand-written one-sentence review, tokenized and lower-cased in the same way
# as the vocabulary; print the tokens and the feature vector before plotting.
example_text = list(normalize_words(nltk.word_tokenize("The movie is bad")))
print(example_text)
print(doc_features(example_text))
visualize_word_contributions(example_text)

In [None]:
# Same sentence as in the previous example, with "good" in place of "bad".
example_text = list(normalize_words(nltk.word_tokenize("The movie is good")))
print(example_text)
print(doc_features(example_text))
visualize_word_contributions(example_text)

In [None]:
# A sentence that combines "not" with "better" — presumably chosen to show how
# a word-presence model copes with negation.
example_text = list(normalize_words(nltk.word_tokenize("The movie could not have been better")))
print(example_text)
print(doc_features(example_text))
visualize_word_contributions(example_text)

**Questions**
- How could the effectiveness be improved further?


**See Also:**
- [Sentiment Flow – A General Model of Web Review Argumentation](https://downloads.webis.de/publications/papers/wachsmuth_2015a.pdf); Wachsmuth et al. 2015