# Sentiment Analysis

In this notebook we anaylze data from IMDB about movie reviews, in order to build a model to predict whether any movie review is positive or negative.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm.notebook import tqdm
from datasets import load_dataset
import re

# Check if CUDA is available
print(f"CUDA is available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"Current GPU: {torch.cuda.get_device_name(0)}")

CUDA is available: True
Current GPU: NVIDIA RTX A6000


Now we load the IMDb Dataset using Hugging Face datasets

In [2]:
# Load dataset
dataset = load_dataset("imdb", split="train")

# Convert to pandas DataFrame for easier handling
df = pd.DataFrame(dataset)
df['label'] = df['label'].apply(lambda x: 1 if x == 1 else 0)  # Ensure binary labels

# Take a subset for faster training
df = df.sample(n=5000, random_state=42)

# Split into train and test
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")

print("\nSample reviews and labels:")
for i in range(3):
    print(f"\nReview: {train_df.iloc[i]['text'][:100]}...")
    print(f"Label: {train_df.iloc[i]['label']} ({'positive' if train_df.iloc[i]['label'] == 1 else 'negative'})")


Training samples: 4000
Test samples: 1000

Sample reviews and labels:

Review: It came by surprise. . .the impact & resoloution this film had on my automatic way of being reactive...
Label: 1 (positive)

Review: Normally, I am a pretty generous critic, but in the case of this film I have to say it was incredibl...
Label: 0 (negative)

Review: Last year we were treated to two movies about Truman Capote writing the book from which this film wa...
Label: 1 (positive)


In [3]:
#df

It is necessary to preprocess the text and build the vocabulary

In [4]:
def preprocess_text(text):
    # Convert to lowercase and remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return text.split()

# Build vocabulary from training data
def build_vocabulary(texts, min_freq=5):
    word_freq = {}
    for text in texts:
        words = preprocess_text(text)
        for word in words:
            word_freq[word] = word_freq.get(word, 0) + 1
    
    # Filter by frequency and create vocabulary
    vocab = {'<pad>': 0, '<unk>': 1}
    idx = 2
    for word, freq in word_freq.items():
        if freq >= min_freq:
            vocab[word] = idx
            idx += 1
    
    return vocab

# Create vocabulary from training data
vocab = build_vocabulary(train_df['text'])
print(f"Vocabulary size: {len(vocab)}")

Vocabulary size: 11023


In [5]:
# Create text vectorization functions and Dataset class

def text_to_vector(text, vocab, max_length=500):
    words = preprocess_text(text)
    vector = torch.zeros(len(vocab))
    for word in words[:max_length]:
        idx = vocab.get(word, vocab['<unk>'])
        vector[idx] += 1
    return vector

class SentimentDataset(Dataset):
    def __init__(self, df, vocab):
        self.texts = df['text'].values
        self.labels = torch.tensor(df['label'].values, dtype=torch.float32)
        self.vocab = vocab
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        vector = text_to_vector(text, self.vocab)
        return vector, self.labels[idx]

# Create datasets and dataloaders
train_dataset = SentimentDataset(train_df, vocab)
test_dataset = SentimentDataset(test_df, vocab)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [6]:
# Define neural network model

class SentimentNet(nn.Module):
    def __init__(self, vocab_size):
        super(SentimentNet, self).__init__()
        self.layer1 = nn.Linear(vocab_size, 256)
        self.layer2 = nn.Linear(256, 128)
        self.layer3 = nn.Linear(128, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.bn1 = nn.BatchNorm1d(256)
        self.bn2 = nn.BatchNorm1d(128)
        
    def forward(self, x):
        x = self.dropout(self.relu(self.bn1(self.layer1(x))))
        x = self.dropout(self.relu(self.bn2(self.layer2(x))))
        x = torch.sigmoid(self.layer3(x))
        return x

# Initialize model, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentimentNet(len(vocab)).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [7]:
# Training loop with progress tracking

num_epochs = 10
history = {'train_loss': [], 'train_acc': []}

for epoch in tqdm(range(num_epochs)):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        predicted = (outputs.squeeze() > 0.5).float()
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    
    epoch_loss = total_loss / len(train_loader)
    epoch_acc = 100 * correct / total
    history['train_loss'].append(epoch_loss)
    history['train_acc'].append(epoch_acc)
    
    print(f'Epoch [{epoch+1}/{num_epochs}]')
    print(f'Loss: {epoch_loss:.4f}')
    print(f'Training Accuracy: {epoch_acc:.2f}%')


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]
Loss: 0.4581
Training Accuracy: 77.92%
Epoch [2/10]
Loss: 0.1677
Training Accuracy: 94.95%
Epoch [3/10]
Loss: 0.0721
Training Accuracy: 98.03%
Epoch [4/10]
Loss: 0.0472
Training Accuracy: 98.65%
Epoch [5/10]
Loss: 0.0431
Training Accuracy: 98.35%
Epoch [6/10]
Loss: 0.0531
Training Accuracy: 98.10%
Epoch [7/10]
Loss: 0.0634
Training Accuracy: 97.72%
Epoch [8/10]
Loss: 0.0522
Training Accuracy: 98.03%
Epoch [9/10]
Loss: 0.0391
Training Accuracy: 98.70%
Epoch [10/10]
Loss: 0.0217
Training Accuracy: 99.20%


In [8]:
# Model evaluation

model.eval()
test_correct = 0
test_total = 0
predictions = []
true_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        predicted = (outputs.squeeze() > 0.5).float()
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()
        predictions.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

test_accuracy = 100 * test_correct / test_total
print(f'Test Accuracy: {test_accuracy:.2f}%')


Test Accuracy: 81.90%


Now we can try the model with new reviews

In [9]:
def predict_sentiment(text, model, vocab):
    model.eval()
    with torch.no_grad():
        vector = text_to_vector(text, vocab).unsqueeze(0).to(device)
        prediction = model(vector)
        prob = prediction.item()
        return "Positive" if prob > 0.5 else "Negative", prob

# Test with new reviews
new_reviews = [
    "This film is a masterpiece of modern cinema. The acting and direction were superb.",
    "I was really disappointed with this movie. The plot made no sense and the acting was terrible.",
    "While not perfect, the movie manages to entertain and surprise throughout its runtime.",
    "The movie seems to be entertaining at first, but lacks in everything else.",
    "Interesting movie.",
    "Worst movie ever!",
    "Random sentence."
]

for review in new_reviews:
    sentiment, probability = predict_sentiment(review, model, vocab)
    print(f"Review: {review}")
    print(f"Sentiment: {sentiment} (confidence: {probability:.2%})\n")

Review: This film is a masterpiece of modern cinema. The acting and direction were superb.
Sentiment: Positive (confidence: 89.81%)

Review: I was really disappointed with this movie. The plot made no sense and the acting was terrible.
Sentiment: Negative (confidence: 1.19%)

Review: While not perfect, the movie manages to entertain and surprise throughout its runtime.
Sentiment: Positive (confidence: 94.64%)

Review: The movie seems to be entertaining at first, but lacks in everything else.
Sentiment: Negative (confidence: 34.52%)

Review: Interesting movie.
Sentiment: Positive (confidence: 68.40%)

Review: Worst movie ever!
Sentiment: Negative (confidence: 7.60%)

Review: Random sentence.
Sentiment: Positive (confidence: 69.59%)

