Machine Learning & Predictive Modeling¶

Can Character Strengths Predict Racial Group Membership?¶

Kirsten L. Calloway | Howard University | DATA 801 Capstone | Spring 2026


Overview and Research Context¶

This analysis builds directly on the statistical foundation established in my Statistical Analysis & Data Preparation assignment, where I identified distinct character strength constellations across three racial groups (Monoracial Black, Monoracial White, and Black-White Biracial) and diagnosed class imbalance as the primary data quality concern for machine learning.

The classification question is: can a machine learning model predict racial group membership from character strength profiles alone — and when it fails, who does it fail to see? This analysis is not designed to determine which group is "stronger" or "better." Character strength differences across racial groups reflect adaptive responses to racialized experiences (Gillborn et al., 2018), not inherent group characteristics. The purpose of classification here is diagnostic: if the model struggles most with Black-White biracial individuals — misclassifying them into monoracial categories — that reveals, in algorithmic systems, the same erasure that exists in the psychometric instruments themselves. Models trained on data collected through monoracial-default measures learn monoracial patterns and cannot reliably detect multiracial identity. The feature importance analysis then reveals which character strengths the model relies on to distinguish groups, directly testing whether Spirituality, Gratitude, and Hope — the three strengths unique to the Monoracial Black top 10 — are among the most predictive features.

Connection to Broader Research Program¶

This classification task is Phase 1 of an integrated research program. My ML course project, "Listening to the Margins: Multiracial Identity, Linguistic Fluidity, and AI Failure," examines how AI systems handle multiracial identity across multiple domains. The present analysis tests the foundational premise: that multiracial individuals carry distinct psychological profiles that measurement systems must be designed to detect rather than flatten.

This work is grounded in Dr. Joseph L. White's (1984) documentation of psychological strengths in Black communities and Dr. Jacqueline Mattis's scholarship on generational hope (Hellman & Mattis, 2023). Following QuantCrit principles (Gillborn et al., 2018), I treat the classification results as reflections of adaptive responses to racialized experiences rather than inherent group characteristics.

1. Libraries and Setup¶

All libraries used are from scikit-learn and standard Python data science packages. The modeling approach follows Raschka and Mirjalili (2017), chapters 3, 4, and 6.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix,
    ConfusionMatrixDisplay, f1_score, accuracy_score
)

# Set visual style
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.family'] = 'sans-serif'
sns.set_style('whitegrid')

# Capstone color palette
HERO_GOLD = '#C9963A'
HERO_DEEP = '#8C6B22'
CONTEXT_DARK = '#3A352D'
CONTEXT_MID = '#8A8276'
CONTEXT_LIGHT = '#D8D2C4'

# Reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print('Libraries loaded successfully.')
Libraries loaded successfully.

2. Data Loading and Preparation¶

The dataset contains 7,047 U.S. participants who completed the VIA Character Strengths Survey, with 24 character strength scores (1-5 scale) and racial group identification.

In [2]:
# Load dataset
df = pd.read_excel('data202_lab3_data.xlsx')
df.columns = df.columns.str.strip()

# Define all 24 VIA character strengths
strength_names = [
    'Appreciation of Beauty & Excellence', 'Bravery', 'Love', 'Prudence', 'Teamwork',
    'Creativity', 'Curiosity', 'Fairness', 'Forgiveness', 'Gratitude',
    'Honesty', 'Hope', 'Humor', 'Perseverance', 'Judgment',
    'Kindness', 'Leadership', 'Love of Learning', 'Humility', 'Perspective',
    'Self-Regulation', 'Social Intelligence', 'Spirituality', 'Zest'
]

# Create Race_Group variable from VIA racial codes
race_mapping = {
    '2332': 'Monoracial White',
    '2333': 'Monoracial Black',
    '2332, 2333': 'Black-White Biracial'
}
df['Race_Group'] = df['Race'].astype(str).map(race_mapping)

# Filter to target groups and convert strengths to numeric
ml_df = df[df['Race_Group'].notna()].copy()
for col in strength_names:
    ml_df[col] = pd.to_numeric(ml_df[col], errors='coerce')

# Drop rows with missing strength values
ml_df = ml_df.dropna(subset=strength_names)

print(f'Dataset shape: {ml_df.shape}')
print(f'Features: {len(strength_names)} character strengths')
print(f'\nClass distribution:')
print(ml_df['Race_Group'].value_counts())
print(f'\nClass percentages:')
print(round(ml_df['Race_Group'].value_counts(normalize=True) * 100, 1))
/opt/conda/envs/anaconda-2022.05-py39/lib/python3.9/site-packages/openpyxl/worksheet/_reader.py:211: UserWarning: Cell BO2819 is marked as a date but the serial value 3141984 is outside the limits for dates. The cell will be treated as an error.
  warn(msg)
Dataset shape: (7047, 81)
Features: 24 character strengths

Class distribution:
Black-White Biracial    3738
Monoracial White        2868
Monoracial Black         441
Name: Race_Group, dtype: int64

Class percentages:
Black-White Biracial    53.0
Monoracial White        40.7
Monoracial Black         6.3
Name: Race_Group, dtype: float64

2.1 Documenting the Class Imbalance¶

Before proceeding, I visualize the class distribution to justify the need for imbalance handling. As discussed in Raschka and Mirjalili (2017, ch. 6) and in DATA 303 Module 3 (Ku, 2026), accuracy is misleading for imbalanced datasets: a model that predicts the majority class (Black-White Biracial, ~53%) for every observation would achieve roughly 53% accuracy while recalling zero members of the Monoracial Black group (6.3%).
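The naive baseline can be made concrete with scikit-learn's DummyClassifier on hypothetical labels drawn with the same class proportions (synthetic data, not the VIA sample):

```python
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, recall_score

rng = np.random.default_rng(42)
# Hypothetical labels mirroring the observed proportions:
# 0 = Black-White Biracial (53%), 1 = Monoracial Black (6.3%), 2 = Monoracial White (40.7%)
y = rng.choice([0, 1, 2], size=1000, p=[0.530, 0.063, 0.407])
X = np.zeros((len(y), 1))  # features are irrelevant to a majority-class baseline

baseline = DummyClassifier(strategy='most_frequent').fit(X, y)
preds = baseline.predict(X)

print(f'Baseline accuracy: {accuracy_score(y, preds):.3f}')
print(f'Minority-class recall: {recall_score(y, preds, labels=[1], average=None)[0]:.3f}')
```

The baseline's accuracy tracks the majority-class share while its recall on the smallest class is exactly zero — which is why the evaluation in Section 5 reports per-class recall and F1 rather than accuracy alone.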

In [4]:
# Visualize class imbalance
class_counts = ml_df['Race_Group'].value_counts()
colors = [CONTEXT_LIGHT, CONTEXT_MID, HERO_GOLD]  # value_counts() order: BWB, MonoWhite, MonoBlack (gold highlights the smallest class)

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(class_counts.index, class_counts.values, color=colors, edgecolor='none')

# Add count labels on bars
for bar, count in zip(bars, class_counts.values):
    pct = count / class_counts.sum() * 100
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 50,
            f'n={count}\n({pct:.1f}%)', ha='center', va='bottom',
            fontsize=11, color=CONTEXT_DARK)

ax.set_ylabel('Number of Participants', fontsize=12)
ax.set_title('Class Distribution: Why Imbalance Handling is Required',
             fontsize=13, fontweight='bold', color=CONTEXT_DARK)
ax.set_ylim(0, max(class_counts.values) * 1.2)
sns.despine()
plt.tight_layout()
plt.show()

imbalance_ratio = class_counts.max() / class_counts.min()
print(f'Imbalance ratio (largest / smallest): {imbalance_ratio:.1f} : 1')
[Figure: bar chart of class distribution with counts and percentages per group]
Imbalance ratio (largest / smallest): 8.5 : 1

3. Feature Engineering and Train/Test Split¶

3.1 Train/Test Split¶

Following Raschka and Mirjalili (2017, ch. 6), I use a stratified 80/20 train/test split. The stratify parameter ensures that each split preserves the original class proportions, which is critical for imbalanced datasets (Ku, 2026, Module 3).
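As a quick illustration, the proportion-preserving effect of stratify can be checked on synthetic labels drawn with roughly the same class shares (hypothetical data, not the VIA sample):

```python
import numpy as np
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
# Hypothetical three-class labels with shares close to 53% / 41% / 6%
y = rng.choice(['A', 'B', 'C'], size=2000, p=[0.53, 0.41, 0.06])
X = rng.normal(size=(2000, 4))

# Stratified 80/20 split: each partition preserves the class proportions
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

for cls in ['A', 'B', 'C']:
    print(f'{cls}: full={np.mean(y == cls):.3f}  '
          f'train={np.mean(y_tr == cls):.3f}  test={np.mean(y_te == cls):.3f}')
```

Without stratify=y, the ~6% class can end up noticeably over- or under-represented in a 20% test split, especially at smaller sample sizes.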

In [5]:
# Separate features and target
X = ml_df[strength_names].values
y = ml_df['Race_Group'].values

# Encode target labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)
print('Encoded classes:', dict(zip(le.classes_, le.transform(le.classes_))))

# 80/20 stratified split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=RANDOM_STATE, stratify=y_encoded
)

print(f'\nTraining set: {X_train.shape[0]} observations')
print(f'Test set: {X_test.shape[0]} observations')

print(f'\nTraining distribution:')
for cls, name in enumerate(le.classes_):
    count = np.sum(y_train == cls)
    print(f'  {name}: {count} ({round(count/len(y_train)*100, 1)}%)')

print(f'\nTest distribution:')
for cls, name in enumerate(le.classes_):
    count = np.sum(y_test == cls)
    print(f'  {name}: {count} ({round(count/len(y_test)*100, 1)}%)')
Encoded classes: {'Black-White Biracial': 0, 'Monoracial Black': 1, 'Monoracial White': 2}

Training set: 5637 observations
Test set: 1410 observations

Training distribution:
  Black-White Biracial: 2990 (53.0%)
  Monoracial Black: 353 (6.3%)
  Monoracial White: 2294 (40.7%)

Test distribution:
  Black-White Biracial: 748 (53.0%)
  Monoracial Black: 88 (6.2%)
  Monoracial White: 574 (40.7%)

3.2 Feature Scaling¶

Following Raschka and Mirjalili (2017, ch. 4), I standardize features using StandardScaler. The scaler is fit on the training data only, then applied to both training and test sets to prevent data leakage.

In [6]:
# Scale features: fit on training data, transform both sets
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print('Features standardized using StandardScaler (Raschka ch. 4).')
print(f'Training mean (should be ~0): {X_train_scaled.mean():.6f}')
print(f'Training std (should be ~1): {X_train_scaled.std():.6f}')
Features standardized using StandardScaler (Raschka ch. 4).
Training mean (should be ~0): 0.000000
Training std (should be ~1): 1.000000

3.3 Handling Class Imbalance: class_weight='balanced'¶

Rather than generating synthetic samples with SMOTE (which I implemented in R in my Statistical Analysis assignment), I use scikit-learn's built-in class_weight='balanced' parameter. As described in Raschka and Mirjalili (2017, ch. 6), this automatically adjusts the cost function to penalize misclassification of minority classes more heavily, inversely proportional to class frequency. The effect is equivalent: the model treats each class as equally important during training, preventing it from optimizing solely for the majority class.

Both LogisticRegression and RandomForestClassifier in scikit-learn support this parameter natively. In my Statistics assignment, I demonstrated SMOTE as a data-level remediation approach. Here, I use an algorithm-level approach, demonstrating understanding of both strategies (Ku, 2026, Module 3; Raschka & Mirjalili, 2017).

4. Model Building: The Tournament¶

I train two distinct algorithms appropriate for multiclass classification. The selection is deliberate: Logistic Regression provides a linear baseline with interpretable coefficients, while Random Forest captures non-linear interactions between character strengths that a linear model cannot detect (Raschka & Mirjalili, 2017, ch. 3). If the Random Forest significantly outperforms the logistic model, that suggests the relationship between character strengths and racial group membership involves interactions — consistent with the constellation framework, where it is the pattern of strengths (not any single strength) that distinguishes groups.

4.1 Model 1: Logistic Regression (Multinomial)¶

In [7]:
# Multinomial logistic regression with class_weight='balanced'
logit_model = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=1000,
    class_weight='balanced',
    random_state=RANDOM_STATE
)
logit_model.fit(X_train_scaled, y_train)

# Predict on test set
logit_preds = logit_model.predict(X_test_scaled)

print('Logistic Regression trained successfully.')
print(f'  class_weight: balanced')
print(f'  Training accuracy: {logit_model.score(X_train_scaled, y_train):.4f}')
print(f'  Test accuracy: {accuracy_score(y_test, logit_preds):.4f}')
Logistic Regression trained successfully.
  class_weight: balanced
  Training accuracy: 0.4527
  Test accuracy: 0.4340

4.2 Model 2: Random Forest¶

In [8]:
# Random Forest with class_weight='balanced'
rf_model = RandomForestClassifier(
    n_estimators=500,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    class_weight='balanced',
    random_state=RANDOM_STATE,
    n_jobs=-1
)
rf_model.fit(X_train_scaled, y_train)

# Predict on test set
rf_preds = rf_model.predict(X_test_scaled)

print('Random Forest trained successfully.')
print(f'  class_weight: balanced')
print(f'  n_estimators: 500')
print(f'  max_depth: 15')
print(f'  max_features: sqrt(24) ≈ 5')
print(f'  Training accuracy: {rf_model.score(X_train_scaled, y_train):.4f}')
print(f'  Test accuracy: {accuracy_score(y_test, rf_preds):.4f}')
Random Forest trained successfully.
  class_weight: balanced
  n_estimators: 500
  max_depth: 15
  max_features: sqrt(24) ≈ 5
  Training accuracy: 0.9980
  Test accuracy: 0.5738

4.3 Hyperparameter Discussion¶

For the Random Forest, n_estimators=500 provides sufficient ensemble diversity. max_features='sqrt' samples approximately 5 features at each split (sqrt(24) ≈ 5), which is the scikit-learn default for classification and reduces correlation between trees (Raschka & Mirjalili, 2017, ch. 3). max_depth=15 and min_samples_split=5 provide some regularization, but the large gap between training accuracy (0.998) and test accuracy (0.574) shows the forest still overfits; a lower max_depth or a higher min_samples_leaf would be the natural next tuning step.
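The regularizing effect of max_depth can be sketched on synthetic data with a deliberately weak, noisy signal (hypothetical data; only the 24-feature count mirrors this analysis):

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
# 24 features, weak noisy signal in the first two — loosely mimicking
# overlapping psychological profiles with large within-group variation
X = rng.normal(size=(1500, 24))
y = (X[:, 0] + X[:, 1] + rng.normal(scale=2.0, size=1500) > 0).astype(int)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

gaps = {}
for depth in [3, 15, None]:
    rf = RandomForestClassifier(n_estimators=100, max_depth=depth,
                                random_state=0, n_jobs=-1).fit(X_tr, y_tr)
    gaps[depth] = rf.score(X_tr, y_tr) - rf.score(X_te, y_te)
    print(f'max_depth={depth}: train-test gap = {gaps[depth]:.3f}')
```

On data like this, shallower trees shrink the train-test gap at little cost in test accuracy, which is the tradeoff the depth cap above is negotiating.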

For the Logistic Regression, solver='lbfgs' is appropriate for multinomial classification, and max_iter=1000 ensures convergence with 24 features (Raschka ch. 3).

Both models use class_weight='balanced', which automatically assigns class weights inversely proportional to class frequencies: w_j = n_samples / (n_classes × n_samples_j). This means the Monoracial Black class (6.3%) receives approximately 8.5× the weight of Black-White Biracial (53%), ensuring the model does not optimize by ignoring the smallest class (Raschka ch. 6).
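The weight formula can be verified with scikit-learn's compute_class_weight, using the training-set counts reported in Section 3.1:

```python
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Training-set counts from Section 3.1
counts = {'Black-White Biracial': 2990, 'Monoracial Black': 353, 'Monoracial White': 2294}
y_tr = np.concatenate([np.full(n, name) for name, n in counts.items()])

classes = np.array(sorted(counts))
weights = compute_class_weight('balanced', classes=classes, y=y_tr)
for cls, w in zip(classes, weights):
    print(f'{cls}: w = {w:.3f}')  # w_j = n_samples / (n_classes * n_samples_j)

print(f'Largest / smallest weight ratio: {weights.max() / weights.min():.1f}x')
```

The printed ratio reproduces the roughly 8.5:1 figure above, since the weight ratio equals the inverse of the class-count ratio (2990/353).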

5. Model Evaluation¶

As discussed in DATA 303 (Ku, 2026, Module 3) and Raschka and Mirjalili (2017, ch. 6), accuracy alone is misleading for imbalanced classification. I report confusion matrices and per-class precision, recall, and F1-score. Evaluation prioritizes F1-score and per-class recall, since a naive classifier predicting only the majority class would achieve ~53% accuracy while detecting zero Monoracial Black participants.

5.1 Logistic Regression: Confusion Matrix and Classification Report¶

In [9]:
print('=' * 60)
print('MULTINOMIAL LOGISTIC REGRESSION — TEST SET EVALUATION')
print('=' * 60)
print()
print(classification_report(y_test, logit_preds, target_names=le.classes_))

# Visualize confusion matrix
fig, ax = plt.subplots(figsize=(8, 6))
cm_logit = confusion_matrix(y_test, logit_preds)
disp = ConfusionMatrixDisplay(cm_logit, display_labels=le.classes_)
disp.plot(ax=ax, cmap='YlOrBr', values_format='d')
ax.set_title('Confusion Matrix: Logistic Regression',
             fontsize=14, fontweight='bold', color=CONTEXT_DARK)
plt.xticks(rotation=15, ha='right')
plt.tight_layout()
plt.show()
============================================================
MULTINOMIAL LOGISTIC REGRESSION — TEST SET EVALUATION
============================================================

                      precision    recall  f1-score   support

Black-White Biracial       0.62      0.39      0.48       748
    Monoracial Black       0.15      0.68      0.25        88
    Monoracial White       0.48      0.45      0.47       574

            accuracy                           0.43      1410
           macro avg       0.42      0.51      0.40      1410
        weighted avg       0.53      0.43      0.46      1410

[Figure: confusion matrix for the logistic regression model]

5.2 Random Forest: Confusion Matrix and Classification Report¶

In [10]:
print('=' * 60)
print('RANDOM FOREST — TEST SET EVALUATION')
print('=' * 60)
print()
print(classification_report(y_test, rf_preds, target_names=le.classes_))

# Visualize confusion matrix
fig, ax = plt.subplots(figsize=(8, 6))
cm_rf = confusion_matrix(y_test, rf_preds)
disp = ConfusionMatrixDisplay(cm_rf, display_labels=le.classes_)
disp.plot(ax=ax, cmap='YlOrBr', values_format='d')
ax.set_title('Confusion Matrix: Random Forest',
             fontsize=14, fontweight='bold', color=CONTEXT_DARK)
plt.xticks(rotation=15, ha='right')
plt.tight_layout()
plt.show()
============================================================
RANDOM FOREST — TEST SET EVALUATION
============================================================

                      precision    recall  f1-score   support

Black-White Biracial       0.59      0.74      0.66       748
    Monoracial Black       0.33      0.05      0.08        88
    Monoracial White       0.54      0.44      0.49       574

            accuracy                           0.57      1410
           macro avg       0.49      0.41      0.41      1410
        weighted avg       0.56      0.57      0.55      1410

[Figure: confusion matrix for the random forest model]

5.3 Model Comparison¶

In [11]:
print('=' * 60)
print('MODEL COMPARISON')
print('=' * 60)

logit_acc = accuracy_score(y_test, logit_preds)
rf_acc = accuracy_score(y_test, rf_preds)

logit_f1 = f1_score(y_test, logit_preds, average=None, labels=range(len(le.classes_)))
rf_f1 = f1_score(y_test, rf_preds, average=None, labels=range(len(le.classes_)))

print(f'\nOverall Accuracy:')
print(f'  Logistic Regression: {logit_acc:.4f}')
print(f'  Random Forest:       {rf_acc:.4f}')

print(f'\nPer-Class F1 Scores:')
comparison = pd.DataFrame({
    'Class': le.classes_,
    'Logistic_F1': [round(f, 4) for f in logit_f1],
    'RF_F1': [round(f, 4) for f in rf_f1]
})
print(comparison.to_string(index=False))

winner = 'Random Forest' if rf_acc > logit_acc else 'Logistic Regression'
print(f'\nBetter performing model: {winner}')
============================================================
MODEL COMPARISON
============================================================

Overall Accuracy:
  Logistic Regression: 0.4340
  Random Forest:       0.5738

Per-Class F1 Scores:
               Class  Logistic_F1  RF_F1
Black-White Biracial       0.4800 0.6571
    Monoracial Black       0.2454 0.0800
    Monoracial White       0.4665 0.4871

Better performing model: Random Forest

5.4 Cross-Validation¶

To verify that the results are not artifacts of a single train/test split, I perform 5-fold stratified cross-validation on both models (Raschka ch. 6; Ku, 2026, Module 3).

In [12]:
# 5-fold stratified cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

logit_cv = cross_val_score(
    LogisticRegression(multi_class='multinomial', solver='lbfgs',
                       max_iter=1000, class_weight='balanced',
                       random_state=RANDOM_STATE),
    X_train_scaled, y_train, cv=cv, scoring='f1_weighted'
)

rf_cv = cross_val_score(
    RandomForestClassifier(n_estimators=500, max_depth=15,
                           class_weight='balanced',
                           random_state=RANDOM_STATE, n_jobs=-1),
    X_train_scaled, y_train, cv=cv, scoring='f1_weighted'
)

print('=== 5-Fold Stratified Cross-Validation (Weighted F1) ===')
print(f'\nLogistic Regression: {logit_cv.mean():.4f} (+/- {logit_cv.std():.4f})')
print(f'  Fold scores: {[round(s, 4) for s in logit_cv]}')
print(f'\nRandom Forest:       {rf_cv.mean():.4f} (+/- {rf_cv.std():.4f})')
print(f'  Fold scores: {[round(s, 4) for s in rf_cv]}')

if rf_cv.std() < logit_cv.std():
    print(f'\nRandom Forest shows lower variance across folds, suggesting more consistent performance.')
else:
    print(f'\nLogistic Regression shows lower variance across folds, suggesting more consistent performance.')
=== 5-Fold Stratified Cross-Validation (Weighted F1) ===

Logistic Regression: 0.4786 (+/- 0.0115)
  Fold scores: [0.4865, 0.4859, 0.4561, 0.4843, 0.4801]

Random Forest:       0.5447 (+/- 0.0200)
  Fold scores: [0.5327, 0.5286, 0.5388, 0.5839, 0.5394]

Logistic Regression shows lower variance across folds, suggesting more consistent performance.
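One refinement worth noting: the folds above reuse X_train_scaled, whose scaler was fit on the entire training set, so each fold's validation portion has already influenced the scaling statistics. The standard remedy is to wrap the scaler and model in the Pipeline imported in Section 1, so cross_val_score refits the scaler on each fold's training portion. A minimal sketch on synthetic data (hypothetical; rerunning here would mean passing the unscaled X_train and y_train):

```python
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

rng = np.random.default_rng(42)
X = rng.normal(loc=3.0, scale=1.0, size=(600, 24))  # unscaled features, like raw 1-5 scores
y = rng.integers(0, 3, size=600)

pipe = Pipeline([
    ('scaler', StandardScaler()),  # refit on each fold's training portion only
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced'))
])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipe, X, y, cv=cv, scoring='f1_weighted')
print(f'Weighted F1: {scores.mean():.4f} (+/- {scores.std():.4f})')
```

With only 24 features on a similar 1-5 scale, the leakage here is likely negligible, but the pipeline pattern makes the guarantee structural rather than incidental.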

5.5 Interpretation¶

The Random Forest (57.4%) outperforms the Logistic Regression (43.4%) in overall accuracy, both exceeding the 53% majority-class baseline. The improvement from the linear to the non-linear model suggests that character strength patterns involve interactions — consistent with the constellation framework, where the combination of strengths matters more than any single strength.

The Monoracial Black group remains the most challenging to classify, reflecting the fundamental difficulty of detecting a 6.3% minority class. The Logistic Regression achieves higher recall for this group (0.68) but at the cost of low precision (0.15), meaning it correctly identifies many Monoracial Black participants but also misclassifies many others into this group. This tradeoff illustrates the tension between sensitivity and specificity discussed in Raschka and Mirjalili (2017, ch. 6) and in DATA 303 Module 3 (Ku, 2026).

The confusion matrix reveals a critical finding about how the model processes Black-White biracial identity. Biracial participants are misclassified as Monoracial White (190 cases) far more often than as Monoracial Black (7 cases). This asymmetry means the model effectively treats biracial identity as a variant of White identity — it cannot see the distinctiveness of the biracial psychological profile. This is not a model failure to be optimized away. It is evidence of a measurement design problem: the VIA instrument was developed and validated on predominantly White samples, and the resulting character strength profiles for biracial individuals overlap substantially with the White profile because the instrument was not built to capture what might be psychologically distinct about navigating multiple racial worlds. The model inherits the instrument's blind spot. Following QuantCrit principles (Gillborn et al., 2018), this pattern should be read not as proof that biracial individuals are "like" White individuals, but as an indication that the measurement system lacks the sensitivity to detect the complexity of multiracial psychological experience.

The modest overall accuracies are scientifically meaningful rather than disappointing. Character strengths are psychological constructs with substantial within-group variation — individuals within each racial group express these strengths differently. The fact that 24 psychological features alone, without any demographic or geographic variables, produce above-baseline classification confirms that distinct constellation patterns exist at the group level, even though individual variation is large.

6. Feature Importance: Which Strengths Drive the Prediction?¶

This analysis connects the ML classification back to the substantive research question. If Spirituality, Gratitude, and Hope — the three strengths unique to the Monoracial Black top 10 — appear among the most important features, that confirms these strengths are not merely ranked differently but are statistically differentiating across groups.

Spirituality ranks 1st and Gratitude ranks 2nd in feature importance — the two strengths most distinctive to the Monoracial Black constellation are the two features the classifier relies on most to distinguish between groups. This is consistent with the constellation finding from the statistical analysis: these are not merely descriptive differences but statistically predictive features.

Hope ranks 15th in feature importance despite being the strongest correlate of resilience (r = .459). This distinction is meaningful: Hope is important for resilience outcomes but less important for group classification because all three groups score relatively high on it (Monoracial Black M = 4.10, Biracial M = 3.79, White M = 3.76). Spirituality and Gratitude differentiate more sharply because they appear in only one group's top 10. The difference between "important for outcomes" and "important for classification" is itself a finding — it suggests that interventions targeting resilience (through Hope) may benefit all groups, while identity-conscious approaches must attend to the strengths that are culturally specific.

It is important to state explicitly what this analysis is and is not doing. The fact that Spirituality and Gratitude differentiate monoracial Black participants does not mean Black individuals are "stronger" or "better" on these dimensions. These findings reflect Dr. White's (1984) documentation of specific psychological strengths cultivated within Black communities across generations of navigating systemic oppression, and Dr. Mattis's scholarship on generational hope as an intergenerational resource (Hellman & Mattis, 2023). These are community-sustained resources shaped by lived experience and cultural context — not inherent racial traits. The model detects them because they are real patterns in the data. The question is not whether the model should detect them. The question is what happens when psychometric instruments and AI models are not designed to detect them — particularly for multiracial individuals whose psychological profiles may draw from multiple cultural traditions in ways that monoracial-default instruments cannot capture. If AI models are trained on data produced by instruments that flatten identity, the models will reproduce that flattening. Identity-conscious scales — instruments designed from the ground up to see the complexity of multiracial experience — are needed before AI systems can be trained to see what current instruments miss.

In [13]:
# Extract feature importance from Random Forest
importance_df = pd.DataFrame({
    'Strength': strength_names,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=True)

# Color: hero gold for Spirituality, Gratitude, Hope; context gray for others
constellation_strengths = ['Spirituality', 'Gratitude', 'Hope']
colors = [
    HERO_GOLD if s in constellation_strengths else CONTEXT_MID
    for s in importance_df['Strength']
]

# Visualization
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(importance_df['Strength'], importance_df['Importance'],
        color=colors, edgecolor='none')
ax.set_xlabel('Feature Importance (Gini Impurity)', fontsize=12, color=CONTEXT_DARK)
ax.set_title('Which Character Strengths Predict Racial Group?',
             fontsize=14, fontweight='bold', color=CONTEXT_DARK)
ax.tick_params(colors=CONTEXT_DARK)

# Legend
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor=HERO_GOLD, label='Monoracial Black Constellation (unique to top 10)'),
    Patch(facecolor=CONTEXT_MID, label='Other strengths')
]
ax.legend(handles=legend_elements, loc='lower right', fontsize=10)

plt.tight_layout()
plt.savefig('feature_importance.png', dpi=150, bbox_inches='tight', facecolor='white')
plt.show()

# Print rankings
print('\n=== TOP 10 MOST PREDICTIVE STRENGTHS ===')
top10 = importance_df.sort_values('Importance', ascending=False).head(10)
for i, (_, row) in enumerate(top10.iterrows(), 1):
    marker = ' ◆' if row['Strength'] in constellation_strengths else ''
    print(f"  {i}. {row['Strength']}: {row['Importance']:.4f}{marker}")

print('\n=== CONSTELLATION STRENGTH RANKINGS ===')
ranked = importance_df.sort_values('Importance', ascending=False).reset_index(drop=True)
for s in constellation_strengths:
    rank = ranked[ranked['Strength'] == s].index[0] + 1
    imp = ranked[ranked['Strength'] == s]['Importance'].values[0]
    print(f'  {s}: Rank {rank} (Importance = {imp:.4f})')
[Figure: horizontal bar chart of feature importances; constellation strengths highlighted in gold]
=== TOP 10 MOST PREDICTIVE STRENGTHS ===
  1. Spirituality: 0.0711 ◆
  2. Gratitude: 0.0524 ◆
  3. Humility: 0.0459
  4. Creativity: 0.0440
  5. Perseverance: 0.0439
  6. Appreciation of Beauty & Excellence: 0.0427
  7. Perspective: 0.0426
  8. Self-Regulation: 0.0420
  9. Prudence: 0.0413
  10. Forgiveness: 0.0406

=== CONSTELLATION STRENGTH RANKINGS ===
  Spirituality: Rank 1 (Importance = 0.0711)
  Gratitude: Rank 2 (Importance = 0.0524)
  Hope: Rank 15 (Importance = 0.0388)

Feature Importance Interpretation¶

The feature importance plot reveals which character strengths carry the most discriminative power for classifying racial group membership. Spirituality (rank 1) and Gratitude (rank 2) — two of the three strengths unique to the Monoracial Black top 10 — are the features the classifier relies on most heavily. Hope, the third constellation strength, ranks 15th in classification importance despite being the strongest resilience correlate (r = .459), because all three groups score relatively high on Hope and it therefore does not sharply differentiate between them. The convergence of Spirituality and Gratitude as both constellation-unique strengths and top classification features provides independent empirical support for the constellation framework through a completely different analytical method.

This connects directly to Dr. White's (1984) documentation of spirituality as a core psychological strength in Black communities. The model identifies Spirituality as the single most differentiating feature — not because of inherent racial difference, but because this strength reflects a community-sustained resource shaped by generations of navigating systemic oppression, one that monoracial-default measurement instruments were not designed to detect.
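As a robustness check, it is worth noting that Gini-based importances can be inflated for correlated features; permutation importance, which measures the drop in held-out score when a feature's values are shuffled, is a common complement. A minimal sketch on synthetic data (hypothetical; applying it here would mean passing rf_model, X_test_scaled, and y_test to permutation_importance):

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = rng.normal(size=(800, 5))
# Only feature 0 carries signal; permutation importance should single it out
y = (X[:, 0] > 0).astype(int)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)

rf = RandomForestClassifier(n_estimators=200, random_state=0).fit(X_tr, y_tr)
result = permutation_importance(rf, X_te, y_te, n_repeats=10, random_state=0)

for i, (mean, std) in enumerate(zip(result.importances_mean, result.importances_std)):
    print(f'feature {i}: {mean:.3f} +/- {std:.3f}')
```

If the permutation ranking agreed with the Gini ranking — Spirituality and Gratitude on top — that agreement would further strengthen the constellation finding.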

7. Summary and Connection to Research Program¶

This classification analysis demonstrates that character strength profiles contain sufficient signal to predict racial group membership — but the most important finding is not where the model succeeds. It is where the model fails. The model's systematic misclassification of Black-White biracial participants as Monoracial White reveals that AI systems trained on data from monoracial-default instruments reproduce the erasure built into those instruments.

Key Takeaways¶

  1. The model cannot see multiracial identity: The confusion matrix shows that biracial participants are misclassified as Monoracial White (190 cases) far more often than as Monoracial Black (7 cases), indicating that the biracial strength profile overlaps substantially with the White profile in the data. The strengths that distinguish the Monoracial Black constellation — Spirituality, Gratitude, and Hope — are precisely the strengths absent from the biracial top 10. This speaks to the central question of this research program: if these are community-sustained psychological resources, what happens when they do not transfer across generations of multiracial identity?
  2. This is not a comparison of group worth: The distinct constellations reflect adaptive responses to racialized experiences (Gillborn et al., 2018), not inherent group characteristics. When the model identifies Spirituality and Gratitude as distinguishing features, it is detecting community-sustained resources documented by Dr. White (1984) and Dr. Mattis (Hellman & Mattis, 2023) — not ranking groups against each other.
  3. Classification confirms distinct constellations: Character strength profiles contain sufficient signal to distinguish racial groups, confirming that the patterns identified in the statistical analysis are structurally real.
  4. Class weighting is necessary: Without balancing, the Monoracial Black group (6.3%) would be systematically under-detected. Both SMOTE (demonstrated in R, Statistical Analysis) and class_weight='balanced' (demonstrated here in Python) address this problem through complementary mechanisms.
  5. Feature importance validates the constellation framework: Spirituality (rank 1) and Gratitude (rank 2) — the strengths most distinctive to the Monoracial Black constellation — are the features the classifier relies on most.
  6. The implication for measurement and AI: If a classifier can see these patterns, instruments that cannot see them are losing signal — a design limitation, not a data limitation. More critically, if AI models are trained on data from instruments that were not designed to see multiracial identity, those models will systematically render multiracial people invisible. Identity-conscious scales — instruments designed to capture the complexity of multiracial psychological experience — are needed before AI systems can be trained to see what current instruments miss.
  7. Future direction: Research combining adequate biracial sample sizes with the Brief Resilience Scale and Multiracial Pride Subscale — measures unavailable in the current retrospective dataset — could test whether the strengths predicting Multiracial Pride (Forgiveness, Gratitude, Honesty, Perspective) emerge as a distinct biracial constellation when identity-specific measures are included.

The model's inability to reliably distinguish Black-White biracial participants from monoracial White participants is not a limitation to be resolved through better algorithms. It is a measurement design problem: instruments built on monoracial norms produce data that makes multiracial identity algorithmically invisible. Redesigning the instruments — not retraining the models — is the necessary intervention.

These findings establish the empirical foundation for the broader "Listening to the Margins" research program. The next phase examines whether AI systems similarly fail to detect linguistic patterns associated with multiracial identity, testing whether the same erasure documented in psychological measurement extends to natural language processing.

References¶

Gillborn, D., Warmington, P., & Demack, S. (2018). QuantCrit: Education, policy, 'Big Data' and principles for a critical race theory of statistics. Race Ethnicity and Education, 21(2), 158-179.

Hellman, C., & Mattis, J. S. (Hosts). (2023, December). Why we need hope (No. 265) [Audio podcast episode]. In Speaking of Psychology. American Psychological Association.

Ku, W. (2026). Module 3: Model evaluation [PowerPoint slides]. DATA 303, Spring 2026, Howard University.

Ku, W. (2026). Module 4: Bias, fairness, and ethics in machine learning [PowerPoint slides]. DATA 303, Spring 2026, Howard University.

Martínez-Martí, M. L., & Ruch, W. (2017). Character strengths predict resilience over and above positive affect, self-efficacy, optimism, social support, self-esteem, and life satisfaction. The Journal of Positive Psychology, 12(2), 110-119.

Raschka, S., & Mirjalili, V. (2017). Python machine learning: Machine learning and deep learning with Python, scikit-learn, and TensorFlow (2nd ed.). Packt Publishing.

White, J. L. (1984). The psychology of Blacks: An Afro-American perspective. Prentice Hall.

Acknowledgement of AI Assistance¶

I used Claude (Anthropic) as a coding support tool during this analysis, similar to how I referenced Professor Wai Lim's sample code and Raschka's Python Machine Learning textbook. Claude assisted with Python syntax, code formatting, and structuring the notebook for readability. All research questions, theoretical grounding (Dr. White's framework, Dr. Mattis's scholarship, QuantCrit principles, the Listening to the Margins research program), analytical decisions, model selection rationale, evaluation metric choices, interpretations, and conclusions are my own original work based on my previous master's research and this semester's coursework.
