import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Read the cleaned bike-sharing data from disk.
df = pd.read_csv("Bike_Sharing_Cleaned.csv")
df.head()

# The date column is removed before any features are built from `df`.
df = df.drop(columns=['dteday'])

# `cnt` is the regression target; every remaining column is a feature.
y = df['cnt']
X = df.drop(columns='cnt')

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the four regressors; the tree-based ones are seeded so that
# repeated runs give the same fitted models.
lr_model = LinearRegression()
dt_model = DecisionTreeRegressor(random_state=42)
rf_model = RandomForestRegressor(random_state=42)
gb_model = GradientBoostingRegressor(random_state=42)

# Fit every model on the full-feature training split.
lr_model.fit(X_train, y_train)
dt_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)

# Predict the held-out rows with each fitted model.
y_pred_lr = lr_model.predict(X_test)
y_pred_dt = dt_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)
y_pred_gb = gb_model.predict(X_test)

# Score each full-feature model on the test split (MSE and R²) and collect
# the numbers into one table, one row per model.
_full_predictions = {
    'Linear Regression': y_pred_lr,
    'Decision Tree': y_pred_dt,
    'Random Forest': y_pred_rf,
    'Gradient Boosting': y_pred_gb,
}
results_df = pd.DataFrame({
    'Model': list(_full_predictions),
    'MSE': [
        mean_squared_error(y_test, pred)
        for pred in _full_predictions.values()
    ],
    'R2 Score': [
        r2_score(y_test, pred)
        for pred in _full_predictions.values()
    ],
})
results_df

# Feature importance from the tree-based models
def _plot_top_importances(model, feature_names, model_label, top_n=10):
    """Plot and return the feature importances of a fitted tree-based model.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Names matching the columns the model was trained on.
        model_label: Human-readable model name used in the plot title.
        top_n: Maximum number of features shown in the bar chart.

    Returns:
        DataFrame with ``Feature`` and ``Importance`` columns, sorted from
        most to least important (all features, not only the plotted ones).
    """
    importance = pd.DataFrame({
        'Feature': feature_names,
        'Importance': model.feature_importances_,
    }).sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(10, 3))
    sns.barplot(data=importance.head(top_n), x='Importance', y='Feature')
    plt.title(f'Top {top_n} Feature Importances from {model_label}')
    plt.tight_layout()
    plt.show()
    return importance


# Each model gets its own result variable; previously all three sections
# overwrote `feature_importance_rf`, so it ended up holding the gradient
# boosting importances.
feature_importance_rf = _plot_top_importances(
    rf_model, X.columns, 'Random Forest'
)
feature_importance_dt = _plot_top_importances(
    dt_model, X.columns, 'Decision Tree'
)
feature_importance_gb = _plot_top_importances(
    gb_model, X.columns, 'Gradient Boost'
)

# Pairwise correlations between the numeric columns of `df`.
correlation = df.corr(numeric_only=True)

# Keep only the `cnt` column, ordered from the highest to the lowest value,
# so the heatmap reads as a ranked list.
cnt_correlation = correlation[['cnt']].sort_values(by='cnt', ascending=False)

plt.figure(figsize=(12, 8))
sns.heatmap(cnt_correlation, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation of Features with Count Sharings')
plt.tight_layout()
plt.show()

# Pick the five features most strongly correlated with `cnt`.
# Ranking uses the absolute correlation: a strongly negative relationship is
# as informative as a strongly positive one, and ranking by the signed value
# would let a near-zero positive feature beat a strongly negative one.
top_corr_features = (
    df.corr(numeric_only=True)['cnt']
    .drop('cnt')  # the target is trivially correlated with itself
    .abs()
    .sort_values(ascending=False)
    .head(5)
    .index
    .tolist()
)

# Scatter each selected feature against `cnt`, two plots per row.
# The grid is sized from the number of features so the figure has no band
# of empty rows (a fixed 5x2 grid left half the figure blank for 5 plots).
n_cols = 2
n_rows = max(1, -(-len(top_corr_features) // n_cols))  # ceiling division
plt.figure(figsize=(18, 4 * n_rows))
for position, feature in enumerate(top_corr_features, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.scatterplot(data=df, x=feature, y='cnt', alpha=0.6)
    plt.title(f'{feature} vs cnt')
plt.tight_layout()
plt.show()

# Restrict the inputs to the selected features; the target is unchanged.
y_top5 = df['cnt']
X_top5 = df[top_corr_features]

# Same 80/20 split parameters and seed as the full-feature run.
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(
    X_top5,
    y_top5,
    test_size=0.2,
    random_state=42,
)

# Build the same four model types as before, then fit them on the
# reduced feature set.
lr_5 = LinearRegression()
dt_5 = DecisionTreeRegressor(random_state=42)
rf_5 = RandomForestRegressor(random_state=42)
gb_5 = GradientBoostingRegressor(random_state=42)

lr_5.fit(X_train_5, y_train_5)
dt_5.fit(X_train_5, y_train_5)
rf_5.fit(X_train_5, y_train_5)
gb_5.fit(X_train_5, y_train_5)

# Predict the held-out rows with each reduced-feature model.
y_pred_lr_5 = lr_5.predict(X_test_5)
y_pred_dt_5 = dt_5.predict(X_test_5)
y_pred_rf_5 = rf_5.predict(X_test_5)
y_pred_gb_5 = gb_5.predict(X_test_5)

# Score the reduced-feature models with the same two metrics used for the
# full-feature models.
_top5_predictions = {
    'Linear Regression (Top 5)': y_pred_lr_5,
    'Decision Tree (Top 5)': y_pred_dt_5,
    'Random Forest (Top 5)': y_pred_rf_5,
    'Gradient Boosting (Top 5)': y_pred_gb_5,
}
results_top5 = {
    'Model': list(_top5_predictions),
    'MSE': [
        mean_squared_error(y_test_5, pred)
        for pred in _top5_predictions.values()
    ],
    'R2 Score': [
        r2_score(y_test_5, pred)
        for pred in _top5_predictions.values()
    ],
}

# Stack the full-feature rows on top of the top-5 rows and renumber the
# index so the combined table runs 0..n-1.
top5_df = pd.DataFrame(results_top5)
combined_results = pd.concat([results_df, top5_df], axis=0)
combined_results = combined_results.reset_index(drop=True)

combined_results

# Bar chart: R² score of every row in `combined_results`.
# The palette is applied through `hue='Model'`; the legend is suppressed
# because the x axis already labels each bar.
plt.figure(figsize=(12, 6))
r2_ax = sns.barplot(
    data=combined_results,
    x='Model',
    y='R2 Score',
    hue='Model',
    palette='Blues_d',
    legend=False,
)
r2_ax.set_title('R² Score Comparison of Models (Full vs Top 5 Features)')
r2_ax.set_ylabel('R² Score')
r2_ax.set_xlabel('Model')
# Model names are long, so tilt them to keep the tick labels from overlapping.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Bar chart: mean squared error of every row in `combined_results`
# (full-feature models next to their top-5-feature counterparts).
# The palette is applied through `hue='Model'`; the legend is suppressed
# because the x axis already labels each bar.
plt.figure(figsize=(12, 6))
sns.barplot(
    data=combined_results,
    x='Model',
    y='MSE',
    hue='Model',
    palette='Reds_d',
    legend=False
)
# Title fixed: the reduced models use the top 5 features, not the top 10.
plt.title('MSE Comparison of Models (Full vs Top 5 Features)')
plt.ylabel('Mean Squared Error')
plt.xlabel('Model')
# Model names are long, so tilt them to keep the tick labels from overlapping.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

	dteday	season	mnth	weekday	workingday	weathersit	temp	atemp	hum	windspeed	cnt
0	2011-01-01	1	1	6	0	2	0.344167	0.363625	0.805833	0.160446	985
1	2011-01-02	1	1	0	0	2	0.363478	0.353739	0.696087	0.248539	801
2	2011-01-03	1	1	1	1	1	0.196364	0.189405	0.437273	0.248309	1349
3	2011-01-04	1	1	2	1	1	0.200000	0.212122	0.590435	0.160296	1562
4	2011-01-05	1	1	3	1	1	0.226957	0.229270	0.436957	0.186900	1600

	Model	MSE	R2 Score
0	Linear Regression	7.125135e+05	0.822311
1	Decision Tree	1.017923e+06	0.746146
2	Random Forest	4.618915e+05	0.884812
3	Gradient Boosting	4.289112e+05	0.893036

	Model	MSE	R2 Score
0	Linear Regression	7.125135e+05	0.822311
1	Decision Tree	1.017923e+06	0.746146
2	Random Forest	4.618915e+05	0.884812
3	Gradient Boosting	4.289112e+05	0.893036
4	Linear Regression (Top 5)	9.593554e+05	0.760752
5	Decision Tree (Top 5)	1.171677e+06	0.707803
6	Random Forest (Top 5)	8.388508e+05	0.790804
7	Gradient Boosting (Top 5)	8.414969e+05	0.790144

Assignment 4: Creating Reports and Dashboards for Predictive and Prescriptive Analysis¶

Data Exploration and Preparation¶

Prediction¶

Mario Zamudio (NF1002499)¶

Ahmed Eltahawi¶

Loading Libraries to use¶

Loading Dataset and Basic information¶

Defining the dataset¶

Initializing and Training Regression Models on the Full Dataset¶

Predictions and Evaluation¶

Visualizing the Most Influential Features in the Models¶

Analyzing Feature Relationships with Quantity of Sharings (cnt) Using a Correlation Heatmap¶

Exploring the Top 5 Features Most Correlated with Quantity of Sharings¶

Model Evaluation Using Only the Top 5 Correlated Features¶

Visual Comparison of R² Scores Across Models¶

Visual Comparison of Mean Squared Error (MSE) Across Models¶

Assignment 4: Creating Reports and Dashboards for Predictive and Prescriptive Analysis¶

Data Exploration and Preparation¶

Part 2: Prescriptive Analysis with Bike Sharing Dataset¶

Prediction¶

Mario Zamudio (NF1002499)¶

Ahmed Eltahawi¶

Loading Libraries to use¶

Loading Dataset and Basic information¶

Defining the dataset¶

Initializing and Training Regression Models on the Full Dataset¶

Predictions and Evaluation¶

Visualizing the Most Influential Features in the Models¶

Analyzing Feature Relationships with Quantity of Sharings (cnt) Using a Correlation Heatmap¶

Exploring the Top 5 Features Most Correlated with Quantity of Sharings¶

Model Evaluation Using Only the Top 5 Correlated Features¶

Visual Comparison of R² Scores Across Models¶

Visual Comparison of Mean Squared Error (MSE) Across Models¶