Assignment 4: Creating Reports and Dashboards for Predictive and Prescriptive Analysis¶
Data Exploration and Preparation¶
Part 2: Prescriptive Analysis with Bike Sharing Dataset¶
Preparation¶
Mario Zamudio (NF1002499)¶
University of Niagara Falls Canada
Master of Data Analytics
Data Warehousing/Visualization (CPSC-510-7)
Winter 2025
Ahmed Eltahawi¶
March 25, 2025
Loading Libraries to use¶
In this step, we import all the libraries used throughout the process.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
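The scikit-learn imports above (StandardScaler, OneHotEncoder, ColumnTransformer, Pipeline, SimpleImputer) are not exercised in the cells of this notebook; a minimal sketch of how they could be combined into a preprocessing pipeline for this dataset is shown below. The split of columns into numeric and categorical groups, and the tiny demo frame, are assumptions for illustration only.

```python
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Assumed split of the bike-sharing columns into numeric and categorical groups
numeric_features = ['temp', 'atemp', 'hum', 'windspeed']
categorical_features = ['season', 'weathersit']

# Numeric branch: fill gaps with the median, then standardize
numeric_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

# Categorical branch: fill gaps with the mode, then one-hot encode
categorical_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore')),
])

preprocess = ColumnTransformer([
    ('num', numeric_pipe, numeric_features),
    ('cat', categorical_pipe, categorical_features),
])

# Tiny synthetic frame just to show the transformer runs end to end
demo = pd.DataFrame({
    'temp': [0.34, 0.36, np.nan],
    'atemp': [0.36, 0.35, 0.19],
    'hum': [0.81, 0.70, 0.44],
    'windspeed': [0.16, 0.25, 0.25],
    'season': [1, 1, 2],
    'weathersit': [2, 2, 1],
})
X = preprocess.fit_transform(demo)
print(X.shape)  # 4 scaled numeric columns + one-hot columns for each category
```

In the notebook, `demo` would be replaced by the real feature DataFrame; fitting the transformer once and reusing it keeps the same scaling and encoding across training and scoring data.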
Loading Dataset and Basic information¶
In this section, the data is loaded and some initial preprocessing is performed so that the content of the columns can be reviewed in the next steps.
- Data Loading: Reads a CSV file named Bike Sharing Dataset.csv and loads it into a DataFrame for analysis.
- Shape Inspection: Displays the number of rows and columns in the dataset to understand its size and structure.
- Column Removal: Drops 'instant' (a row identifier with no analytical value) and 'casual'/'registered' (components that sum to the target 'cnt' and would leak information into a model).
- Data Summary: Prints structural information about the dataset, including column names, data types, and non-null counts, helping to confirm data integrity.
- Preview: Shows the first five rows of the dataset to get an overview of the data’s contents and layout.
- Descriptive Statistics: Displays summary statistics (like mean, median, min, max, etc.) for both numerical and categorical variables, rounded to one decimal for better readability.
# Load dataset
df = pd.read_csv("Bike Sharing Dataset.csv")
# Print the shape of the dataset (rows, columns) to understand its size.
print("\n--- Shape of Dataset ---")
print(df.shape)
# Drop unneeded columns
df = df.drop(columns=['instant', 'casual', 'registered'])
# Display basic information about the dataset.
# This includes column names, non-null counts, and data types.
# It's useful for checking the integrity and structure of the data.
print("\n--- Dataset Information ---")
df.info()
# Display the first five rows of the dataset.
# Helps in gaining a quick look at the structure and sample values.
print("\n--- First five rows ---")
print(df.head())
# Summary statistics
# Provides descriptive statistics (count, mean, std, min, max, etc.)
# for both numerical and categorical columns.
# Rounded to 1 decimal for better readability.
print("\n--- Summary Statistics ---")
print(df.describe(include='all').round(1))
--- Shape of Dataset ---
(731, 16)

--- Dataset Information ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   dteday      731 non-null    object
 1   season      731 non-null    int64
 2   yr          731 non-null    int64
 3   mnth        731 non-null    int64
 4   holiday     731 non-null    int64
 5   weekday     731 non-null    int64
 6   workingday  731 non-null    int64
 7   weathersit  731 non-null    int64
 8   temp        731 non-null    float64
 9   atemp       731 non-null    float64
 10  hum         731 non-null    float64
 11  windspeed   731 non-null    float64
 12  cnt         731 non-null    int64
dtypes: float64(4), int64(8), object(1)
memory usage: 74.4+ KB

--- First five rows ---
       dteday  season  yr  mnth  holiday  weekday  workingday  weathersit      temp     atemp       hum  windspeed   cnt
0  2011-01-01       1   0     1        0        6           0           2  0.344167  0.363625  0.805833   0.160446   985
1  2011-01-02       1   0     1        0        0           0           2  0.363478  0.353739  0.696087   0.248539   801
2  2011-01-03       1   0     1        0        1           1           1  0.196364  0.189405  0.437273   0.248309  1349
3  2011-01-04       1   0     1        0        2           1           1  0.200000  0.212122  0.590435   0.160296  1562
4  2011-01-05       1   0     1        0        3           1           1  0.226957  0.229270  0.436957   0.186900  1600

--- Summary Statistics ---
            dteday  season     yr   mnth  holiday  weekday  workingday  weathersit   temp  atemp    hum  windspeed     cnt
count          731   731.0  731.0  731.0    731.0    731.0       731.0       731.0  731.0  731.0  731.0      731.0   731.0
unique         731     NaN    NaN    NaN      NaN      NaN         NaN         NaN    NaN    NaN    NaN        NaN     NaN
top     2011-01-01     NaN    NaN    NaN      NaN      NaN         NaN         NaN    NaN    NaN    NaN        NaN     NaN
freq             1     NaN    NaN    NaN      NaN      NaN         NaN         NaN    NaN    NaN    NaN        NaN     NaN
mean           NaN     2.5    0.5    6.5      0.0      3.0         0.7         1.4    0.5    0.5    0.6        0.2  4504.3
std            NaN     1.1    0.5    3.5      0.2      2.0         0.5         0.5    0.2    0.2    0.1        0.1  1937.2
min            NaN     1.0    0.0    1.0      0.0      0.0         0.0         1.0    0.1    0.1    0.0        0.0    22.0
25%            NaN     2.0    0.0    4.0      0.0      1.0         0.0         1.0    0.3    0.3    0.5        0.1  3152.0
50%            NaN     3.0    1.0    7.0      0.0      3.0         1.0         1.0    0.5    0.5    0.6        0.2  4548.0
75%            NaN     3.0    1.0   10.0      0.0      5.0         1.0         2.0    0.7    0.6    0.7        0.2  5956.0
max            NaN     4.0    1.0   12.0      1.0      6.0         1.0         3.0    0.9    0.8    1.0        0.5  8714.0
Checking Missing Values¶
After the first review of the data, this step identifies whether there are any missing values in the dataset, which is crucial for data cleaning.
# Check for missing values
# Identifies if there are any missing values in the dataset, which is crucial for data cleaning.
print("\nMissing Values:")
print(df.isnull().sum())
Missing Values:
dteday        0
season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
cnt           0
dtype: int64
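No column has missing values here, so no imputation is required. Had gaps appeared, the already-imported SimpleImputer could fill them; the sketch below uses a toy frame with a deliberate gap (the column names are assumptions for illustration).

```python
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Toy frame with a deliberate gap in 'hum' to demonstrate imputation
toy = pd.DataFrame({
    'hum': [0.80, np.nan, 0.44],
    'windspeed': [0.16, 0.25, 0.25],
})

# Replace each missing value with the column median
imputer = SimpleImputer(strategy='median')
filled = pd.DataFrame(imputer.fit_transform(toy), columns=toy.columns)
print(filled['hum'].isnull().sum())  # 0 after imputation
```

For this dataset the median strategy would be a reasonable default, since several numeric columns (e.g. windspeed) are skewed and the median is robust to that.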
Reviewing Distribution of Numerical Variables¶
In this piece of code, we inspect the distributions and relationships of the numeric variables to detect trends, outliers, and correlations that inform the next steps of the process.
- Identifying Numeric Columns: Selects all columns in the dataset that contain numerical data types to prepare them for visualization.
- Creating Histograms: Generates histograms for each numeric feature to show the distribution of their values (e.g., how frequently values occur within certain ranges). This helps in detecting skewness, peaks, or gaps in the data.
- Adjusting Layout and Titles: Sets a title for the collection of histograms, arranges the plots to fit nicely within the figure, and displays them.
- Generating Correlation Heatmap: Creates a heatmap that visualizes the correlation matrix of the dataset, showing the strength and direction of relationships between numeric features.
- Styling the Heatmap: Applies a color scheme to indicate correlation strength and removes internal annotations to keep the visual clean.
- Displaying the Heatmap: Presents the finalized heatmap chart to help identify patterns or multicollinearity among features.
# Visualizing distributions of numerical features
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
df[numeric_cols].hist(bins=20, figsize=(15, 10))
plt.suptitle('Distributions of Numeric Features')
plt.tight_layout()
plt.show()
# Correlation heatmap
plt.figure(figsize=(15, 15))
sns.heatmap(df[numeric_cols].corr(), annot=False, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()
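The heatmap can be complemented with a numeric ranking of how strongly each feature correlates with the target `cnt`. The sketch below uses a small synthetic frame so it runs standalone; in the notebook, `df[numeric_cols]` would take its place.

```python
import numpy as np
import pandas as pd

# Synthetic stand-in for df[numeric_cols]; in the notebook use the real DataFrame
rng = np.random.default_rng(0)
temp = rng.random(100)
demo = pd.DataFrame({
    'temp': temp,
    'windspeed': rng.random(100),
    'cnt': 3000 * temp + rng.normal(0, 100, 100),  # cnt driven mostly by temp
})

# Absolute correlation with the target, strongest first (target itself dropped)
corr_with_cnt = demo.corr()['cnt'].drop('cnt').abs().sort_values(ascending=False)
print(corr_with_cnt)
```

Sorting by absolute value surfaces both strong positive and strong negative relationships, which is useful when deciding which predictors to keep for modeling.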
Data Cleaning¶
The following steps clean the data and export the cleaned dataset to a file for future use.
Boxplots Before Outlier Handling:¶
- Creates boxplots for all numeric features to visually inspect their value distributions and detect outliers.
- Organizes plots in a grid layout for clarity.
- Helps understand which variables have extreme values that may need correction.
Outlier Detection and Treatment (IQR Method):¶
- For each numeric column, calculates the interquartile range (IQR).
- Defines lower and upper bounds to identify outliers.
- Clips values outside these bounds to the nearest acceptable values (winsorization).
- Reduces the influence of extreme values on the analysis or modeling.
Boxplots After Outlier Handling:¶
- Recreates the boxplots to show the new distribution of numeric features after outliers have been handled.
- Useful for visual comparison with the earlier boxplots.
Dataset Summary After Cleaning:¶
Displays structure and data types of the final cleaned dataset.
Saving the Cleaned Dataset:¶
Writes the cleaned DataFrame to a CSV file for future use or modeling.
# Boxplot diagrams before handling outliers
print("\n--- Boxplots Before Handling Outliers ---")
num_plots = len(numeric_cols)
cols = 4
rows = (num_plots + cols - 1) // cols
plt.figure(figsize=(5 * cols, 4 * rows))
for i, col in enumerate(numeric_cols):
    plt.subplot(rows, cols, i + 1)
    sns.boxplot(y=df[col])
    plt.title(col, fontsize=10)
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Handling outliers with the IQR method for numeric features
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    if not outliers.empty:
        print(f"{col}: {len(outliers)} outliers detected")
        # Clip outliers to the IQR bounds (winsorization)
        df[col] = df[col].clip(lower_bound, upper_bound)
# Boxplot diagrams after handling outliers
print("\n--- Boxplots After Handling Outliers ---")
plt.figure(figsize=(5 * cols, 4 * rows))
for i, col in enumerate(numeric_cols):
    plt.subplot(rows, cols, i + 1)
    sns.boxplot(y=df[col])
    plt.title(col, fontsize=10)
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
print("\n--- Dataset After Cleaning ---")
print(df.info())
# Save cleaned dataset
cleaned_file_path = 'Bike_Sharing_Cleaned.csv'
df.to_csv(cleaned_file_path, index=False)
print(f"\nCleaned dataset saved to: {cleaned_file_path}")
--- Boxplots Before Handling Outliers ---
holiday: 21 outliers detected
hum: 2 outliers detected
windspeed: 13 outliers detected

--- Boxplots After Handling Outliers ---
--- Dataset After Cleaning ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   dteday      731 non-null    object
 1   season      731 non-null    int64
 2   yr          731 non-null    int64
 3   mnth        731 non-null    int64
 4   holiday     731 non-null    int64
 5   weekday     731 non-null    int64
 6   workingday  731 non-null    int64
 7   weathersit  731 non-null    int64
 8   temp        731 non-null    float64
 9   atemp       731 non-null    float64
 10  hum         731 non-null    float64
 11  windspeed   731 non-null    float64
 12  cnt         731 non-null    int64
dtypes: float64(4), int64(8), object(1)
memory usage: 74.4+ KB
None

Cleaned dataset saved to: Bike_Sharing_Cleaned.csv