Group 4 - Winter 2025 Predictive Analytics (DAMO-510-6)¶
- Carlos Borda (NF1003721)
- Renato Leiva (NF1000920)
- Mario Zamudio (NF1002499)
Dataset Sources¶
https://toronto.weatherstats.ca/
https://data.torontopolice.on.ca/datasets/bc4c72a793014a55a674984ef175a6f3_0/explore?location=6.523481%2C-39.819624%2C2.63
Dataset Description¶
- OCC_DATE.1 - The date of the occurrence of the traffic collision.
- Year - The year in which the collision occurred.
- NEIGHBOURHOOD_158 - The neighborhood where the traffic collision happened, with its corresponding ID.
- avg_temperature - The average temperature (likely in degrees Celsius) on the day of the collision.
- 0 or 1 snow - A binary indicator (0 or 1) showing whether there was snowfall on the given day.
- avg_wind_speed - The average wind speed (likely in km/h) recorded on that day.
- avg_visibility - The average visibility (likely in meters) on the day of the collision.
- snow - The amount of snowfall (likely in cm) recorded on that day.
- snow_on_ground - The amount of snow accumulated on the ground (likely in cm).
- Number of events - The number of traffic collision events recorded on that day in the specified neighborhood.
- Longitud - The longitude coordinate of the location where the collision occurred (the column name keeps its source spelling).
- Latitude - The latitude coordinate of the location where the collision occurred.
- AVG_Precipitation - The average precipitation (likely in mm) recorded on that day.
- Day_of_the_week - The day of the week (e.g., Monday, Tuesday) when the collision occurred.
Dataset Preparation¶
This script sets up an environment for data analysis, preprocessing, modeling, and evaluation using several regression models, including linear regression, random forest, and gradient boosting (support vector regression is imported below but ultimately not trained). It also incorporates visualization, statistical analysis, and model performance metrics to assess predictions.
Imports and Definitions¶
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib
import statsmodels.api as sm
import scipy.stats as stats
import sklearn
from sklearn import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
Load and Initial Configuration¶
- Suppresses unnecessary warnings.
- Loads traffic collision data from a CSV file.
- Inspects the dataset’s structure and data types.
- Cleans column names for easier usage.
- Displays a preview of the dataset.
# Ignore specific warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
# Load the dataset
df = pd.read_csv("Events by Day Traffic Colission Toronto5.csv")
# check the data types of the data
df.info()
# Rename columns to remove spaces and convert to uppercase
df.columns = [col.strip().upper().replace(" ", "_") for col in df.columns]
# Print data sample
df
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75089 entries, 0 to 75088
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   OCC_DATE.1         75089 non-null  object
 1   Year               75089 non-null  int64
 2   NEIGHBOURHOOD_158  75089 non-null  object
 3   avg_temperature    75089 non-null  float64
 4   0 or 1 snow        75089 non-null  int64
 5   avg_wind_speed     75089 non-null  float64
 6   avg_visibility     75089 non-null  int64
 7   snow               74708 non-null  float64
 8   snow_on_ground     75089 non-null  int64
 9   Number of events   75089 non-null  int64
 10  Longitud           75089 non-null  float64
 11  Latitude           75089 non-null  float64
 12  AVG_Precipitation  74862 non-null  float64
 13  Day_of_the_week    75089 non-null  object
dtypes: float64(6), int64(5), object(3)
memory usage: 8.0+ MB
|  | OCC_DATE.1 | YEAR | NEIGHBOURHOOD_158 | AVG_TEMPERATURE | 0_OR_1_SNOW | AVG_WIND_SPEED | AVG_VISIBILITY | SNOW | SNOW_ON_GROUND | NUMBER_OF_EVENTS | LONGITUD | LATITUDE | AVG_PRECIPITATION | DAY_OF_THE_WEEK |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022-01-01 | 2022 | St Lawrence-East Bayfront-The Islands (166) | 0.64 | 1 | 18.0 | 12250 | 3.0 | 3 | 5 | -79.375417 | 43.643198 | 2.4 | Saturday |
| 1 | 2022-01-01 | 2022 | Alderwood (20) | 0.64 | 1 | 18.0 | 12250 | 3.0 | 3 | 1 | -79.548316 | 43.609070 | 2.4 | Saturday |
| 2 | 2022-01-01 | 2022 | South Riverdale (70) | 0.64 | 1 | 18.0 | 12250 | 3.0 | 3 | 2 | -79.337223 | 43.662688 | 2.4 | Saturday |
| 3 | 2022-01-01 | 2022 | Kingsview Village-The Westway (6) | 0.64 | 1 | 18.0 | 12250 | 3.0 | 3 | 1 | -79.552151 | 43.695790 | 2.4 | Saturday |
| 4 | 2022-01-01 | 2022 | Yonge-St.Clair (97) | 0.64 | 1 | 18.0 | 12250 | 3.0 | 3 | 1 | -79.401513 | 43.686701 | 2.4 | Saturday |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 75084 | 2024-06-30 | 2024 | Englemount-Lawrence (32) | 17.70 | 0 | 26.5 | 15250 | 0.0 | 0 | 1 | -79.434161 | 43.729214 | 0.6 | Sunday |
| 75085 | 2024-06-30 | 2024 | Danforth (66) | 17.70 | 0 | 26.5 | 15250 | 0.0 | 0 | 2 | -79.325398 | 43.682977 | 0.6 | Sunday |
| 75086 | 2024-06-30 | 2024 | Lansing-Westgate (38) | 17.70 | 0 | 26.5 | 15250 | 0.0 | 0 | 1 | -79.432825 | 43.756630 | 0.6 | Sunday |
| 75087 | 2024-06-30 | 2024 | Kingsview Village-The Westway (6) | 17.70 | 0 | 26.5 | 15250 | 0.0 | 0 | 1 | -79.545151 | 43.697425 | 0.6 | Sunday |
| 75088 | 2024-06-30 | 2024 | Yorkdale-Glen Park (31) | 17.70 | 0 | 26.5 | 15250 | 0.0 | 0 | 1 | -79.451306 | 43.723657 | 0.6 | Sunday |

75089 rows × 14 columns
Data Cleaning¶
- Removes unnecessary or redundant columns (e.g., correlated weather variables, date-related columns).
- Identifies missing values before cleaning.
- Replaces missing snowfall (SNOW) values with 0 instead of dropping them.
- Deletes all other rows with missing values to ensure a complete dataset.
- Confirms that missing values are eliminated after cleaning.
# Remove the highly correlated variable AVG_PRECIPITATION and non-feature columns
if 'AVG_PRECIPITATION' in df.columns:
    df.drop(columns=['AVG_PRECIPITATION'], inplace=True)
if 'YEAR' in df.columns:
    df.drop(columns=['YEAR'], inplace=True)
if 'OCC_DATE.1' in df.columns:
    df.drop(columns=['OCC_DATE.1'], inplace=True)
if 'DAY_OF_THE_WEEK' in df.columns:
    df.drop(columns=['DAY_OF_THE_WEEK'], inplace=True)
if 'NEIGHBOURHOOD_158' in df.columns:
    df.drop(columns=['NEIGHBOURHOOD_158'], inplace=True)
# Identify columns with missing values before cleaning
missing_values = df.isnull().sum()
columns_with_na = missing_values[missing_values > 0].index.tolist()
print("Columns with missing values before cleaning:", columns_with_na)
# Replace null values in the SNOW column with 0 (plain assignment avoids
# the chained inplace-fillna pattern, which is deprecated in recent pandas)
df['SNOW'] = df['SNOW'].fillna(0)
# Drop any remaining rows with missing values
df.dropna(inplace=True)
missing_values = df.isnull().sum()
columns_with_na = missing_values[missing_values > 0].index.tolist()
print("Columns with missing values after cleaning:", columns_with_na)
Columns with missing values before cleaning: ['SNOW']
Columns with missing values after cleaning: []
Data Exploration¶
Dataset description¶
- count → Number of non-null values.
- mean → Average of the values.
- std → Standard deviation (spread of the data).
- min → Minimum value.
- 25% (Q1) → First quartile (25% of data is below this value).
- 50% (median, Q2) → Middle value of the dataset.
- 75% (Q3) → Third quartile (75% of data is below this value).
- max → Maximum value.
Use Cases¶
- Identify outliers (e.g., very high max values); a sketch of one such check follows the summary below.
- Check data distribution (e.g., if mean ≠ median, the data may be skewed).
- Verify missing values (if count is lower than expected).
print(df.describe())
       AVG_TEMPERATURE   0_OR_1_SNOW  AVG_WIND_SPEED  AVG_VISIBILITY  \
count     75089.000000  75089.000000    75089.000000    75089.000000
mean          9.396603      0.087217       16.551332    20095.814833
std          10.068853      0.282154        6.417941     4634.173968
min         -17.100000      0.000000        5.000000     3200.000000
25%           1.250000      0.000000       11.500000    15250.000000
50%           9.600000      0.000000       15.500000    21700.000000
75%          18.450000      0.000000       20.500000    24100.000000
max          28.650000      1.000000       47.000000    28150.000000

               SNOW  SNOW_ON_GROUND  NUMBER_OF_EVENTS      LONGITUD  \
count  75089.000000    75089.000000      75089.000000  75089.000000
mean       0.398514        1.420967          1.787159    -79.394091
std        1.934940        4.330962          1.213243      0.100299
min        0.000000        0.000000          1.000000    -79.639247
25%        0.000000        0.000000          1.000000    -79.466036
50%        0.000000        0.000000          1.000000    -79.397319
75%        0.000000        0.000000          2.000000    -79.324371
max       29.000000       32.000000         29.000000    -79.122044

           LATITUDE
count  75089.000000
mean      43.710761
std        0.053506
min       43.586487
25%       43.665873
50%       43.706910
75%       43.757230
max       43.844027
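The outlier check mentioned above can also be scripted. The following is a minimal sketch (not part of the original pipeline) that applies the common 1.5×IQR rule to each numeric column; the 1.5 multiplier is a general heuristic, not a project-specific choice.

# Count potential outliers per numeric column with the 1.5*IQR rule
for col in df.select_dtypes(include=['number']).columns:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    n_outliers = ((df[col] < lower) | (df[col] > upper)).sum()
    print(f"{col}: {n_outliers} values outside [{lower:.2f}, {upper:.2f}]")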
Basic Visualizations¶
- Creates histograms for up to 12 numerical columns.
- Uses Seaborn’s histplot to visualize the distribution of each variable.
- Includes a KDE curve for a smoothed density estimate.
- Organizes charts in a 4x3 grid for readability.
- Helps identify skewness, normality, and outliers in the dataset (skewness is quantified in the sketch after the plotting code).
# Visualization of histograms to validate variable distribution
numeric_columns = df.select_dtypes(include=['number']).columns
plt.figure(figsize=(15, 12))
for i, col in enumerate(numeric_columns[:12]):
    plt.subplot(4, 3, i + 1)
    sns.histplot(df[col], kde=True, bins=30)
    plt.title(f'Distribution of {col}')
plt.tight_layout()
plt.show()
Correlation Analysis¶
- Extracts numeric columns and removes missing values.
- Checks if data is available before proceeding.
- Computes the correlation matrix between all numeric variables.
- Generates a heatmap to visualize relationships between variables.
- Helps identify strongly correlated features, which may be useful for feature selection in machine learning (a sketch for extracting such pairs follows the heatmap code below).
# Check correlation between variables and generate a heatmap
numeric_df = df.select_dtypes(include=['number']).dropna()
if not numeric_df.empty:
    plt.figure(figsize=(12, 8))
    sns.heatmap(numeric_df.corr(), annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5)
    plt.title("Feature Correlation Heatmap")
    plt.show()
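To turn the heatmap into a concrete feature-selection step, a sketch like the one below lists the most strongly correlated pairs; the 0.7 cutoff is an illustrative assumption, not a threshold used elsewhere in this analysis.

# List feature pairs with absolute correlation above a chosen threshold
corr = numeric_df.corr().abs()
upper_triangle = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
strong_pairs = upper_triangle.stack()
print(strong_pairs[strong_pairs > 0.7].sort_values(ascending=False))  # 0.7 is illustrative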
QQ Plots¶
- Selects numerical columns for normality testing.
- Generates Q-Q plots to visually check normality.
- Applies the Shapiro-Wilk test for a statistical normality check.
- Prints results indicating whether each column follows a normal distribution.
Use Cases¶
- Identifying normal vs. non-normal data (important for choosing statistical tests or ML models).
- Detecting skewness and outliers.
- Preprocessing decisions (e.g., applying log transformations if data is non-normal).
# Visualize distributions of numerical features with normality checks
numerical_cols = df.select_dtypes(include=['number']).columns
for col in numerical_cols:
    plt.figure(figsize=(12, 6))
    # Q-Q plot (drawn in the right half of the figure)
    plt.subplot(1, 2, 2)
    stats.probplot(df[col], dist="norm", plot=plt)
    plt.title(f'Q-Q Plot of {col}')
    plt.tight_layout()
    plt.show()

    # Shapiro-Wilk test for normality
    shapiro_test = stats.shapiro(df[col].dropna())  # Drop NA values for the test
    print(f"Shapiro-Wilk Test for {col}:")
    print(f"Statistic: {shapiro_test.statistic:.3f}, p-value: {shapiro_test.pvalue:.3f}")
    if shapiro_test.pvalue > 0.05:
        print(f"The data for '{col}' likely comes from a normal distribution.")
    else:
        print(f"The data for '{col}' likely does not come from a normal distribution.")
    print("---")
Shapiro-Wilk Test for AVG_TEMPERATURE:
Statistic: 0.968, p-value: 0.000
The data for 'AVG_TEMPERATURE' likely does not come from a normal distribution.
---
Shapiro-Wilk Test for 0_OR_1_SNOW:
Statistic: 0.316, p-value: 0.000
The data for '0_OR_1_SNOW' likely does not come from a normal distribution.
---
Shapiro-Wilk Test for AVG_WIND_SPEED:
Statistic: 0.956, p-value: 0.000
The data for 'AVG_WIND_SPEED' likely does not come from a normal distribution.
---
Shapiro-Wilk Test for AVG_VISIBILITY:
Statistic: 0.793, p-value: 0.000
The data for 'AVG_VISIBILITY' likely does not come from a normal distribution.
---
Shapiro-Wilk Test for SNOW:
Statistic: 0.212, p-value: 0.000
The data for 'SNOW' likely does not come from a normal distribution.
---
Shapiro-Wilk Test for SNOW_ON_GROUND:
Statistic: 0.378, p-value: 0.000
The data for 'SNOW_ON_GROUND' likely does not come from a normal distribution.
---
Shapiro-Wilk Test for NUMBER_OF_EVENTS:
Statistic: 0.674, p-value: 0.000
The data for 'NUMBER_OF_EVENTS' likely does not come from a normal distribution.
---
Shapiro-Wilk Test for LONGITUD:
Statistic: 0.990, p-value: 0.000
The data for 'LONGITUD' likely does not come from a normal distribution.
---
Shapiro-Wilk Test for LATITUDE:
Statistic: 0.974, p-value: 0.000
The data for 'LATITUDE' likely does not come from a normal distribution.
---
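One caveat on the tests above: SciPy's documentation notes that for samples larger than about 5,000 observations the Shapiro-Wilk p-value may not be accurate, and this dataset has roughly 75,000 rows. A sketch of a cross-check with D'Agostino-Pearson's normaltest, which is better suited to large samples:

# Cross-check normality with D'Agostino-Pearson's test (valid at large N)
for col in numerical_cols:
    stat, p_value = stats.normaltest(df[col].dropna())
    print(f"normaltest for {col}: statistic={stat:.3f}, p-value={p_value:.3f}")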
Regression Analysis¶
- Defines independent (X) and dependent (y) variables.
- Ensures data alignment by keeping only matching rows.
- Adds a constant for regression.
- Performs multiple linear regression using statsmodels.
- Handles cases where there are not enough data points.
Use Cases¶
- Understanding the impact of multiple factors on NUMBER_OF_EVENTS.
- Identifying significant predictors using p-values.
- Assessing the model's performance via R-squared.
- Optimizing feature selection to improve prediction accuracy.
# Define independent variables X and dependent variable y
y = pd.to_numeric(df['NUMBER_OF_EVENTS'], errors='coerce')
X = df.drop(columns=['NUMBER_OF_EVENTS'], errors='ignore')
# Align the rows of X and y
X, y = X.align(y, join='inner', axis=0)
# Multiple regression analysis
X_const = sm.add_constant(X)
if X_const.shape[0] > X_const.shape[1]:
    model = sm.OLS(y, X_const).fit()
    print(model.summary())
else:
    print("Not enough data points for multiple regression analysis.")
                            OLS Regression Results
==============================================================================
Dep. Variable:       NUMBER_OF_EVENTS   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     63.94
Date:                Mon, 10 Mar 2025   Prob (F-statistic):          5.64e-105
Time:                        20:30:06   Log-Likelihood:            -1.2081e+05
No. Observations:               75089   AIC:                         2.416e+05
Df Residuals:                   75080   BIC:                         2.417e+05
Df Model:                           8
Covariance Type:            nonrobust
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const             -20.2314      6.785     -2.982      0.003     -33.531      -6.932
AVG_TEMPERATURE     0.0030      0.001      5.610      0.000       0.002       0.004
0_OR_1_SNOW         0.0681      0.023      2.941      0.003       0.023       0.113
AVG_WIND_SPEED     -0.0011      0.001     -1.586      0.113      -0.003       0.000
AVG_VISIBILITY  -1.274e-06   1.08e-06     -1.183      0.237   -3.38e-06    8.36e-07
SNOW                0.0337      0.003     10.999      0.000       0.028       0.040
SNOW_ON_GROUND      0.0025      0.001      2.117      0.034       0.000       0.005
LONGITUD            0.2118      0.049      4.292      0.000       0.115       0.309
LATITUDE            0.8883      0.093      9.603      0.000       0.707       1.070
==============================================================================
Omnibus:                    45208.571   Durbin-Watson:                   1.686
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           726301.185
Skew:                           2.613   Prob(JB):                         0.00
Kurtosis:                      17.312   Cond. No.                     3.17e+07
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.17e+07. This might indicate that there are
strong multicollinearity or other numerical problems.
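The notes above flag a large condition number, suggesting multicollinearity. A sketch that quantifies this with variance inflation factors (VIF); the rule of thumb that VIF above roughly 10 signals problematic collinearity is a convention, not a result from this analysis.

# Compute VIF for each predictor in the design matrix
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame({
    "feature": X_const.columns,
    "VIF": [variance_inflation_factor(X_const.values, i)
            for i in range(X_const.shape[1])]
})
print(vif)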
Model Preparation and Training¶
Splitting the Dataset¶
- Ensures that the model is trained on one part of the data (X_train, y_train)
- Tests the model’s performance on unseen data (X_test, y_test)
- Prevents overfitting, ensuring that the model generalizes well to new data
- Uses a random seed (random_state=42) to make the split consistent across runs
Use Cases¶
- Machine learning model training (e.g., regression, classification).
- Model evaluation using testing data.
- Hyperparameter tuning with validation sets (see the sketch after the split below).
# Splitting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
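If a validation set is needed for hyperparameter tuning, it can be carved out of the training split. A sketch, where the 0.25 fraction (yielding roughly a 60/20/20 split overall) is an illustrative choice rather than part of the original pipeline:

# Split the training data again to obtain a validation set
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)
print(f"train: {len(X_tr)}, validation: {len(X_val)}, test: {len(X_test)}")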
Defining Models to Train and Test¶
- Defines three different machine learning models (Random Forest, Gradient Boosting, Linear Regression).
- Uses optimized hyperparameters for better performance.
- Prepares for model training and evaluation.
- Initializes an empty results dictionary to store model performance metrics.
# Training multiple models with optimized parameters
models = {
    "Random Forest": RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42),
    "Linear Regression": LinearRegression()
}
results = {}
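As one way such hyperparameters could be selected, the sketch below runs a cross-validated grid search over Random Forest settings; the grid values are illustrative assumptions, not the search actually used to pick the parameters above.

# Cross-validated grid search over a small, illustrative parameter grid
from sklearn.model_selection import GridSearchCV

param_grid = {"n_estimators": [100, 200], "max_depth": [5, 10]}
search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid,
                      cv=3, scoring="neg_root_mean_squared_error")
search.fit(X_train, y_train)
print("Best parameters:", search.best_params_)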
Function to Evaluate the Models¶
- Computes regression metrics (MSE, RMSE, R²) to evaluate performance.
- Creates a scatter plot to visualize predicted vs. actual values.
- Discretizes continuous predictions into bins to analyze accuracy.
- Generates a confusion matrix for regression by comparing bins.
- Uses a heatmap to display the confusion matrix.
Use Cases¶
- Evaluating regression models to check their predictive performance.
- Visualizing prediction errors (scatter plot, confusion matrix).
- Comparing models to see which has the lowest error.
def evaluate_model(model, X_test, y_test):
    num_bins = 4
    y_pred = model.predict(X_test)

    # Regression metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    print(f'Mean Squared Error (MSE): {mse:.4f}')
    print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')
    print(f'R-squared (R2): {r2:.4f}')

    # Scatter plot of predicted vs. actual values
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_pred)
    plt.xlabel("Actual Values")
    plt.ylabel("Predicted Values")
    plt.title("Actual vs. Predicted Values")
    plt.show()

    # Discretize predictions and actual values into quantile bins;
    # duplicates='drop' handles non-unique bin edges
    from sklearn.metrics import confusion_matrix
    y_test_bins = pd.qcut(y_test, q=num_bins, labels=False, duplicates='drop')
    y_pred_bins = pd.qcut(y_pred, q=num_bins, labels=False, duplicates='drop')

    # Create and plot the confusion matrix over the bins
    cm = confusion_matrix(y_test_bins, y_pred_bins)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.xlabel("Predicted Bins")
    plt.ylabel("Actual Bins")
    plt.title("Confusion Matrix for Regression")
    plt.show()
Model Evaluation¶
- Trains multiple models on the training dataset.
- Evaluates each model using regression metrics, scatter plots, and a confusion matrix.
- Saves each trained model as a .pkl file.
- Stores the RMSE for each model to compare their performance.
Use Cases¶
- Comparing different models (Random Forest vs. Gradient Boosting vs. Linear Regression).
- Selecting the best model based on RMSE.
- Saving trained models to use later without retraining.
# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    print(f"\n{name} Model Evaluation")
    evaluate_model(model, X_test, y_test)
    joblib.dump(model, f'{name.replace(" ", "_").lower()}_model.pkl')
    results[name] = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))  # Store RMSE
Random Forest Model Evaluation
Mean Squared Error (MSE): 0.9550
Root Mean Squared Error (RMSE): 0.9772
R-squared (R2): 0.3503

Gradient Boosting Model Evaluation
Mean Squared Error (MSE): 1.1241
Root Mean Squared Error (RMSE): 1.0602
R-squared (R2): 0.2353

Linear Regression Model Evaluation
Mean Squared Error (MSE): 1.4583
Root Mean Squared Error (RMSE): 1.2076
R-squared (R2): 0.0080
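mean_absolute_error is imported at the top but never used; a sketch that adds MAE alongside RMSE, since MAE is less sensitive to occasional large errors:

# Report MAE for each trained model as a complementary metric
for name, model in models.items():
    mae = mean_absolute_error(y_test, model.predict(X_test))
    print(f"{name} MAE: {mae:.4f}")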
Model Comparison¶
- Creates a bar chart comparing RMSE values for different models.
- Uses different colors for better visualization.
- Labels axes & rotates model names for clarity.
- Helps identify the best model (the one with the lowest RMSE).
Use Cases¶
- Selecting the best model based on RMSE.
- Visualizing model performance differences.
- Determining whether additional tuning is needed.
# Model comparison
plt.figure(figsize=(8, 5))
plt.bar(results.keys(), results.values(), color=['blue', 'green', 'red'])
plt.xlabel("Model")
plt.ylabel("RMSE")
plt.title("Model Performance Comparison (RMSE)")
plt.xticks(rotation=45)
plt.show()
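The winner visible in the chart can also be selected programmatically; a short sketch over the results dictionary:

# Pick the model with the lowest stored RMSE
best_model = min(results, key=results.get)
print(f"Best model by RMSE: {best_model} ({results[best_model]:.4f})")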
Using the Models¶
- Randomly selects 5 test samples from X_test.
- Prints the selected sample data for verification.
- Loads each trained model from its saved .pkl file.
- Generates and prints predictions from each model.
Use Cases¶
- Testing model predictions on unseen data.
- Comparing different model outputs on the same sample.
- Verifying that saved models work correctly after being loaded.
# Test with sample input data
random_indices = np.random.choice(len(X_test), 5, replace=False)
sample_data = X_test.iloc[random_indices]
print("Sample input data:")
print(sample_data)
for name, model in models.items():
    loaded_model = joblib.load(f'{name.replace(" ", "_").lower()}_model.pkl')
    predictions = loaded_model.predict(sample_data)
    print(f"\nPredictions from {name}:", predictions)
Sample input data:
       AVG_TEMPERATURE  0_OR_1_SNOW  AVG_WIND_SPEED  AVG_VISIBILITY  SNOW  \
10817            17.70            0            13.5           24100   0.0
17820            21.70            0            11.0           20100   0.0
9311             10.70            0            16.5           24100   0.0
34908             2.15            0            28.0           24100   0.0
1295            -13.75            0            18.0           24100   0.0

       SNOW_ON_GROUND   LONGITUD   LATITUDE
10817               0 -79.449384  43.676476
17820               0 -79.401234  43.660649
9311                0 -79.297861  43.776520
34908               1 -79.560010  43.646340
1295               25 -79.459948  43.674131

Predictions from Random Forest: [1.61907005 1.6215865  1.67272156 1.13491531 1.6276811 ]

Predictions from Gradient Boosting: [1.56582782 1.59356405 2.44570647 1.27388694 1.64345978]

Predictions from Linear Regression: [1.74406004 1.76640872 1.84151488 1.62656155 1.68973751]