import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
# Read the iris CSV file into a pandas DataFrame; the later steps all work on `df`.
df = pd.read_csv("iris.csv")

# Dataset overview
# DataFrame.info() writes its report (column names, dtypes, non-null counts)
# straight to stdout, so it is called directly rather than wrapped in print().
print("Dataset Information:")
df.info()

# Preview and descriptive statistics
# head() shows the first five rows; describe() reports count, mean, std, min,
# quartiles, and max for the numerical columns.
for heading, table in (
    ("\nFirst five rows:", df.head()),
    ("\nSummary Statistics:", df.describe()),
):
    print(heading)
    print(table)

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB

First five rows:
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa

Summary Statistics:
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000

# Check for missing values
# Count the null entries per column; a non-zero count marks a column that needs cleaning.
missing_counts = df.isna().sum()
print("\nMissing Values:")
print(missing_counts)

Missing Values:
sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

# Frequency table for each categorical variable
# Columns whose dtype is object or category are selected, then the occurrences
# of every distinct value in each of them are counted and printed.
categorical_cols = df.select_dtypes(include=["object", "category"]).columns
for col in categorical_cols:
    value_frequencies = df[col].value_counts()
    print(f"\nDistinct values and their frequency in '{col}':")
    print(value_frequencies)

Distinct values and their frequency in 'species':
species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

# Pairwise view of the numeric variables
# A grid of pairwise plots, coloured by the "species" column, makes relationships
# between features and the separation between species visible at a glance.
sns.pairplot(data=df, hue="species")
plt.show()

# Count outliers per numerical column with the IQR rule
# A value is an outlier when it lies more than 1.5 * IQR below the first
# quartile or more than 1.5 * IQR above the third quartile.
numeric_cols = df.select_dtypes(include=[np.number]).columns
numeric_data = df[numeric_cols]

Q1 = numeric_data.quantile(0.25)
Q3 = numeric_data.quantile(0.75)
IQR = Q3 - Q1

# Fences are Series indexed by column name, so the comparisons below align
# column by column against the numeric sub-frame.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (numeric_data < lower_fence) | (numeric_data > upper_fence)

outliers = is_outlier.sum()
print("\nNumber of outliers in each numerical column:")
print(outliers)

# Checking for outliers using boxplots
# Boxplots help in detecting outliers by visualizing the spread of the data and extreme values.
# Columns are selected by dtype (as the histogram and IQR steps do) instead of
# by position, so the result does not depend on the categorical column being last.
numeric_cols = df.select_dtypes(include=[np.number]).columns
# Two plots per row; keep the 2x2 layout for up to four columns and add rows
# beyond that so plt.subplot never receives an index outside the grid.
n_rows = max(2, (len(numeric_cols) + 1) // 2)
plt.figure(figsize=(12, 6))
for i, column in enumerate(numeric_cols):
    plt.subplot(n_rows, 2, i + 1)
    sns.boxplot(x=df[column])
    plt.title(f"Boxplot of {column}")
plt.tight_layout()
plt.show()

Number of outliers in each numerical column:
sepal_length    0
sepal_width     4
petal_length    0
petal_width     0
dtype: int64

# Plot histograms for numerical columns
# Histograms (20 bins, with a KDE curve overlaid) help in understanding the
# distribution of values in each numerical feature.
numeric_cols = df.select_dtypes(include=[np.number]).columns
# Two plots per row; keep the 2x2 layout for up to four columns and add rows
# beyond that so plt.subplot never receives an index outside the grid.
n_rows = max(2, (len(numeric_cols) + 1) // 2)
plt.figure(figsize=(12, 6))
for i, column in enumerate(numeric_cols):
    plt.subplot(n_rows, 2, i + 1)
    sns.histplot(df[column], bins=20, kde=True)
    plt.title(f"Histogram of {column}")
plt.tight_layout()
plt.show()

# Correlation matrix
# Pairwise correlations between the numerical columns, drawn as an annotated
# heatmap so the strength and direction of each relationship can be read off.
plt.figure(figsize=(10, 8))
numeric_data = df.select_dtypes(include=[np.number])
corr_matrix = numeric_data.corr()
# sns.heatmap draws on the current Axes and returns it, so the title is set
# on that returned Axes.
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
heatmap_ax.set_title("Correlation Matrix")
plt.show()

Number of outliers in each numerical column:
sepal_length    0
sepal_width     4
petal_length    0
petal_width     0
dtype: int64

Assignment 3: Creating Reports and Dashboards for Data Visualization and Advanced Analysis¶

Data Exploration and Preparation¶

Mario Zamudio (NF1002499)¶

Ahmed Eltahawi¶

Loading the Required Libraries¶

Loading Dataset and Basic information¶

Checking Missing Values¶

Reviewing Frequency of Categorical Variables¶

Reviewing Distribution of Numerical Variables¶

Boxplots to identify outliers in the numeric variables¶

Histograms to review the frequency and behavior¶

Heatmap Matrix to review correlation between variables¶