Loading Libraries to use¶
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
Loading Dataset and Basic information¶
In [2]:
# Read the CSV file into a Pandas DataFrame; the rest of the notebook works on `df`.
df = pd.read_csv("iris.csv")

# Structural overview: column names, data types and non-null counts.
# DataFrame.info() emits its own report, so it is called bare rather than printed.
print("Dataset Information:")
df.info()

# Peek at the leading rows to see the structure and contents of the data.
print("\nFirst five rows:", df.head(), sep="\n")

# Descriptive statistics (mean, standard deviation, min, max, ...) for the numerical columns.
print("\nSummary Statistics:", df.describe(), sep="\n")
Dataset Information: <class 'pandas.core.frame.DataFrame'> RangeIndex: 150 entries, 0 to 149 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sepal_length 150 non-null float64 1 sepal_width 150 non-null float64 2 petal_length 150 non-null float64 3 petal_width 150 non-null float64 4 species 150 non-null object dtypes: float64(4), object(1) memory usage: 6.0+ KB First five rows: sepal_length sepal_width petal_length petal_width species 0 5.1 3.5 1.4 0.2 setosa 1 4.9 3.0 1.4 0.2 setosa 2 4.7 3.2 1.3 0.2 setosa 3 4.6 3.1 1.5 0.2 setosa 4 5.0 3.6 1.4 0.2 setosa Summary Statistics: sepal_length sepal_width petal_length petal_width count 150.000000 150.000000 150.000000 150.000000 mean 5.843333 3.057333 3.758000 1.199333 std 0.828066 0.435866 1.765298 0.762238 min 4.300000 2.000000 1.000000 0.100000 25% 5.100000 2.800000 1.600000 0.300000 50% 5.800000 3.000000 4.350000 1.300000 75% 6.400000 3.300000 5.100000 1.800000 max 7.900000 4.400000 6.900000 2.500000
Checking Missing Values¶
In [3]:
# Count the missing entries in every column; a non-zero count means the
# column needs cleaning before further analysis.
print("\nMissing Values:", df.isna().sum(), sep="\n")
Missing Values: sepal_length 0 sepal_width 0 petal_length 0 petal_width 0 species 0 dtype: int64
Reviewing Frequency of Categorical Variables¶
In [10]:
# For every categorical column (object or category dtype), list each distinct
# value together with how often it occurs, to show how the categories are distributed.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
for col in categorical_cols:
    print(f"\nDistinct values and their frequency in '{col}':", df[col].value_counts(), sep="\n")
Distinct values and their frequency in 'species': species setosa 50 versicolor 50 virginica 50 Name: count, dtype: int64
Reviewing Distribution of Numerical Variables¶
In [4]:
# Pairwise plots of the numeric variables, coloured by species: shows how the
# features relate to one another and how the species cluster.
sns.pairplot(data=df, hue="species")
plt.show()
Boxplots to identify outliers in the numeric variables¶
In [16]:
# Identify outliers with the IQR (Interquartile Range) method, numerical columns only.
# A value counts as an outlier when it lies more than 1.5 * IQR below the first
# quartile or more than 1.5 * IQR above the third quartile of its own column.
numeric_cols = df.select_dtypes(include=[np.number]).columns
Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = ((df[numeric_cols] < lower_bound) | (df[numeric_cols] > upper_bound)).sum()
print("\nNumber of outliers in each numerical column:")
print(outliers)

# Visual check of the same columns with boxplots.
# The loop iterates `numeric_cols` (not a positional slice of df.columns) so the
# plots always cover exactly the columns counted above, wherever the
# non-numeric columns happen to sit in the frame.
# The grid height is derived from the number of columns so that more than four
# numeric columns do not overflow a fixed 2x2 layout; four columns still give
# a 2x2 grid on a 12x6 figure.
n_plot_cols = 2
n_plot_rows = max(1, int(np.ceil(len(numeric_cols) / n_plot_cols)))
plt.figure(figsize=(12, 3 * n_plot_rows))
for i, column in enumerate(numeric_cols):
    plt.subplot(n_plot_rows, n_plot_cols, i + 1)
    sns.boxplot(x=df[column])
    plt.title(f"Boxplot of {column}")
plt.tight_layout()
plt.show()
Number of outliers in each numerical column: sepal_length 0 sepal_width 4 petal_length 0 petal_width 0 dtype: int64
Histograms to review the frequency and behavior¶
In [7]:
# Histogram (with a KDE curve) for every numerical column, to show how the
# values of each feature are distributed.
# The subplot grid is sized from the number of numerical columns instead of a
# fixed 2x2 layout, so a frame with more than four numeric columns does not
# make plt.subplot fail; four columns still give a 2x2 grid on a 12x6 figure.
hist_columns = df.select_dtypes(include=[np.number]).columns
n_plot_cols = 2
n_plot_rows = max(1, int(np.ceil(len(hist_columns) / n_plot_cols)))
plt.figure(figsize=(12, 3 * n_plot_rows))
for i, column in enumerate(hist_columns):
    plt.subplot(n_plot_rows, n_plot_cols, i + 1)
    sns.histplot(df[column], bins=20, kde=True)
    plt.title(f"Histogram of {column}")
plt.tight_layout()
plt.show()
Heatmap Matrix to review correlation between variables¶
In [15]:
# Pairwise correlations between the numerical variables, drawn as an annotated
# heatmap to show the strength and direction of each relationship.
corr_matrix = df.select_dtypes(include="number").corr()

plt.figure(figsize=(10, 8))
sns.heatmap(data=corr_matrix, cmap="coolwarm", annot=True, fmt=".2f")
plt.title("Correlation Matrix")
plt.show()
In [14]:
Number of outliers in each numerical column: sepal_length 0 sepal_width 4 petal_length 0 petal_width 0 dtype: int64
In [ ]: