Exploratory Data Analysis and Visualization in Python

In this project, I used a dataset from a supermarket chain. The dataset contains invoice-level sales records (branch, city, customer type, gender, product line, unit price, quantity, payment method, customer rating, and more), capturing various aspects of the business. I conducted an exploratory analysis and examined the correlations between variables. The goal of this project was to uncover insights that could help the supermarket chain enhance its operations.

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

import calmap

from pandas_profiling import ProfileReport

# Location of the raw supermarket sales export (CSV).
file_path = "C:/Users/chris/OneDrive/Desktop/supermarket_sales.csv"

# Read the whole file into a DataFrame; every later step works on `df`.
df = pd.read_csv(filepath_or_buffer=file_path)

#DATA EXPLORATION

# Size of the dataset: one row per record in the CSV.
num_rows = len(df.index)
print(f"Number of rows: {num_rows}")

Number of rows: 1000

#First 5 rows of the dataset
df.head(n=5)

#Final 5 rows of the dataset
df.tail(n=5)

#Names of all columns available for analysis
df.columns

Index(['Invoice ID', 'Branch', 'City', 'Customer type', 'Gender', 'Product line', 'Unit price', 'Quantity', 'Tax 5%', 'Total', 'Date', 'Time', 'Payment', 'cogs', 'gross margin percentage', 'gross income', 'Rating'], dtype='object')

#Data type of every column as loaded from the CSV
df.dtypes

#Parse the 'Date' column into datetime values, then promote it to the
#index of the dataframe so that rows can be grouped by date later on
df['Date'] = pd.to_datetime(df['Date'])
df.set_index(keys='Date', inplace=True)

#Preview the first 5 rows again, now indexed by date
df.head(n=5)

#Summary statistics of the numerical columns
df.describe()

#UNIVARIATE ANALYSIS

#How customer ratings are distributed
sns.displot(data=df['Rating'])

#One histogram per numerical column, drawn on a 10x10 inch figure
df.hist(figsize=(10, 10))

#Number of records per branch
branch_ax = sns.countplot(x='Branch', data=df)
branch_ax.set_xlabel('Branch')
branch_ax.set_ylabel('Count')
branch_ax.set_title('Count of Occurrences by Branch')
plt.show()

#Count of occurrences by branch (as a table, most frequent branch first)
df['Branch'].value_counts(sort=True, ascending=False)

A 340

B 332

C 328

Name: Branch, dtype: int64

#Count of occurrences by payment method
sns.countplot(data=df, x='Payment')
# The x-axis shows payment methods, so label it 'Payment'
# (it was previously mislabelled 'Branch').
plt.xlabel('Payment')
plt.ylabel('Count')
# Title added for consistency with the branch count plot.
plt.title('Count of Occurrences by Payment')
plt.show()

#BIVARIATE ANALYSIS

#Gross Income vs. Customer Rating: scatter plot with a red regression line
sns.regplot(data=df, x='Rating', y='gross income', line_kws=dict(color='red'))

#Observation: based on the scatterplot above, there is no relationship between rating and gross income

#Gross Income per Branch:
sns.boxplot(data=df, x='Branch', y='gross income')

#Observation: based on the boxplot, there is no relationship between those two variables

#Gross Income per Gender:
sns.boxplot(data=df, x='Gender', y='gross income')

#Observation: based on the boxplot, on average both genders spend almost the same

#Based on the boxplot on average both spend almost the same

#Time Trend in Gross Income

#To check for trends, the rows sharing the same index value are aggregated
#with the GROUPBY function, and .mean() calculates the mean value of each
#numeric column for each group.
#numeric_only=True is required because the dataframe also holds text columns
#(e.g. 'Branch', 'City'); without it, pandas >= 2.0 raises a TypeError when
#it tries to average them.
grouped_df = df.groupby(df.index).mean(numeric_only=True)

plt.figure(figsize=(10, 6)) # Set the figure size

# One point per distinct index value: the mean gross income of that group.
sns.lineplot(x=grouped_df.index, y=grouped_df['gross income'])
plt.xlabel('Index')
plt.ylabel('Mean Gross Income')

#CHECK FOR DUPLICATES

#Boolean Series: True for every row that repeats an earlier row (the first occurrence stays False)
df.duplicated(keep='first')

0 False

1 False

2 False

3 False

4 False

...

995 False

996 False

997 False

998 False

999 False

Length: 1000, dtype: bool

#Every value shown in the Boolean Series is False, i.e. none of the displayed rows is marked as a duplicate.
#The display is truncated, so to verify that there are no duplicates anywhere in the dataset,
#let's count the True values instead:
df.duplicated(keep='first').sum()

0 #No duplicated values

#CHECK FOR MISSING VALUES

#Number of missing entries in each column
df.isnull().sum()

#Visual check: heatmap of the missing-value mask (colour bar hidden)
sns.heatmap(data=df.isna(), cbar=False)

#Observation: the plot does not show any white lines in the box, indicating that there are no missing values in the dataset.

#CORRELATION ANALYSIS

#Correlation is only defined for numeric data, so keep the numeric columns.
#Calling df.corr() on the full dataframe raises a ValueError on pandas >= 2.0
#because of the text columns (e.g. 'Branch', 'City'); older versions dropped
#those columns silently, so the result is the same as before.
numeric_df = df.select_dtypes(include='number')

#Pairwise correlation between the numeric columns (computed once, reused below)
corr_matrix = numeric_df.corr()
corr_matrix

#Round the correlation to 2 decimals:
rounded_corr = corr_matrix.round(2)
rounded_corr

#Plot the correlation:
sns.heatmap(rounded_corr, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.xlabel('Features')
plt.ylabel('Features')