In [40]:
import numpy as np
import pandas as pd
In [41]:
df= pd.read_csv("/home/jovyan/MonDossier/python 2026/datasets/Sample - Superstore.csv", encoding= "latin1") #engine ="python", on_bad_lines="skip", encoding = latin1)
df.head(5)
Out[41]:
Row ID Order ID Order Date Ship Date Ship Mode Customer ID Customer Name Segment Country City ... Postal Code Region Product ID Category Sub-Category Product Name Sales Quantity Discount Profit
0 1 CA-2016-152156 11/8/2016 11/11/2016 Second Class CG-12520 Claire Gute Consumer United States Henderson ... 42420 South FUR-BO-10001798 Furniture Bookcases Bush Somerset Collection Bookcase 261.9600 2 0.00 41.9136
1 2 CA-2016-152156 11/8/2016 11/11/2016 Second Class CG-12520 Claire Gute Consumer United States Henderson ... 42420 South FUR-CH-10000454 Furniture Chairs Hon Deluxe Fabric Upholstered Stacking Chairs,... 731.9400 3 0.00 219.5820
2 3 CA-2016-138688 6/12/2016 6/16/2016 Second Class DV-13045 Darrin Van Huff Corporate United States Los Angeles ... 90036 West OFF-LA-10000240 Office Supplies Labels Self-Adhesive Address Labels for Typewriters b... 14.6200 2 0.00 6.8714
3 4 US-2015-108966 10/11/2015 10/18/2015 Standard Class SO-20335 Sean O'Donnell Consumer United States Fort Lauderdale ... 33311 South FUR-TA-10000577 Furniture Tables Bretford CR4500 Series Slim Rectangular Table 957.5775 5 0.45 -383.0310
4 5 US-2015-108966 10/11/2015 10/18/2015 Standard Class SO-20335 Sean O'Donnell Consumer United States Fort Lauderdale ... 33311 South OFF-ST-10000760 Office Supplies Storage Eldon Fold 'N Roll Cart System 22.3680 2 0.20 2.5164

5 rows × 21 columns

In [42]:
df =df.set_index("Row ID")
In [43]:
df.isnull().sum()
Out[43]:
Order ID         0
Order Date       0
Ship Date        0
Ship Mode        0
Customer ID      0
Customer Name    0
Segment          0
Country          0
City             0
State            0
Postal Code      0
Region           0
Product ID       0
Category         0
Sub-Category     0
Product Name     0
Sales            0
Quantity         0
Discount         0
Profit           0
dtype: int64
In [44]:
df[df.duplicated()]
Out[44]:
Order ID Order Date Ship Date Ship Mode Customer ID Customer Name Segment Country City State Postal Code Region Product ID Category Sub-Category Product Name Sales Quantity Discount Profit
Row ID
3407 US-2014-150119 4/23/2014 4/27/2014 Standard Class LB-16795 Laurel Beltran Home Office United States Columbus Ohio 43229 East FUR-CH-10002965 Furniture Chairs Global Leather Highback Executive Chair with P... 281.372 2 0.3 -12.0588
In [45]:
df = df.drop_duplicates()

Fix Data Types¶

In [46]:
date_cols= ["Order Date", "Ship Date"]
for col in date_cols:
    df[col]= pd.to_datetime(df[col], errors='coerce')
In [47]:
string_cols = ['Order ID','Ship Mode','Customer Name','Segment', 'Country', 'City', 'State', 'Postal Code',
       'Region','Category', 'Sub-Category', 'Product Name','Customer ID']
for string_col in string_cols :
    df[string_col] = df[string_col].astype("string")

Sales by Category¶

In [71]:
df.groupby("Category")["Sales"].sum()
Out[71]:
Category
Furniture          741718.4233
Office Supplies    719047.0320
Technology         836154.0330
Name: Sales, dtype: float64

Average Discount by Category¶

In [83]:
df.groupby("Category")["Discount"].mean().round(2)
Out[83]:
Category
Furniture          0.17
Office Supplies    0.16
Technology         0.13
Name: Discount, dtype: float64

Sales & Profit by Region¶

In [70]:
df.groupby("Region")[["Sales","Profit"]].sum()
Out[70]:
Sales Profit
Region
Central 501239.8908 39706.3625
East 678499.8680 91534.8388
South 391721.9050 46749.4303
West 725457.8245 108418.4489

TOp 5 Customers¶

In [86]:
df.groupby("Customer Name")["Sales"].sum().sort_values(ascending = False).head(5)
Out[86]:
Customer Name
Sean Miller      25043.050
Tamara Chand     19052.218
Raymond Buch     15117.339
Tom Ashbrook     14595.620
Adrian Barton    14473.571
Name: Sales, dtype: float64

Sales by ship mode¶

In [88]:
df.groupby("Ship Mode")["Sales"].sum()
Out[88]:
Ship Mode
First Class       3.514284e+05
Same Day          1.283631e+05
Second Class      4.591936e+05
Standard Class    1.357934e+06
Name: Sales, dtype: float64

Margin by segment¶

In [99]:
Segment_margin = (df.groupby("Segment")[["Sales", "Profit"]].sum())
Segment_margin["Margin %"] = Segment_margin["Profit"] / Segment_margin["Sales"]
Segment_margin.sort_values("Margin %", ascending=False).head(10).round(2)
Out[99]:
Sales Profit Margin %
Segment
Home Office 429371.78 60310.74 0.14
Corporate 706146.37 91979.13 0.13
Consumer 1161401.34 134119.21 0.12