import numpy as np
import pandas as pd
df= pd.read_csv("/home/jovyan/MonDossier/python 2026/datasets/Sample - Superstore.csv", encoding= "latin1") #engine ="python", on_bad_lines="skip", encoding = latin1)
df.head(5)
| Row ID | Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | ... | Postal Code | Region | Product ID | Category | Sub-Category | Product Name | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | CA-2016-152156 | 11/8/2016 | 11/11/2016 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | ... | 42420 | South | FUR-BO-10001798 | Furniture | Bookcases | Bush Somerset Collection Bookcase | 261.9600 | 2 | 0.00 | 41.9136 |
| 1 | 2 | CA-2016-152156 | 11/8/2016 | 11/11/2016 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | ... | 42420 | South | FUR-CH-10000454 | Furniture | Chairs | Hon Deluxe Fabric Upholstered Stacking Chairs,... | 731.9400 | 3 | 0.00 | 219.5820 |
| 2 | 3 | CA-2016-138688 | 6/12/2016 | 6/16/2016 | Second Class | DV-13045 | Darrin Van Huff | Corporate | United States | Los Angeles | ... | 90036 | West | OFF-LA-10000240 | Office Supplies | Labels | Self-Adhesive Address Labels for Typewriters b... | 14.6200 | 2 | 0.00 | 6.8714 |
| 3 | 4 | US-2015-108966 | 10/11/2015 | 10/18/2015 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | ... | 33311 | South | FUR-TA-10000577 | Furniture | Tables | Bretford CR4500 Series Slim Rectangular Table | 957.5775 | 5 | 0.45 | -383.0310 |
| 4 | 5 | US-2015-108966 | 10/11/2015 | 10/18/2015 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | ... | 33311 | South | OFF-ST-10000760 | Office Supplies | Storage | Eldon Fold 'N Roll Cart System | 22.3680 | 2 | 0.20 | 2.5164 |
5 rows × 21 columns
df =df.set_index("Row ID")
df.isnull().sum()
Order ID 0 Order Date 0 Ship Date 0 Ship Mode 0 Customer ID 0 Customer Name 0 Segment 0 Country 0 City 0 State 0 Postal Code 0 Region 0 Product ID 0 Category 0 Sub-Category 0 Product Name 0 Sales 0 Quantity 0 Discount 0 Profit 0 dtype: int64
df[df.duplicated()]
| Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | State | Postal Code | Region | Product ID | Category | Sub-Category | Product Name | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Row ID | ||||||||||||||||||||
| 3407 | US-2014-150119 | 4/23/2014 | 4/27/2014 | Standard Class | LB-16795 | Laurel Beltran | Home Office | United States | Columbus | Ohio | 43229 | East | FUR-CH-10002965 | Furniture | Chairs | Global Leather Highback Executive Chair with P... | 281.372 | 2 | 0.3 | -12.0588 |
df = df.drop_duplicates()
date_cols= ["Order Date", "Ship Date"]
for col in date_cols:
df[col]= pd.to_datetime(df[col], errors='coerce')
string_cols = ['Order ID','Ship Mode','Customer Name','Segment', 'Country', 'City', 'State', 'Postal Code',
'Region','Category', 'Sub-Category', 'Product Name','Customer ID']
for string_col in string_cols :
df[string_col] = df[string_col].astype("string")
df.groupby("Category")["Sales"].sum()
Category Furniture 741718.4233 Office Supplies 719047.0320 Technology 836154.0330 Name: Sales, dtype: float64
df.groupby("Category")["Discount"].mean().round(2)
Category Furniture 0.17 Office Supplies 0.16 Technology 0.13 Name: Discount, dtype: float64
df.groupby("Region")[["Sales","Profit"]].sum()
| Sales | Profit | |
|---|---|---|
| Region | ||
| Central | 501239.8908 | 39706.3625 |
| East | 678499.8680 | 91534.8388 |
| South | 391721.9050 | 46749.4303 |
| West | 725457.8245 | 108418.4489 |
df.groupby("Customer Name")["Sales"].sum().sort_values(ascending = False).head(5)
Customer Name Sean Miller 25043.050 Tamara Chand 19052.218 Raymond Buch 15117.339 Tom Ashbrook 14595.620 Adrian Barton 14473.571 Name: Sales, dtype: float64
df.groupby("Ship Mode")["Sales"].sum()
Ship Mode First Class 3.514284e+05 Same Day 1.283631e+05 Second Class 4.591936e+05 Standard Class 1.357934e+06 Name: Sales, dtype: float64
Segment_margin = (df.groupby("Segment")[["Sales", "Profit"]].sum())
Segment_margin["Margin %"] = Segment_margin["Profit"] / Segment_margin["Sales"]
Segment_margin.sort_values("Margin %", ascending=False).head(10).round(2)
| Sales | Profit | Margin % | |
|---|---|---|---|
| Segment | |||
| Home Office | 429371.78 | 60310.74 | 0.14 |
| Corporate | 706146.37 | 91979.13 | 0.13 |
| Consumer | 1161401.34 | 134119.21 | 0.12 |