import numpy as np
import pandas as pd


df= pd.read_csv("/home/jovyan/MonDossier/python 2026/datasets/Food_Inspections_20250628.csv", engine ="python", on_bad_lines="skip")
df.head(5)


df.rename(columns={"License #": "License"}, inplace = True)


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290761 entries, 0 to 290760
Data columns (total 17 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Inspection ID    290761 non-null  int64  
 1   DBA Name         290761 non-null  object 
 2   AKA Name         288351 non-null  object 
 3   License          290743 non-null  float64
 4   Facility Type    285536 non-null  object 
 5   Risk             290678 non-null  object 
 6   Address          290761 non-null  object 
 7   City             290601 non-null  object 
 8   State            290702 non-null  object 
 9   Zip              290720 non-null  float64
 10  Inspection Date  290761 non-null  object 
 11  Inspection Type  290760 non-null  object 
 12  Results          290761 non-null  object 
 13  Violations       209299 non-null  object 
 14  Latitude         289762 non-null  float64
 15  Longitude        289762 non-null  float64
 16  Location         289762 non-null  object 
dtypes: float64(4), int64(1), object(12)
memory usage: 37.7+ MB


null_counts = df.isnull().sum()
rows = len(df)
null_percentage = ((null_counts/rows)*100).round(2)
print(null_percentage)

Inspection ID       0.00
DBA Name            0.00
AKA Name            0.83
License             0.01
Facility Type       1.80
Risk                0.03
Address             0.00
City                0.06
State               0.02
Zip                 0.01
Inspection Date     0.00
Inspection Type     0.00
Results             0.00
Violations         28.02
Latitude            0.34
Longitude           0.34
Location            0.34
dtype: float64


null_counts

Inspection ID          0
DBA Name               0
AKA Name            2410
License               18
Facility Type       5225
Risk                  83
Address                0
City                 160
State                 59
Zip                   41
Inspection Date        0
Inspection Type        1
Results                0
Violations         81462
Latitude             999
Longitude            999
Location             999
dtype: int64


null_percentage[null_percentage>0.00].sort_values(ascending = False)

Violations       28.02
Facility Type     1.80
AKA Name          0.83
Latitude          0.34
Longitude         0.34
Location          0.34
City              0.06
Risk              0.03
State             0.02
License           0.01
Zip               0.01
dtype: float64


df= df.drop(columns=["License"])


df["Risk"] = df["Risk"].fillna("Unknown")


#Since we never have all 3 missing together, we can safely fill the missing values for each column independently without breaking logical consistency.

(df['Zip'].isnull() & df['City'].isnull() & df['State'].isnull()).sum()

0


#create a mapping table using dict
#we need state and city 
#we need to drop null values 
#drop duplicates to have unique values 
#set_index to have state as the index
#set to dict


Zip_to_city = (df[['Zip','City']].dropna().drop_duplicates().set_index('Zip')['City'].to_dict())
df['City'] = df['City'].fillna(df['Zip'].map(Zip_to_city)) #df['Zip'].map(Zip_to_city) replaces ZIP codes with their City


df['City'].isnull().sum()

38


City_to_Zip = (df[['City','Zip']].dropna().drop_duplicates().set_index('City')['Zip'].to_dict())
df['Zip'] = df['Zip'].fillna(df['City'].map(City_to_Zip)) #df['Zip'].map(Zip_to_city) replaces ZIP codes with their City
df['Zip'].isnull().sum()

38


Zip_to_State = (df[['Zip','State']].dropna().drop_duplicates().set_index('Zip')['State'].to_dict())
df['State'] = df['State'].fillna(df['Zip'].map(Zip_to_State)) #df['Zip'].map(Zip_to_city) replaces ZIP codes with their City
df['State'].isnull().sum()

0


df['City']=df['City'].fillna("CHICAGO") #use Chicago fo fill in the cities

df['Zip']=df['Zip'].fillna("Unknown")


df.isnull().sum()

Inspection ID          0
DBA Name               0
AKA Name            2410
Facility Type       5225
Risk                   0
Address                0
City                   0
State                  0
Zip                    0
Inspection Date        0
Inspection Type        1
Results                0
Violations         81462
Latitude             999
Longitude            999
Location             999
dtype: int64


df[["DBA Name", "AKA Name"]]


(df['DBA Name'] == df['AKA Name']).mean()

0.727497841870127


df= df.drop(columns=["AKA Name"])


df["Facility Type"].value_counts()

Restaurant                         196349
Grocery Store                       35421
School                              18306
Children's Services Facility         6753
Bakery                               4180
                                    ...  
day spa                                 1
PROTEIN SHAKE BAR                       1
HEALTH CENTER/NUTRITION CLASSES         1
GROCERY/LIQUOR STORE                    1
Kids Cafe'                              1
Name: Facility Type, Length: 520, dtype: int64


df["Facility Type"]= df["Facility Type"].fillna("Uknown")


df[["Violations"]]


df[df['Violations'].isna()]['Results'].value_counts(normalize=True)
#This selects only the rows where Violations is missing.
#From those filtered rows, you select ONLY the Results column:

Pass                    0.454322
Out of Business         0.298593
No Entry                0.144227
Not Ready               0.047421
Fail                    0.043640
Pass w/ Conditions      0.010668
Business Not Located    0.001129
Name: Results, dtype: float64


#df.loc[row_condition, column_name] = new_value


df[(df['Violations'].isna()) & (df['Results'] == "Fail")].head(2)


df.loc[df['Violations'].isna() & (df['Results'] == "Fail"),'Violations'] = "Inspection failed but violation details are missing"


df.loc[(df['Violations'].isna()) & (df['Results'].isin(["Out of Business","No Entry","Not Ready"])), 'Violations'] = "No violation details (inspection not performed)"
df.loc[(df['Violations'].isna()) & (df['Results'] == "Pass w/ Conditions"), 'Violations'] = "Conditions present but violation details missing"

df.loc[(df['Violations'].isna()) & (df['Results'] == "Pass"), 'Violations'] = "No violation details recorded"
df.loc[(df['Violations'].isna()) & (df['Results'] == "Business Not Located"), 'Violations'] = "No violation details (business not located)"


df["Inspection Type"]= df["Inspection Type"].fillna("Uknown")


df.isnull().sum()

Inspection ID        0
DBA Name             0
Facility Type        0
Risk                 0
Address              0
City                 0
State                0
Zip                  0
Inspection Date      0
Inspection Type      0
Results              0
Violations           0
Latitude           999
Longitude          999
Location           999
dtype: int64


df.to_csv("cleaned_file.csv", index=False)

	Inspection ID	DBA Name	AKA Name	License #	Facility Type	Risk	Address	City	State	Zip	Inspection Date	Inspection Type	Results	Violations	Latitude	Longitude	Location
0	1068208	CHINA COURT RESTAURANT	CHINA COURT RESTAURANT	2141795.0	Restaurant	Risk 1 (High)	1146 N MILWAUKEE AVE	CHICAGO	IL	60642.0	03/14/2012	License Re-Inspection	Fail	18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...	41.902462	-87.665306	(41.902462266949634, -87.66530609467256)
1	1072213	CUDDLE CARE	CUDDLE CARE	1622366.0	Daycare Above and Under 2 Years	Risk 1 (High)	4800 S LAKE PARK AVE	CHICAGO	IL	60615.0	10/22/2012	Canvass	Pass	31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC...	41.807922	-87.590693	(41.80792179224785, -87.5906931090992)
2	1072214	CUDDLE CARE	CUDDLE CARE	1622365.0	Daycare Above and Under 2 Years	Risk 1 (High)	4800 S LAKE PARK AVE	CHICAGO	IL	60615.0	10/22/2012	Canvass	Pass	31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC...	41.807922	-87.590693	(41.80792179224785, -87.5906931090992)
3	1072228	SHARKS FISH & CHICKEN	SHARKS FISH & CHICKEN	2069562.0	Restaurant	Risk 2 (Medium)	101 E 51ST ST	CHICAGO	IL	60615.0	10/26/2012	Short Form Complaint	Pass	34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO...	41.801892	-87.622566	(41.80189221533366, -87.62256558837282)
4	1072252	SALAAM RESTAURANT AND BAKERY	SALAAM RESTAURANT AND BAKERY	2141327.0	Restaurant	Risk 1 (High)	700-706 W 79TH ST	CHICAGO	IL	60620.0	01/24/2013	Canvass	Pass	33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...	41.750787	-87.641667	(41.750787498480555, -87.64166664542023)

	DBA Name	AKA Name
0	CHINA COURT RESTAURANT	CHINA COURT RESTAURANT
1	CUDDLE CARE	CUDDLE CARE
2	CUDDLE CARE	CUDDLE CARE
3	SHARKS FISH & CHICKEN	SHARKS FISH & CHICKEN
4	SALAAM RESTAURANT AND BAKERY	SALAAM RESTAURANT AND BAKERY
...	...	...
290756	MIC DUCK'S	MIC DUCK'S
290757	EL BASURERO RESTAURANT INC.	EL BASURERO RESTAURANT INC.
290758	SWEET PEA ACADEMY, INC.	SWEET PEA ACADEMY
290759	PETE'S PIZZERIA	PETE'S PIZZA #2
290760	NEW MODERN GRILL INC	NEW MODERN GRILL

	Violations
0	18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
1	31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC...
2	31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC...
3	34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO...
4	33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...
...	...
290756	10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...
290757	10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...
290758	1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...
290759	47. FOOD & NON-FOOD CONTACT SURFACES CLEANABLE...
290760	3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...

	Inspection ID	DBA Name	Facility Type	Risk	Address	City	State	Zip	Inspection Date	Inspection Type	Results	Violations	Latitude	Longitude	Location
456	2613257	UNICO LATIN FUSION CONTEMPORARY CUISINE	Restaurant	Risk 3 (Low)	2435 S WESTERN AVE	CHICAGO	IL	60608.0	03/06/2025	License	Fail	NaN	41.847385	-87.685363	(41.847385062646644, -87.68536343468041)
2464	1473364	THE PROMONTORY	Restaurant	Risk 3 (Low)	5307-5319 S LAKE PARK AVE	CHICAGO	IL	60615.0	06/05/2014	License	Fail	NaN	41.799318	-87.587134	(41.799317592141975, -87.58713414206618)

Import lib¶

Read File¶

Missing values Analysis¶

Note¶

1. License¶

2. Risk¶

3. zip , city , state¶

4. AKA Name¶

5. Facitity Type¶

6. Violations¶

7. Long & lag¶

8. Inspection Type¶

Quick Check¶