Import lib¶

In [33]:
import numpy as np
import pandas as pd

Read File¶

In [34]:
df= pd.read_csv("/home/jovyan/MonDossier/python 2026/datasets/Food_Inspections_20250628.csv", engine ="python", on_bad_lines="skip")
df.head(5)
Out[34]:
Inspection ID DBA Name AKA Name License # Facility Type Risk Address City State Zip Inspection Date Inspection Type Results Violations Latitude Longitude Location
0 1068208 CHINA COURT RESTAURANT CHINA COURT RESTAURANT 2141795.0 Restaurant Risk 1 (High) 1146 N MILWAUKEE AVE CHICAGO IL 60642.0 03/14/2012 License Re-Inspection Fail 18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN... 41.902462 -87.665306 (41.902462266949634, -87.66530609467256)
1 1072213 CUDDLE CARE CUDDLE CARE 1622366.0 Daycare Above and Under 2 Years Risk 1 (High) 4800 S LAKE PARK AVE CHICAGO IL 60615.0 10/22/2012 Canvass Pass 31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC... 41.807922 -87.590693 (41.80792179224785, -87.5906931090992)
2 1072214 CUDDLE CARE CUDDLE CARE 1622365.0 Daycare Above and Under 2 Years Risk 1 (High) 4800 S LAKE PARK AVE CHICAGO IL 60615.0 10/22/2012 Canvass Pass 31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC... 41.807922 -87.590693 (41.80792179224785, -87.5906931090992)
3 1072228 SHARKS FISH & CHICKEN SHARKS FISH & CHICKEN 2069562.0 Restaurant Risk 2 (Medium) 101 E 51ST ST CHICAGO IL 60615.0 10/26/2012 Short Form Complaint Pass 34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO... 41.801892 -87.622566 (41.80189221533366, -87.62256558837282)
4 1072252 SALAAM RESTAURANT AND BAKERY SALAAM RESTAURANT AND BAKERY 2141327.0 Restaurant Risk 1 (High) 700-706 W 79TH ST CHICAGO IL 60620.0 01/24/2013 Canvass Pass 33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI... 41.750787 -87.641667 (41.750787498480555, -87.64166664542023)
In [35]:
df.rename(columns={"License #": "License"}, inplace = True)
In [36]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290761 entries, 0 to 290760
Data columns (total 17 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Inspection ID    290761 non-null  int64  
 1   DBA Name         290761 non-null  object 
 2   AKA Name         288351 non-null  object 
 3   License          290743 non-null  float64
 4   Facility Type    285536 non-null  object 
 5   Risk             290678 non-null  object 
 6   Address          290761 non-null  object 
 7   City             290601 non-null  object 
 8   State            290702 non-null  object 
 9   Zip              290720 non-null  float64
 10  Inspection Date  290761 non-null  object 
 11  Inspection Type  290760 non-null  object 
 12  Results          290761 non-null  object 
 13  Violations       209299 non-null  object 
 14  Latitude         289762 non-null  float64
 15  Longitude        289762 non-null  float64
 16  Location         289762 non-null  object 
dtypes: float64(4), int64(1), object(12)
memory usage: 37.7+ MB

Missing values Analysis¶

In [37]:
null_counts = df.isnull().sum()
rows = len(df)
null_percentage = ((null_counts/rows)*100).round(2)
print(null_percentage)
Inspection ID       0.00
DBA Name            0.00
AKA Name            0.83
License             0.01
Facility Type       1.80
Risk                0.03
Address             0.00
City                0.06
State               0.02
Zip                 0.01
Inspection Date     0.00
Inspection Type     0.00
Results             0.00
Violations         28.02
Latitude            0.34
Longitude           0.34
Location            0.34
dtype: float64
In [38]:
null_counts
Out[38]:
Inspection ID          0
DBA Name               0
AKA Name            2410
License               18
Facility Type       5225
Risk                  83
Address                0
City                 160
State                 59
Zip                   41
Inspection Date        0
Inspection Type        1
Results                0
Violations         81462
Latitude             999
Longitude            999
Location             999
dtype: int64
In [39]:
null_percentage[null_percentage>0.00].sort_values(ascending = False)
Out[39]:
Violations       28.02
Facility Type     1.80
AKA Name          0.83
Latitude          0.34
Longitude         0.34
Location          0.34
City              0.06
Risk              0.03
State             0.02
License           0.01
Zip               0.01
dtype: float64

Note¶

Strategy to deal with nan values

  • license => drop
  • Risk => fill in
  • zip , city , state => fill in
  • AKA name => drop
  • Facility Type => fill in
  • Violations => Nan could mean no violation has occured => to be checked

1. License¶

In [40]:
df= df.drop(columns=["License"])

2. Risk¶

In [41]:
df["Risk"] = df["Risk"].fillna("Unknown")

3. zip , city , state¶

In [42]:
#Since we never have all 3 missing together, we can safely fill the missing values for each column independently without breaking logical consistency.

(df['Zip'].isnull() & df['City'].isnull() & df['State'].isnull()).sum()
Out[42]:
0
In [43]:
#create a mapping table using dict
#we need state and city 
#we need to drop null values 
#drop duplicates to have unique values 
#set_index to have state as the index
#set to dict
In [44]:
Zip_to_city = (df[['Zip','City']].dropna().drop_duplicates().set_index('Zip')['City'].to_dict())
df['City'] = df['City'].fillna(df['Zip'].map(Zip_to_city)) #df['Zip'].map(Zip_to_city) replaces ZIP codes with their City
In [45]:
df['City'].isnull().sum()
Out[45]:
38
In [46]:
City_to_Zip = (df[['City','Zip']].dropna().drop_duplicates().set_index('City')['Zip'].to_dict())
df['Zip'] = df['Zip'].fillna(df['City'].map(City_to_Zip)) #df['Zip'].map(Zip_to_city) replaces ZIP codes with their City
df['Zip'].isnull().sum()
Out[46]:
38
In [47]:
Zip_to_State = (df[['Zip','State']].dropna().drop_duplicates().set_index('Zip')['State'].to_dict())
df['State'] = df['State'].fillna(df['Zip'].map(Zip_to_State)) #df['Zip'].map(Zip_to_city) replaces ZIP codes with their City
df['State'].isnull().sum()
Out[47]:
0
In [48]:
df['City']=df['City'].fillna("CHICAGO") #use Chicago fo fill in the cities

df['Zip']=df['Zip'].fillna("Unknown")
In [49]:
df.isnull().sum()
Out[49]:
Inspection ID          0
DBA Name               0
AKA Name            2410
Facility Type       5225
Risk                   0
Address                0
City                   0
State                  0
Zip                    0
Inspection Date        0
Inspection Type        1
Results                0
Violations         81462
Latitude             999
Longitude            999
Location             999
dtype: int64

4. AKA Name¶

In [50]:
df[["DBA Name", "AKA Name"]]
Out[50]:
DBA Name AKA Name
0 CHINA COURT RESTAURANT CHINA COURT RESTAURANT
1 CUDDLE CARE CUDDLE CARE
2 CUDDLE CARE CUDDLE CARE
3 SHARKS FISH & CHICKEN SHARKS FISH & CHICKEN
4 SALAAM RESTAURANT AND BAKERY SALAAM RESTAURANT AND BAKERY
... ... ...
290756 MIC DUCK'S MIC DUCK'S
290757 EL BASURERO RESTAURANT INC. EL BASURERO RESTAURANT INC.
290758 SWEET PEA ACADEMY, INC. SWEET PEA ACADEMY
290759 PETE'S PIZZERIA PETE'S PIZZA #2
290760 NEW MODERN GRILL INC NEW MODERN GRILL

290761 rows × 2 columns

In [51]:
(df['DBA Name'] == df['AKA Name']).mean()
Out[51]:
0.727497841870127

This means:

In 73% of all rows, AKA Name is EXACTLY the same as DBA Name.

In the remaining 27% of rows, AKA Name is:

  • different

  • OR null

So AKA Name adds very little new information.

In [52]:
df= df.drop(columns=["AKA Name"])

5. Facitity Type¶

In [53]:
df["Facility Type"].value_counts()
Out[53]:
Restaurant                         196349
Grocery Store                       35421
School                              18306
Children's Services Facility         6753
Bakery                               4180
                                    ...  
day spa                                 1
PROTEIN SHAKE BAR                       1
HEALTH CENTER/NUTRITION CLASSES         1
GROCERY/LIQUOR STORE                    1
Kids Cafe'                              1
Name: Facility Type, Length: 520, dtype: int64
In [54]:
df["Facility Type"]= df["Facility Type"].fillna("Uknown")

6. Violations¶

In [55]:
df[["Violations"]]
Out[55]:
Violations
0 18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
1 31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC...
2 31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC...
3 34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO...
4 33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...
... ...
290756 10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...
290757 10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...
290758 1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...
290759 47. FOOD & NON-FOOD CONTACT SURFACES CLEANABLE...
290760 3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...

290761 rows × 1 columns

In [56]:
df[df['Violations'].isna()]['Results'].value_counts(normalize=True)
#This selects only the rows where Violations is missing.
#From those filtered rows, you select ONLY the Results column:
Out[56]:
Pass                    0.454322
Out of Business         0.298593
No Entry                0.144227
Not Ready               0.047421
Fail                    0.043640
Pass w/ Conditions      0.010668
Business Not Located    0.001129
Name: Results, dtype: float64
In [57]:
#df.loc[row_condition, column_name] = new_value
In [58]:
df[(df['Violations'].isna()) & (df['Results'] == "Fail")].head(2)
Out[58]:
Inspection ID DBA Name Facility Type Risk Address City State Zip Inspection Date Inspection Type Results Violations Latitude Longitude Location
456 2613257 UNICO LATIN FUSION CONTEMPORARY CUISINE Restaurant Risk 3 (Low) 2435 S WESTERN AVE CHICAGO IL 60608.0 03/06/2025 License Fail NaN 41.847385 -87.685363 (41.847385062646644, -87.68536343468041)
2464 1473364 THE PROMONTORY Restaurant Risk 3 (Low) 5307-5319 S LAKE PARK AVE CHICAGO IL 60615.0 06/05/2014 License Fail NaN 41.799318 -87.587134 (41.799317592141975, -87.58713414206618)
In [59]:
df.loc[df['Violations'].isna() & (df['Results'] == "Fail"),'Violations'] = "Inspection failed but violation details are missing"
In [60]:
df.loc[(df['Violations'].isna()) & (df['Results'].isin(["Out of Business","No Entry","Not Ready"])), 'Violations'] = "No violation details (inspection not performed)"
df.loc[(df['Violations'].isna()) & (df['Results'] == "Pass w/ Conditions"), 'Violations'] = "Conditions present but violation details missing"

df.loc[(df['Violations'].isna()) & (df['Results'] == "Pass"), 'Violations'] = "No violation details recorded"
df.loc[(df['Violations'].isna()) & (df['Results'] == "Business Not Located"), 'Violations'] = "No violation details (business not located)"

7. Long & lag¶

keeping the null values as they are for the moment

8. Inspection Type¶

In [61]:
df["Inspection Type"]= df["Inspection Type"].fillna("Uknown")

Quick Check¶

In [62]:
df.isnull().sum()
Out[62]:
Inspection ID        0
DBA Name             0
Facility Type        0
Risk                 0
Address              0
City                 0
State                0
Zip                  0
Inspection Date      0
Inspection Type      0
Results              0
Violations           0
Latitude           999
Longitude          999
Location           999
dtype: int64
In [65]:
df.to_csv("cleaned_file.csv", index=False)