import numpy as np
import pandas as pd
df= pd.read_csv("/home/jovyan/MonDossier/python 2026/datasets/Food_Inspections_20250628.csv", engine ="python", on_bad_lines="skip")
df.head(5)
| Inspection ID | DBA Name | AKA Name | License # | Facility Type | Risk | Address | City | State | Zip | Inspection Date | Inspection Type | Results | Violations | Latitude | Longitude | Location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1068208 | CHINA COURT RESTAURANT | CHINA COURT RESTAURANT | 2141795.0 | Restaurant | Risk 1 (High) | 1146 N MILWAUKEE AVE | CHICAGO | IL | 60642.0 | 03/14/2012 | License Re-Inspection | Fail | 18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN... | 41.902462 | -87.665306 | (41.902462266949634, -87.66530609467256) |
| 1 | 1072213 | CUDDLE CARE | CUDDLE CARE | 1622366.0 | Daycare Above and Under 2 Years | Risk 1 (High) | 4800 S LAKE PARK AVE | CHICAGO | IL | 60615.0 | 10/22/2012 | Canvass | Pass | 31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC... | 41.807922 | -87.590693 | (41.80792179224785, -87.5906931090992) |
| 2 | 1072214 | CUDDLE CARE | CUDDLE CARE | 1622365.0 | Daycare Above and Under 2 Years | Risk 1 (High) | 4800 S LAKE PARK AVE | CHICAGO | IL | 60615.0 | 10/22/2012 | Canvass | Pass | 31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC... | 41.807922 | -87.590693 | (41.80792179224785, -87.5906931090992) |
| 3 | 1072228 | SHARKS FISH & CHICKEN | SHARKS FISH & CHICKEN | 2069562.0 | Restaurant | Risk 2 (Medium) | 101 E 51ST ST | CHICAGO | IL | 60615.0 | 10/26/2012 | Short Form Complaint | Pass | 34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO... | 41.801892 | -87.622566 | (41.80189221533366, -87.62256558837282) |
| 4 | 1072252 | SALAAM RESTAURANT AND BAKERY | SALAAM RESTAURANT AND BAKERY | 2141327.0 | Restaurant | Risk 1 (High) | 700-706 W 79TH ST | CHICAGO | IL | 60620.0 | 01/24/2013 | Canvass | Pass | 33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI... | 41.750787 | -87.641667 | (41.750787498480555, -87.64166664542023) |
df.rename(columns={"License #": "License"}, inplace = True)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 290761 entries, 0 to 290760 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Inspection ID 290761 non-null int64 1 DBA Name 290761 non-null object 2 AKA Name 288351 non-null object 3 License 290743 non-null float64 4 Facility Type 285536 non-null object 5 Risk 290678 non-null object 6 Address 290761 non-null object 7 City 290601 non-null object 8 State 290702 non-null object 9 Zip 290720 non-null float64 10 Inspection Date 290761 non-null object 11 Inspection Type 290760 non-null object 12 Results 290761 non-null object 13 Violations 209299 non-null object 14 Latitude 289762 non-null float64 15 Longitude 289762 non-null float64 16 Location 289762 non-null object dtypes: float64(4), int64(1), object(12) memory usage: 37.7+ MB
null_counts = df.isnull().sum()
rows = len(df)
null_percentage = ((null_counts/rows)*100).round(2)
print(null_percentage)
Inspection ID 0.00 DBA Name 0.00 AKA Name 0.83 License 0.01 Facility Type 1.80 Risk 0.03 Address 0.00 City 0.06 State 0.02 Zip 0.01 Inspection Date 0.00 Inspection Type 0.00 Results 0.00 Violations 28.02 Latitude 0.34 Longitude 0.34 Location 0.34 dtype: float64
null_counts
Inspection ID 0 DBA Name 0 AKA Name 2410 License 18 Facility Type 5225 Risk 83 Address 0 City 160 State 59 Zip 41 Inspection Date 0 Inspection Type 1 Results 0 Violations 81462 Latitude 999 Longitude 999 Location 999 dtype: int64
null_percentage[null_percentage>0.00].sort_values(ascending = False)
Violations 28.02 Facility Type 1.80 AKA Name 0.83 Latitude 0.34 Longitude 0.34 Location 0.34 City 0.06 Risk 0.03 State 0.02 License 0.01 Zip 0.01 dtype: float64
Strategy to deal with nan values
df= df.drop(columns=["License"])
df["Risk"] = df["Risk"].fillna("Unknown")
#Since we never have all 3 missing together, we can safely fill the missing values for each column independently without breaking logical consistency.
(df['Zip'].isnull() & df['City'].isnull() & df['State'].isnull()).sum()
0
#create a mapping table using dict
#we need state and city
#we need to drop null values
#drop duplicates to have unique values
#set_index to have state as the index
#set to dict
Zip_to_city = (df[['Zip','City']].dropna().drop_duplicates().set_index('Zip')['City'].to_dict())
df['City'] = df['City'].fillna(df['Zip'].map(Zip_to_city)) #df['Zip'].map(Zip_to_city) replaces ZIP codes with their City
df['City'].isnull().sum()
38
City_to_Zip = (df[['City','Zip']].dropna().drop_duplicates().set_index('City')['Zip'].to_dict())
df['Zip'] = df['Zip'].fillna(df['City'].map(City_to_Zip)) #df['Zip'].map(Zip_to_city) replaces ZIP codes with their City
df['Zip'].isnull().sum()
38
Zip_to_State = (df[['Zip','State']].dropna().drop_duplicates().set_index('Zip')['State'].to_dict())
df['State'] = df['State'].fillna(df['Zip'].map(Zip_to_State)) #df['Zip'].map(Zip_to_city) replaces ZIP codes with their City
df['State'].isnull().sum()
0
df['City']=df['City'].fillna("CHICAGO") #use Chicago fo fill in the cities
df['Zip']=df['Zip'].fillna("Unknown")
df.isnull().sum()
Inspection ID 0 DBA Name 0 AKA Name 2410 Facility Type 5225 Risk 0 Address 0 City 0 State 0 Zip 0 Inspection Date 0 Inspection Type 1 Results 0 Violations 81462 Latitude 999 Longitude 999 Location 999 dtype: int64
df[["DBA Name", "AKA Name"]]
| DBA Name | AKA Name | |
|---|---|---|
| 0 | CHINA COURT RESTAURANT | CHINA COURT RESTAURANT |
| 1 | CUDDLE CARE | CUDDLE CARE |
| 2 | CUDDLE CARE | CUDDLE CARE |
| 3 | SHARKS FISH & CHICKEN | SHARKS FISH & CHICKEN |
| 4 | SALAAM RESTAURANT AND BAKERY | SALAAM RESTAURANT AND BAKERY |
| ... | ... | ... |
| 290756 | MIC DUCK'S | MIC DUCK'S |
| 290757 | EL BASURERO RESTAURANT INC. | EL BASURERO RESTAURANT INC. |
| 290758 | SWEET PEA ACADEMY, INC. | SWEET PEA ACADEMY |
| 290759 | PETE'S PIZZERIA | PETE'S PIZZA #2 |
| 290760 | NEW MODERN GRILL INC | NEW MODERN GRILL |
290761 rows × 2 columns
(df['DBA Name'] == df['AKA Name']).mean()
0.727497841870127
This means:
In 73% of all rows, AKA Name is EXACTLY the same as DBA Name.
In the remaining 27% of rows, AKA Name is:
different
OR null
So AKA Name adds very little new information.
df= df.drop(columns=["AKA Name"])
df["Facility Type"].value_counts()
Restaurant 196349
Grocery Store 35421
School 18306
Children's Services Facility 6753
Bakery 4180
...
day spa 1
PROTEIN SHAKE BAR 1
HEALTH CENTER/NUTRITION CLASSES 1
GROCERY/LIQUOR STORE 1
Kids Cafe' 1
Name: Facility Type, Length: 520, dtype: int64
df["Facility Type"]= df["Facility Type"].fillna("Uknown")
df[["Violations"]]
| Violations | |
|---|---|
| 0 | 18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN... |
| 1 | 31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC... |
| 2 | 31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC... |
| 3 | 34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO... |
| 4 | 33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI... |
| ... | ... |
| 290756 | 10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI... |
| 290757 | 10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI... |
| 290758 | 1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW... |
| 290759 | 47. FOOD & NON-FOOD CONTACT SURFACES CLEANABLE... |
| 290760 | 3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E... |
290761 rows × 1 columns
df[df['Violations'].isna()]['Results'].value_counts(normalize=True)
#This selects only the rows where Violations is missing.
#From those filtered rows, you select ONLY the Results column:
Pass 0.454322 Out of Business 0.298593 No Entry 0.144227 Not Ready 0.047421 Fail 0.043640 Pass w/ Conditions 0.010668 Business Not Located 0.001129 Name: Results, dtype: float64
#df.loc[row_condition, column_name] = new_value
df[(df['Violations'].isna()) & (df['Results'] == "Fail")].head(2)
| Inspection ID | DBA Name | Facility Type | Risk | Address | City | State | Zip | Inspection Date | Inspection Type | Results | Violations | Latitude | Longitude | Location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 456 | 2613257 | UNICO LATIN FUSION CONTEMPORARY CUISINE | Restaurant | Risk 3 (Low) | 2435 S WESTERN AVE | CHICAGO | IL | 60608.0 | 03/06/2025 | License | Fail | NaN | 41.847385 | -87.685363 | (41.847385062646644, -87.68536343468041) |
| 2464 | 1473364 | THE PROMONTORY | Restaurant | Risk 3 (Low) | 5307-5319 S LAKE PARK AVE | CHICAGO | IL | 60615.0 | 06/05/2014 | License | Fail | NaN | 41.799318 | -87.587134 | (41.799317592141975, -87.58713414206618) |
df.loc[df['Violations'].isna() & (df['Results'] == "Fail"),'Violations'] = "Inspection failed but violation details are missing"
df.loc[(df['Violations'].isna()) & (df['Results'].isin(["Out of Business","No Entry","Not Ready"])), 'Violations'] = "No violation details (inspection not performed)"
df.loc[(df['Violations'].isna()) & (df['Results'] == "Pass w/ Conditions"), 'Violations'] = "Conditions present but violation details missing"
df.loc[(df['Violations'].isna()) & (df['Results'] == "Pass"), 'Violations'] = "No violation details recorded"
df.loc[(df['Violations'].isna()) & (df['Results'] == "Business Not Located"), 'Violations'] = "No violation details (business not located)"
keeping the null values as they are for the moment
df["Inspection Type"]= df["Inspection Type"].fillna("Uknown")
df.isnull().sum()
Inspection ID 0 DBA Name 0 Facility Type 0 Risk 0 Address 0 City 0 State 0 Zip 0 Inspection Date 0 Inspection Type 0 Results 0 Violations 0 Latitude 999 Longitude 999 Location 999 dtype: int64
df.to_csv("cleaned_file.csv", index=False)