import numpy as np
import pandas as pd
# Load the cleaned inspections CSV and preview its first five rows.
csv_path = "/home/jovyan/MonDossier/python 2026/datasets/cleaned_file.csv"
df = pd.read_csv(csv_path)
df.head()
| Inspection ID | DBA Name | Facility Type | Risk | Address | City | State | Zip | Inspection Date | Inspection Type | Results | Violations | Latitude | Longitude | Location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1068208 | CHINA COURT RESTAURANT | Restaurant | Risk 1 (High) | 1146 N MILWAUKEE AVE | CHICAGO | IL | 60642.0 | 03/14/2012 | License Re-Inspection | Fail | 18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN... | 41.902462 | -87.665306 | (41.902462266949634, -87.66530609467256) |
| 1 | 1072213 | CUDDLE CARE | Daycare Above and Under 2 Years | Risk 1 (High) | 4800 S LAKE PARK AVE | CHICAGO | IL | 60615.0 | 10/22/2012 | Canvass | Pass | 31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC... | 41.807922 | -87.590693 | (41.80792179224785, -87.5906931090992) |
| 2 | 1072214 | CUDDLE CARE | Daycare Above and Under 2 Years | Risk 1 (High) | 4800 S LAKE PARK AVE | CHICAGO | IL | 60615.0 | 10/22/2012 | Canvass | Pass | 31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC... | 41.807922 | -87.590693 | (41.80792179224785, -87.5906931090992) |
| 3 | 1072228 | SHARKS FISH & CHICKEN | Restaurant | Risk 2 (Medium) | 101 E 51ST ST | CHICAGO | IL | 60615.0 | 10/26/2012 | Short Form Complaint | Pass | 34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO... | 41.801892 | -87.622566 | (41.80189221533366, -87.62256558837282) |
| 4 | 1072252 | SALAAM RESTAURANT AND BAKERY | Restaurant | Risk 1 (High) | 700-706 W 79TH ST | CHICAGO | IL | 60620.0 | 01/24/2013 | Canvass | Pass | 33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI... | 41.750787 | -87.641667 | (41.750787498480555, -87.64166664542023) |
# Show the dtype pandas inferred for each column right after loading.
df.dtypes
Inspection ID int64 DBA Name object Facility Type object Risk object Address object City object State object Zip object Inspection Date object Inspection Type object Results object Violations object Latitude float64 Longitude float64 Location object dtype: object
# Count the missing (null) values in each column.
df.isnull().sum()
Inspection ID 0 DBA Name 0 Facility Type 0 Risk 0 Address 0 City 0 State 0 Zip 0 Inspection Date 0 Inspection Type 0 Results 0 Violations 0 Latitude 999 Longitude 999 Location 999 dtype: int64
# Parse the inspection dates; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
df['Inspection Date'] = pd.to_datetime(df['Inspection Date'], errors='coerce')

# Columns to store as pandas 'category' and as pandas 'string' respectively.
convert_to_categories = ['Facility Type', 'Risk', 'City', 'State', 'Inspection Type', 'Results']
convert_to_string = ['DBA Name', 'Address', 'Violations', 'Location', 'Zip']

# The loop bodies must be indented under their `for` headers; without the
# indentation this cell fails with an IndentationError.
for column in convert_to_string:
    df[column] = df[column].astype('string')
for column in convert_to_categories:
    df[column] = df[column].astype('category')

# Confirm the resulting dtypes.
df.dtypes
Inspection ID int64 DBA Name string Facility Type category Risk category Address string City category State category Zip string Inspection Date datetime64[ns] Inspection Type category Results category Violations string Latitude float64 Longitude float64 Location string dtype: object
# List the distinct City values to spot spelling and casing variants.
df["City"].unique()
['CHICAGO', 'Chicago', 'chicago', 'ELK GROVE VILLAGE', 'CICERO', ..., 'BLUE ISLAND', 'GLENCOE', 'BROADVIEW', 'WORTH', 'Maywood'] Length: 87 Categories (87, object): ['312CHICAGO', 'ALGONQUIN', 'ALSIP', 'BANNOCKBURNDEERFIELD', ..., 'WORTH', 'alsip', 'chicago', 'chicagoBEDFORD PARK']
# Normalise city names: trim surrounding whitespace, then lower-case and
# title-case so that casing variants collapse onto a single spelling.
city_clean = df['City'].str.strip()
city_clean = city_clean.str.lower().str.title()
df['City'] = city_clean

# Count the city labels attached to ZIP codes that begin with "606".
has_606_zip = df["Zip"].str.startswith("606")
df.loc[has_606_zip, "City"].value_counts()
Chicago 288395 Cchicago 69 Ch 18 Chicagochicago 11 Chicagoo 9 Chicago. 8 Inactive 8 Charles A Hayes 7 312Chicago 6 Chchicago 6 Chcicago 3 Chicagoi 3 Alsip 2 Burnham 2 Chicagoc 2 Chicagobedford Park 1 Name: City, dtype: int64
# Every row whose ZIP starts with "606" is relabelled as Chicago, which
# collapses misspelled variants such as "Cchicago" and "Chicagoo".
# NOTE(review): this also overwrites names like "Alsip" and "Burnham" that
# appear with 606xx ZIPs - confirm that is intended.
df.loc[df["Zip"].str.startswith("606"), "City"] = "Chicago"

# Exact-match corrections for the remaining misspelled or merged city names.
# "Merriville" is included because it otherwise survives alongside the
# correctly spelled "Merrillville" as a separate city.
city_fixes = {
    "Oolympia Fields": "Olympia Fields",
    "Niles Niles": "Niles",
    "Evergreen": "Evergreen Park",
    "Bannockburndeerfield": "Deerfield",
    "Merriville": "Merrillville",
}
df["City"] = df["City"].replace(city_fixes)

# Review the cleaned set of city names.
df["City"].unique()
array(['Chicago', 'Elk Grove Village', 'Cicero', 'Schaumburg',
'Bridgeview', 'Berwyn', 'Ripon', 'Evanston', 'Oak Park',
'Grayslake', 'Burbank', 'Naperville', 'Torrance', 'Plainfield',
'Wilmette', 'Highland Park', 'Calumet City', 'Elmhurst', 'Skokie',
'Olympia Fields', 'Justice', 'Lombard', 'Streamwood',
'Bolingbrook', 'Maywood', 'Frankfort', 'Merriville', 'Alsip',
'Brookfield', 'Evergreen Park', 'Merrillville', 'Matteson',
'Hammond', 'Western Springs', 'Lake Zurich', 'Whiting', 'Summit',
'Glen Ellyn', 'Los Angeles', 'Morton Grove', 'Oak Lawn',
'Algonquin', 'Griffith', 'New York', 'Niles', 'New Holstein',
'Wadsworth', 'Lansing', 'Palos Park', 'Rosemont', 'Wheaton',
'Lake Bluff', 'Schiller Park', 'Deerfield', 'Bloomingdale',
'Norridge', 'Chicago Heights', 'East Hazel Crest', 'Tinley Park',
'Westmont', 'Country Club Hills', 'Des Plaines', 'Blue Island',
'Glencoe', 'Broadview', 'Worth'], dtype=object)
# .str.strip() returns a new Series, so the result has to be assigned back;
# calling it on its own leaves the Zip column unchanged.
df["Zip"] = df["Zip"].str.strip()
df["Zip"]
0 60642.0
1 60615.0
2 60615.0
3 60615.0
4 60620.0
...
290756 60618.0
290757 60632.0
290758 60620.0
290759 60634.0
290760 60657.0
Name: Zip, Length: 290761, dtype: string
# Drop a trailing ".0" (a literal dot followed by a literal zero at the very
# end of the string) from the ZIP codes.
zip_without_decimal = df["Zip"].str.replace(r"\.0$", "", regex=True)
df["Zip"] = zip_without_decimal

# Record each ZIP's length and tabulate the lengths to spot malformed values.
df["zip_length"] = zip_without_decimal.str.len()
df["zip_length"].value_counts()
5 290723 7 38 Name: zip_length, dtype: Int64
# Record the length of every State value and tabulate the lengths.
state_lengths = df["State"].str.len()
df["State_length"] = state_lengths
df["State_length"].value_counts()
2 290761 Name: State_length, dtype: int64
# List the distinct Risk labels present in the data.
df["Risk"].unique()
['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All', 'Unknown'] Categories (5, object): ['All', 'Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'Unknown']
# Build a cleaned facility-type column in one chain so no step is lost:
#   1. lower-case, trim surrounding whitespace, then title-case;
#   2. replace any "/" that may have spaces before or after it with a bare "/".
# The slash replacement must run on the already-normalised values; restarting
# from the raw "Facility Type" column would discard step 1.
df["Facility Type_cleaned"] = (
    df["Facility Type"]
    .str.lower()
    .str.strip()
    .str.title()
    .str.replace(r"\s*\/\s*", "/", regex=True)
)
# Whole-value typo corrections applied after the whitespace/punctuation cleanup.
inspection_typo_fixes = {
    "canvas": "canvass",
    "out ofbusiness": "out of business",
    "o.b.": "out of business",
    "taskforce": "task force",
    "liqour": "liquor",
    "kids cafe'": "kids cafe",
}

# Build the cleaned inspection-type column as a single pipeline.
inspection_type_clean = (
    df["Inspection Type"]
    .str.lower()
    .str.strip()
    # collapse each run of whitespace into a single space
    .str.replace(r"\s+", " ", regex=True)
    # replace any "/" that may have spaces around it with a bare "/"
    .str.replace(r"\s*\/\s*", "/", regex=True)
    # replace any "-" that may have spaces around it with a bare "-"
    .str.replace(r"\s*-\s*", "-", regex=True)
    # small typo fixes (exact match on the whole value)
    .replace(inspection_typo_fixes)
    # hyphenate "re inspection" wherever it occurs inside a value
    .str.replace("re inspection", "re-inspection")
)
df["Inspection Type_cleaned"] = inspection_type_clean
# List the distinct Violations values whose first character is not a digit;
# missing values are treated as non-matching (na=False).
df.loc[df["Violations"].str.contains(r"^\D", na= False), "Violations"].unique()
<StringArray>
[ 'No violation details recorded',
'No violation details (inspection not performed)',
'Inspection failed but violation details are missing',
'Conditions present but violation details missing',
'No violation details (business not located)']
Length: 5, dtype: string