import numpy as np
import pandas as pd


# Load the inspections CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(
    "/home/jovyan/MonDossier/python 2026/datasets/cleaned_file.csv"
)
df.head(n=5)


# Show the dtype pandas assigned to each column when the CSV was read.
df.dtypes

Inspection ID        int64
DBA Name            object
Facility Type       object
Risk                object
Address             object
City                object
State               object
Zip                 object
Inspection Date     object
Inspection Type     object
Results             object
Violations          object
Latitude           float64
Longitude          float64
Location            object
dtype: object


# Count the missing values in every column.
df.isna().sum()

Inspection ID        0
DBA Name             0
Facility Type        0
Risk                 0
Address              0
City                 0
State                0
Zip                  0
Inspection Date      0
Inspection Type      0
Results              0
Violations           0
Latitude           999
Longitude          999
Location           999
dtype: int64


# Parse "Inspection Date" into datetimes. With errors="coerce", values that
# cannot be parsed become NaT instead of raising.
# NOTE(review): the missing-value tally taken before this step cannot see
# NaT values introduced here - consider re-checking this column afterwards.
df["Inspection Date"] = pd.to_datetime(
    arg=df["Inspection Date"],
    errors="coerce",
)


# Columns to store as pandas "category" and columns to store as the
# dedicated pandas "string" dtype.
convert_to_categories = [
    "Facility Type",
    "Risk",
    "City",
    "State",
    "Inspection Type",
    "Results",
]
convert_to_string = ["DBA Name", "Address", "Violations", "Location", "Zip"]

# Cast every listed column in place: string columns first, then categories.
for target_dtype, columns in (
    ("string", convert_to_string),
    ("category", convert_to_categories),
):
    for column in columns:
        df[column] = df[column].astype(target_dtype)


# Display the column dtypes again to check the result of the conversions.
df.dtypes

Inspection ID               int64
DBA Name                   string
Facility Type            category
Risk                     category
Address                    string
City                     category
State                    category
Zip                        string
Inspection Date    datetime64[ns]
Inspection Type          category
Results                  category
Violations                 string
Latitude                  float64
Longitude                 float64
Location                   string
dtype: object


# List the distinct City values to reveal inconsistent casing and spellings.
df["City"].unique()

['CHICAGO', 'Chicago', 'chicago', 'ELK GROVE VILLAGE', 'CICERO', ..., 'BLUE ISLAND', 'GLENCOE', 'BROADVIEW', 'WORTH', 'Maywood']
Length: 87
Categories (87, object): ['312CHICAGO', 'ALGONQUIN', 'ALSIP', 'BANNOCKBURNDEERFIELD', ..., 'WORTH', 'alsip', 'chicago', 'chicagoBEDFORD PARK']


# Normalise City: trim leading/trailing whitespace, then lower-case and
# title-case so casing variants of the same name end up with one spelling.
df["City"] = (
    df["City"]
    .str.strip()
    .str.lower()
    .str.title()
)


# For the rows whose Zip starts with "606", tally the City spellings in use.
df.loc[df["Zip"].str.startswith("606"), "City"].value_counts()

Chicago                288395
Cchicago                   69
Ch                         18
Chicagochicago             11
Chicagoo                    9
Chicago.                    8
Inactive                    8
Charles A Hayes             7
312Chicago                  6
Chchicago                   6
Chcicago                    3
Chicagoi                    3
Alsip                       2
Burnham                     2
Chicagoc                    2
Chicagobedford Park         1
Name: City, dtype: int64


# Set City to "Chicago" on every row whose Zip starts with "606".
# NOTE(review): this overwrites every City value on those rows, not only the
# misspellings of Chicago - the preceding tally also lists values such as
# "Alsip", "Burnham" and "Inactive". Confirm that is intended.
df.loc[df["Zip"].str.startswith("606"), "City"] = "Chicago"


# Correct the remaining misspelled / concatenated city names. Series.replace
# matches whole values only, so e.g. "Evergreen Park" itself is left alone.
df["City"] = df["City"].replace(
    {
        "Oolympia Fields": "Olympia Fields",
        "Niles Niles": "Niles",
        "Evergreen": "Evergreen Park",
        "Bannockburndeerfield": "Deerfield",
    }
)


# List the distinct City values again to review the result of the clean-up.
df["City"].unique()

array(['Chicago', 'Elk Grove Village', 'Cicero', 'Schaumburg',
       'Bridgeview', 'Berwyn', 'Ripon', 'Evanston', 'Oak Park',
       'Grayslake', 'Burbank', 'Naperville', 'Torrance', 'Plainfield',
       'Wilmette', 'Highland Park', 'Calumet City', 'Elmhurst', 'Skokie',
       'Olympia Fields', 'Justice', 'Lombard', 'Streamwood',
       'Bolingbrook', 'Maywood', 'Frankfort', 'Merriville', 'Alsip',
       'Brookfield', 'Evergreen Park', 'Merrillville', 'Matteson',
       'Hammond', 'Western Springs', 'Lake Zurich', 'Whiting', 'Summit',
       'Glen Ellyn', 'Los Angeles', 'Morton Grove', 'Oak Lawn',
       'Algonquin', 'Griffith', 'New York', 'Niles', 'New Holstein',
       'Wadsworth', 'Lansing', 'Palos Park', 'Rosemont', 'Wheaton',
       'Lake Bluff', 'Schiller Park', 'Deerfield', 'Bloomingdale',
       'Norridge', 'Chicago Heights', 'East Hazel Crest', 'Tinley Park',
       'Westmont', 'Country Club Hills', 'Des Plaines', 'Blue Island',
       'Glencoe', 'Broadview', 'Worth'], dtype=object)


# Trim surrounding whitespace from Zip and store the result. .str.strip()
# returns a new Series, so without the assignment the column stays unchanged
# and the later end-anchored ".0" removal and length check would still see
# any trailing whitespace.
df["Zip"] = df["Zip"].str.strip()
# Display the stripped column.
df["Zip"]

0         60642.0
1         60615.0
2         60615.0
3         60615.0
4         60620.0
           ...   
290756    60618.0
290757    60632.0
290758    60620.0
290759    60634.0
290760    60657.0
Name: Zip, Length: 290761, dtype: string


# Remove a trailing ".0" from Zip (e.g. "60642.0" -> "60642"). The pattern
# is a literal "." followed by "0", anchored at the end of the string.
df["Zip"] = df["Zip"].str.replace(pat=r"\.0$", repl="", regex=True)


# Store the character length of each Zip in a helper column and tally the
# lengths to spot malformed zip codes.
# NOTE(review): the tally shows 38 values of length 7 that the visible code
# never inspects or fixes - look at those rows before relying on Zip.
df["zip_length"] = df["Zip"].str.len()
df["zip_length"].value_counts()

5    290723
7        38
Name: zip_length, dtype: Int64


# Store the character length of each State value in a helper column and
# tally the lengths as a format sanity check.
df["State_length"] = df["State"].str.len()
df["State_length"].value_counts()

2    290761
Name: State_length, dtype: int64


# List the distinct Risk values to check for inconsistent labels.
df["Risk"].unique()

['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All', 'Unknown']
Categories (5, object): ['All', 'Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'Unknown']


# Build a normalised copy of Facility Type:
#   1. lower-case, trim surrounding whitespace and title-case the text,
#   2. replace any "/" that has spaces before or after it with a bare "/".
# Both steps are chained on the same intermediate result; restarting step 2
# from the raw "Facility Type" column would throw away step 1.
df["Facility Type_cleaned"] = (
    df["Facility Type"]
    .str.lower()
    .str.strip()
    .str.title()
    .str.replace(r"\s*\/\s*", "/", regex=True)
)


# Build a normalised copy of Inspection Type:
#   1. lower-case the text and trim surrounding whitespace,
#   2. collapse every run of whitespace into a single space,
#   3. replace any "/" that has spaces before or after it with a bare "/",
#   4. replace any "-" that has spaces before or after it with a bare "-".
df["Inspection Type_cleaned"] = (
    df["Inspection Type"]
    .str.lower()
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
    .str.replace(r"\s*\/\s*", "/", regex=True)
    .str.replace(r"\s*-\s*", "-", regex=True)
)


# small typo fixes
df["Inspection Type_cleaned"] = df["Inspection Type_cleaned"].replace({
    "canvas": "canvass",
    "out ofbusiness": "out of business",
    "o.b.": "out of business",
    "taskforce": "task force",
    "liqour": "liquor",
    "kids cafe'": "kids cafe"
})
df["Inspection Type_cleaned"] = df["Inspection Type_cleaned"].str.replace("re inspection", "re-inspection")


# List the distinct Violations entries whose first character is not a digit;
# missing values are treated as non-matching (na=False).
df["Violations"][df["Violations"].str.contains(r"^\D", na=False)].unique()

<StringArray>
[                      'No violation details recorded',
     'No violation details (inspection not performed)',
 'Inspection failed but violation details are missing',
    'Conditions present but violation details missing',
         'No violation details (business not located)']
Length: 5, dtype: string

	Inspection ID	DBA Name	Facility Type	Risk	Address	City	State	Zip	Inspection Date	Inspection Type	Results	Violations	Latitude	Longitude	Location
0	1068208	CHINA COURT RESTAURANT	Restaurant	Risk 1 (High)	1146 N MILWAUKEE AVE	CHICAGO	IL	60642.0	03/14/2012	License Re-Inspection	Fail	18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...	41.902462	-87.665306	(41.902462266949634, -87.66530609467256)
1	1072213	CUDDLE CARE	Daycare Above and Under 2 Years	Risk 1 (High)	4800 S LAKE PARK AVE	CHICAGO	IL	60615.0	10/22/2012	Canvass	Pass	31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC...	41.807922	-87.590693	(41.80792179224785, -87.5906931090992)
2	1072214	CUDDLE CARE	Daycare Above and Under 2 Years	Risk 1 (High)	4800 S LAKE PARK AVE	CHICAGO	IL	60615.0	10/22/2012	Canvass	Pass	31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC...	41.807922	-87.590693	(41.80792179224785, -87.5906931090992)
3	1072228	SHARKS FISH & CHICKEN	Restaurant	Risk 2 (Medium)	101 E 51ST ST	CHICAGO	IL	60615.0	10/26/2012	Short Form Complaint	Pass	34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO...	41.801892	-87.622566	(41.80189221533366, -87.62256558837282)
4	1072252	SALAAM RESTAURANT AND BAKERY	Restaurant	Risk 1 (High)	700-706 W 79TH ST	CHICAGO	IL	60620.0	01/24/2013	Canvass	Pass	33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...	41.750787	-87.641667	(41.750787498480555, -87.64166664542023)

Data Types¶

Cleaning Inconsistent Data¶

1. City¶

2. Zip¶

3. Risk¶

4. Facility Type¶

5. Inspection Type¶

6. Violations¶