DATA Types¶

In [97]:
import numpy as np
import pandas as pd
In [98]:
df= pd.read_csv("/home/jovyan/MonDossier/python 2026/datasets/cleaned_file.csv")
df.head(5)
Out[98]:
Inspection ID DBA Name Facility Type Risk Address City State Zip Inspection Date Inspection Type Results Violations Latitude Longitude Location
0 1068208 CHINA COURT RESTAURANT Restaurant Risk 1 (High) 1146 N MILWAUKEE AVE CHICAGO IL 60642.0 03/14/2012 License Re-Inspection Fail 18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN... 41.902462 -87.665306 (41.902462266949634, -87.66530609467256)
1 1072213 CUDDLE CARE Daycare Above and Under 2 Years Risk 1 (High) 4800 S LAKE PARK AVE CHICAGO IL 60615.0 10/22/2012 Canvass Pass 31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC... 41.807922 -87.590693 (41.80792179224785, -87.5906931090992)
2 1072214 CUDDLE CARE Daycare Above and Under 2 Years Risk 1 (High) 4800 S LAKE PARK AVE CHICAGO IL 60615.0 10/22/2012 Canvass Pass 31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC... 41.807922 -87.590693 (41.80792179224785, -87.5906931090992)
3 1072228 SHARKS FISH & CHICKEN Restaurant Risk 2 (Medium) 101 E 51ST ST CHICAGO IL 60615.0 10/26/2012 Short Form Complaint Pass 34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO... 41.801892 -87.622566 (41.80189221533366, -87.62256558837282)
4 1072252 SALAAM RESTAURANT AND BAKERY Restaurant Risk 1 (High) 700-706 W 79TH ST CHICAGO IL 60620.0 01/24/2013 Canvass Pass 33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI... 41.750787 -87.641667 (41.750787498480555, -87.64166664542023)
In [99]:
df.dtypes
Out[99]:
Inspection ID        int64
DBA Name            object
Facility Type       object
Risk                object
Address             object
City                object
State               object
Zip                 object
Inspection Date     object
Inspection Type     object
Results             object
Violations          object
Latitude           float64
Longitude          float64
Location            object
dtype: object
In [100]:
df.isnull().sum()
Out[100]:
Inspection ID        0
DBA Name             0
Facility Type        0
Risk                 0
Address              0
City                 0
State                0
Zip                  0
Inspection Date      0
Inspection Type      0
Results              0
Violations           0
Latitude           999
Longitude          999
Location           999
dtype: int64
In [101]:
df['Inspection Date'] = pd.to_datetime(df['Inspection Date'], errors='coerce')
In [102]:
convert_to_categories = ['Facility Type', 'Risk','City','State','Inspection Type','Results']
convert_to_string = ['DBA Name', 'Address','Violations','Location','Zip']

for x in convert_to_string:
    df[x] = df[x].astype('string')
    
for y in convert_to_categories:
    df[y] = df[y].astype('category')
In [103]:
df.dtypes
Out[103]:
Inspection ID               int64
DBA Name                   string
Facility Type            category
Risk                     category
Address                    string
City                     category
State                    category
Zip                        string
Inspection Date    datetime64[ns]
Inspection Type          category
Results                  category
Violations                 string
Latitude                  float64
Longitude                 float64
Location                   string
dtype: object

Cleaning inconsistent Data¶

1. City¶

In [104]:
df["City"].unique()
Out[104]:
['CHICAGO', 'Chicago', 'chicago', 'ELK GROVE VILLAGE', 'CICERO', ..., 'BLUE ISLAND', 'GLENCOE', 'BROADVIEW', 'WORTH', 'Maywood']
Length: 87
Categories (87, object): ['312CHICAGO', 'ALGONQUIN', 'ALSIP', 'BANNOCKBURNDEERFIELD', ..., 'WORTH', 'alsip', 'chicago', 'chicagoBEDFORD PARK']
In [105]:
df['City'] = df['City'].str.strip().str.lower().str.title()
#str.strip is to remove space
In [106]:
df[df["Zip"].str.startswith("606")]["City"].value_counts()
Out[106]:
Chicago                288395
Cchicago                   69
Ch                         18
Chicagochicago             11
Chicagoo                    9
Chicago.                    8
Inactive                    8
Charles A Hayes             7
312Chicago                  6
Chchicago                   6
Chcicago                    3
Chicagoi                    3
Alsip                       2
Burnham                     2
Chicagoc                    2
Chicagobedford Park         1
Name: City, dtype: int64
In [107]:
df.loc[df["Zip"].str.startswith("606"), "City"] = "Chicago"
In [108]:
df["City"]= df["City"].replace("Oolympia Fields","Olympia Fields")
df["City"]= df["City"].replace("Niles Niles","Niles")
df["City"]= df["City"].replace("Evergreen","Evergreen Park")
df["City"]= df["City"].replace("Bannockburndeerfield","Deerfield")
In [109]:
df["City"].unique()
Out[109]:
array(['Chicago', 'Elk Grove Village', 'Cicero', 'Schaumburg',
       'Bridgeview', 'Berwyn', 'Ripon', 'Evanston', 'Oak Park',
       'Grayslake', 'Burbank', 'Naperville', 'Torrance', 'Plainfield',
       'Wilmette', 'Highland Park', 'Calumet City', 'Elmhurst', 'Skokie',
       'Olympia Fields', 'Justice', 'Lombard', 'Streamwood',
       'Bolingbrook', 'Maywood', 'Frankfort', 'Merriville', 'Alsip',
       'Brookfield', 'Evergreen Park', 'Merrillville', 'Matteson',
       'Hammond', 'Western Springs', 'Lake Zurich', 'Whiting', 'Summit',
       'Glen Ellyn', 'Los Angeles', 'Morton Grove', 'Oak Lawn',
       'Algonquin', 'Griffith', 'New York', 'Niles', 'New Holstein',
       'Wadsworth', 'Lansing', 'Palos Park', 'Rosemont', 'Wheaton',
       'Lake Bluff', 'Schiller Park', 'Deerfield', 'Bloomingdale',
       'Norridge', 'Chicago Heights', 'East Hazel Crest', 'Tinley Park',
       'Westmont', 'Country Club Hills', 'Des Plaines', 'Blue Island',
       'Glencoe', 'Broadview', 'Worth'], dtype=object)

2. Zip¶

In [110]:
df["Zip"].str.strip()
Out[110]:
0         60642.0
1         60615.0
2         60615.0
3         60615.0
4         60620.0
           ...   
290756    60618.0
290757    60632.0
290758    60620.0
290759    60634.0
290760    60657.0
Name: Zip, Length: 290761, dtype: string
In [111]:
df["Zip"] = df["Zip"].str.replace(r"\.0$", "", regex=True) #match a literal . followed by a literal 0 at the end of the string
In [112]:
df["zip_length"] = df["Zip"].str.len()
df["zip_length"].value_counts()
Out[112]:
5    290723
7        38
Name: zip_length, dtype: Int64
In [113]:
df["State_length"] = df["State"].str.len()
df["State_length"].value_counts()
Out[113]:
2    290761
Name: State_length, dtype: int64

3. Risk¶

In [114]:
df["Risk"].unique()
Out[114]:
['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All', 'Unknown']
Categories (5, object): ['All', 'Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'Unknown']

4. Facility Type¶

In [125]:
df["Facility Type_cleaned"] = df["Facility Type"].str.lower().str.strip().str.title()
#replace any / that may have spaces before or after it with a clean slash (no space)
df["Facility Type_cleaned"] = df["Facility Type"].str.replace(r"\s*\/\s*","/", regex = True) 

5. Inspection Type¶

In [116]:
df["Inspection Type_cleaned"] = df["Inspection Type"].str.lower().str.strip()
In [117]:
#replace one or many white spaces into one 
df["Inspection Type_cleaned"] = df["Inspection Type_cleaned"].str.replace(r"\s+", " ", regex=True)
In [118]:
#replace any / that may have spaces before or after it with a clean slash (no space)

df["Inspection Type_cleaned"] = df["Inspection Type_cleaned"].str.replace(r"\s*\/\s*", "/", regex=True)
In [119]:
#replce any - that may have spaces before or after it with a clean - 
df["Inspection Type_cleaned"] = df["Inspection Type_cleaned"].str.replace(r"\s*-\s*", "-", regex=True)
In [120]:
# small typo fixes
df["Inspection Type_cleaned"] = df["Inspection Type_cleaned"].replace({
    "canvas": "canvass",
    "out ofbusiness": "out of business",
    "o.b.": "out of business",
    "taskforce": "task force",
    "liqour": "liquor",
    "kids cafe'": "kids cafe"
})
df["Inspection Type_cleaned"] = df["Inspection Type_cleaned"].str.replace("re inspection", "re-inspection")

6. violations¶

In [137]:
df.loc[df["Violations"].str.contains(r"^\D", na= False), "Violations"].unique()
Out[137]:
<StringArray>
[                      'No violation details recorded',
     'No violation details (inspection not performed)',
 'Inspection failed but violation details are missing',
    'Conditions present but violation details missing',
         'No violation details (business not located)']
Length: 5, dtype: string