https://www.mindsumo.com/contests/sfpd-dispatch
In this Jupyter notebook, we conduct an exploratory data analysis of the SFPD Dispatch dataset provided by Capital One. Jupyter notebooks allow us to run snippets of code at a time instead of entire scripts, which is handy for data analysis. I'll be using Python alongside various tools such as pandas (an open source data science library that loads data into interactive Series and DataFrames, allowing us to query it like one would in SQL) to look for trends in the data, and calculate/display these using numpy, matplotlib and seaborn, other open source Python data science libraries. These are useful because they can directly generate visualizations that we can use on our website of this analysis, which will be here: https://shamystic.github.io/sfpd_analysis/
Note that code for generating heatmaps can be found at: https://github.com/shamystic/sfpd_analysis/blob/master/Heatmaps.ipynb. The heatmaps render in live javascript via the Google Maps API and thus cannot be viewed on a static HTML page.
Thanks for reading!
Shamikh (shamystic)
Contact: ssh50@duke.edu
# Imports
# NumPy and Pandas
import pandas as pd
import numpy as np
from numpy import median, mean
# Data Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
%matplotlib inline
# Google Maps
import gmaps
import gmaps.datasets
# Other
import operator
from collections import Counter
# Settings
# Display full column widths to read schema descriptions
pd.options.display.max_colwidth = 200
# Read in data and convert to pandas dataframes
# NOTE(review): paths are relative to the notebook's working directory — assumes a local `data/` folder.
df = pd.read_csv('data/sfpd_dispatch_data_subset.csv')
schema_df = pd.read_csv('data/sfpd_dispatch_schema.csv')
# Preview first 5 samples of data
df.head()
# Check features and non-null value counts
df.info()
# From this we find we have no neighborhood district information :(
# Preview the schema descriptions (full column width was enabled above so text is not truncated)
schema_df.head()
# make a table with types of data
From the schema, we can get to know our features better and divide them into groups:
call_number, integer
unit_id, string
incident_number, integer
row_id, string
address, string
box, string
battalion, string
station_area, string
fire_prevention_district, string
supervisor_district, string
city, string
location, string
latitude, float
longitude, float
call_date, date
watch_date, date
received_timestamp, timestamp
entry_timestamp, timestamp
dispatch_timestamp,timestamp
response_timestamp,timestamp
on_scene_timestamp,timestamp,Date and time the unit records arriving to the location of the incident
transport_timestamp, timestamp
hospital_timestamp, timestamp
available_timestamp, timestamp Can use http://strftime.org/ and convert to Pandas timestamps.
call_type, string
call_final_disposition, string
original_priority, string
priority, string
final_priority, integer
als_unit, boolean
call_type_group, string
number_of_alarms, integer
unit_sequence_in_call_dispatch, integer
# Get counts (number of occurences) for call type feature
df.call_type_group.value_counts()
# Plot counts of the 20 most common call types as a horizontal bar chart
df['call_type'].value_counts()[:20].plot(kind='barh', figsize=[8,5])
plt.xlabel('Frequency', fontsize = 12)
plt.title('Call Type Frequency')
plt.ylabel('Call Type', fontsize = 12)
#plt.savefig('types.png')
plt.show()
# View values for one row/one emergency call sample
df.iloc[0]
# View values of call_final_disposition (set() gives the distinct values)
set(df['call_final_disposition'])
# Plot counts of emergency calls by zipcode (top 30)
df['zipcode_of_incident'].value_counts()[:30].plot(kind='barh', figsize=[10,6])
plt.xlabel('Frequency', fontsize = 12)
plt.title('Zipcode Frequency')
plt.ylabel('Zipcode of Incident', fontsize = 12)
plt.savefig('assets/zips_freq.png')
df['unit_type'].value_counts()
df['call_type'].value_counts()
# Timestamps load as plain strings — checked here, motivating the conversion below
type(df['dispatch_timestamp'][0])
# Time columns are strings...not too useful. Let's convert them to Pandas Timestamp objects (equivalent to
# datetime objects in Python) to be able to perform useful computations (like time differences!)
# Make a copy of the dataframe to play with (remove nan neighborhood district column while we're at it)
dfc = df.drop(['neighborhood_district'], axis = 1)
# Identify time columns
time_columns = ['received_timestamp', 'entry_timestamp', 'dispatch_timestamp', 'response_timestamp',
'on_scene_timestamp', 'transport_timestamp', 'hospital_timestamp', 'available_timestamp']
# Here we apply a lambda expression to each time column to remove the 'UTC', and then we use the to_datetime
# function in pandas to convert the types to python Timestamps instead of strings.
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Timestamp.html
# NOTE(review): x[:-3] drops only the 3 chars 'UTC'; if values end in ' UTC' a trailing space
# remains before parsing with the explicit format — TODO confirm against the raw data.
for col in time_columns:
    dfc[col] = pd.to_datetime(dfc[col].astype(str).apply(lambda x: x[:-3]), format='%Y-%m-%d %H:%M:%S.%f')
dfc.head()
type(dfc['dispatch_timestamp'][0])
# Great! This allows us to perform new operations.
# Test
# Create a new pandas column that is the time difference (timedelta) between received_timestamp and dispatch timestamp
dfc['received_to_dispatch'] = dfc['dispatch_timestamp'] - dfc['received_timestamp']
dfc['received_to_dispatch'].head()
"""
Function min_timedelta
Get the time differential in seconds between two TimeStamp objects.
Args:
t1: This is the time that occured first.
t2: This is the time that occured after t1.
Returns: The timedelta in float minutes.
"""
def min_timedelta(t1, t2):
return (t2 - t1).seconds/60
# Sanity-check the helper on the first record.
# BUG FIX: the original passed (dispatch, received) — the later time first —
# so the negative delta wrapped into a huge positive value under .seconds.
# Pass the earlier timestamp first, per the function's contract.
print (min_timedelta(dfc['received_timestamp'][0], dfc['dispatch_timestamp'][0]))
# Create new columns
# timedelta of time call is received at the 911 Dispatch Center, and time the 911 operator dispatches a unit to call.
# NOTE(review): .seconds wraps negative/multi-day deltas; assumes dispatch never
# precedes receipt and gaps stay under a day — TODO confirm on the data.
dfc['received_to_dispatch'] = dfc['dispatch_timestamp'] - dfc['received_timestamp']
dfc['received_to_dispatch'] = dfc['received_to_dispatch'].apply(lambda x: x.seconds/60)
# timedelta of time call is received at the 911 Dispatch Center, and time the 911 operator arrives on scene.
dfc['received_to_onscene'] = dfc['on_scene_timestamp'] - dfc['received_timestamp']
dfc['received_to_onscene'] = dfc['received_to_onscene'].apply(lambda x: x.seconds/60)
# View descriptive statistics for new column, the response time in minutes
dfc['received_to_dispatch'].describe()
print (dfc['received_to_onscene'].describe())
# Max seems to be an outlier...let's see the 99th percentile
dfc['received_to_onscene'].quantile(0.99)
# It's definitely an outlier
# Quick check of distribution of response time
dfc['received_to_onscene'].plot(kind='hist', bins = np.arange(start=0, stop=40, step=2), figsize=[10,6])
# `bins` defines the bin edges; np.arange creates the edge list [0, 2, ..., 38]
dfc['received_to_dispatch'].plot(kind='hist', bins = np.arange(start=0, stop=40, step=2), figsize=[10,6])
We now have the useful statistic of response time and more, and can calculate typical/average values for different groupings as we please. However, it's important to note that by its very nature this dataset can contain extreme outliers — emergencies can be highly variable — so it is useful to compute both means (sensitive to outliers) and medians (robust to outliers) in our analysis.
# Compute average response time for each zip code
def get_group_averages(df, group_by_feature: str, value: str):
    """Average a statistic within each group, ex: average time to arrive per zipcode.

    Args:
        df: DataFrame object to use for calculation.
        group_by_feature: feature to group by, ex: 'zipcode_of_incident'.
        value: feature to average for each group, ex: 'received_to_onscene'.

    Returns:
        A list of (group, avg_value) tuples sorted by average, descending.
    """
    # groupby().mean() skips NaN values within each group (matching the old
    # per-group np.nanmean) and aggregates in a single pass instead of one
    # O(n) boolean filter per group. NaN group keys are dropped by default.
    means = df.groupby(group_by_feature)[value].mean()
    return sorted(means.items(), key=operator.itemgetter(1), reverse=True)
# Same as above, but computes the per-group median (robust to outliers)
def get_group_median(df, group_by_feature: str, value: str):
    """Median of `value` within each group of `group_by_feature`.

    Returns:
        A list of (group, median_value) tuples sorted by median, descending.
    """
    # Single-pass groupby aggregation; NaN values are skipped within groups,
    # matching the old per-group np.nanmedian.
    medians = df.groupby(group_by_feature)[value].median()
    return sorted(medians.items(), key=operator.itemgetter(1), reverse=True)
"""
Function get_sorted_bar
Get a bargraph with sorted averages, using list of tuples created by get_group_averages.
Args:
group_averages: list of tuples (group, avg_value)
filename: string name desired for filename and plot
zipcode: Plotting zipcodes? Default false, as zipcode labels need more space on x-axis, this sets larger plotsize
Returns: Outputs and saves the bar plot (returns None)
"""
def get_sorted_bar(group_averages, filename: str, zipcode = False):
group_averages = [x for x in group_averages if x[0] != 'None']
X = [x[0] for x in group_averages]
Y = [x[1] for x in group_averages]
if zip:
plt.figure(figsize=(20,10))
else:
plt.figure(figsize=(10,5))
plt.bar(range(len(group_averages)), Y)
plt.xticks(range(len(group_averages)), X)
plt.xlabel('Time in Minutes')
plt.title(filename)
# plt.savefig('Medians/' + filename + '.png')
# Mean received-to-onscene time per grouping, plotted sorted descending
avg_received_onscene_time = get_group_averages(dfc, 'zipcode_of_incident', 'received_to_onscene')
get_sorted_bar(avg_received_onscene_time, 'On Scene Time Averages By Zipcode', zipcode = True)
battalion_avgs_onscene = get_group_averages(dfc, 'battalion', 'received_to_onscene')
get_sorted_bar(battalion_avgs_onscene, 'On Scene Time Averages by Battalion')
station_area_avgs_onscene = get_group_averages(dfc, 'station_area', 'received_to_onscene')
get_sorted_bar(station_area_avgs_onscene, 'On Scene Time Averages by Station Area')
fire_prevention_district_avgs_onscene = get_group_averages(dfc, 'fire_prevention_district', 'received_to_onscene')
get_sorted_bar(fire_prevention_district_avgs_onscene, 'On Scene Time Averages by Fire Prevention District')
supervisor_district_avgs_onscene = get_group_averages(dfc, 'supervisor_district', 'received_to_onscene')
get_sorted_bar(supervisor_district_avgs_onscene, 'On Scene Time Averages by Supervisor District')
city_avgs_onscene = get_group_averages(dfc, 'city', 'received_to_onscene')
get_sorted_bar(city_avgs_onscene, 'On Scene Time Averages by City')
# Inspect the raw sorted (group, mean) tuples
supervisor_district_avgs_onscene
# Now let's see times to dispatch
zip_received_dispatch_time = get_group_averages(dfc, 'zipcode_of_incident', 'received_to_dispatch')
get_sorted_bar(zip_received_dispatch_time, 'Dispatch Time Averages By Zipcode', zipcode = True)
battalion_avgs_dispatch = get_group_averages(dfc, 'battalion', 'received_to_dispatch')
get_sorted_bar(battalion_avgs_dispatch, 'Dispatch Time Averages by Battalion')
station_area_avgs_dispatch = get_group_averages(dfc, 'station_area', 'received_to_dispatch')
get_sorted_bar(station_area_avgs_dispatch, 'Dispatch Time Averages by Station Area')
fire_prevention_district_avgs_dispatch = get_group_averages(dfc, 'fire_prevention_district', 'received_to_dispatch')
get_sorted_bar(fire_prevention_district_avgs_dispatch, 'Dispatch Time Averages by Fire Prevention District')
supervisor_district_avgs_dispatch = get_group_averages(dfc, 'supervisor_district', 'received_to_dispatch')
get_sorted_bar(supervisor_district_avgs_dispatch, 'Dispatch Time Averages by Supervisor District')
city_avgs_dispatch = get_group_averages(dfc, 'city', 'received_to_dispatch')
get_sorted_bar(city_avgs_dispatch, 'Dispatch Time Averages by City')
# We can now answer: Which areas take the longest time to dispatch to on average?
# Zipcode 94129, Batallion B99, City Presidio, Station Areas 51, 20
# Get longest time areas: keep the 4 slowest groups from each sorted-descending list
top_onscene = {'avg_received_onscene_time' : avg_received_onscene_time[:4], 'battalion_avgs_onscene' : battalion_avgs_onscene[:4],
'station_area_avgs_onscene' : station_area_avgs_onscene[:4],
'fire_prevention_district_avgs_onscene': fire_prevention_district_avgs_onscene[:4],
'supervisor_district_avgs_onscene' : supervisor_district_avgs_onscene[:4],
'city_avgs_onscene': city_avgs_onscene[:4]}
top_dispatch = {'zip_received_dispatch_time' : zip_received_dispatch_time[:4], 'battalion_avgs_dispatch' : battalion_avgs_dispatch[:4],
'station_area_avgs_dispatch' : station_area_avgs_dispatch[:4],
'fire_prevention_district_avgs_dispatch': fire_prevention_district_avgs_dispatch[:4],
'supervisor_district_avgs_dispatch' : supervisor_district_avgs_dispatch[:4],
'city_avgs_dispatch': city_avgs_dispatch[:4]}
print (top_onscene)
print (top_dispatch)
# View the call type distribution of all incidents in zipcode 94127
# normalize lets us view proportions
dfc[dfc['zipcode_of_incident'] == 94127]['call_type'].value_counts(normalize = True)
# Mostly medical incidents!
dfc[dfc['zipcode_of_incident'] == 94127]['unit_type'].value_counts(normalize = True)
dfc[dfc['zipcode_of_incident'] == 94105]['call_type'].value_counts(normalize = True)
dfc[dfc['zipcode_of_incident'] == 94105]['unit_type'].value_counts(normalize = True)
dfc[dfc['supervisor_district'] == 7]['call_type'].value_counts(normalize = True)
dfc[dfc['supervisor_district'] == 7]['unit_type'].value_counts(normalize = True)
dfc['call_type'].value_counts()
dfc['zipcode_of_incident'].value_counts()
dfc['station_area'].value_counts()
dfc[dfc['zipcode_of_incident'] == 94129]['call_type'].value_counts(normalize = True)
# Safest- 94129, Presidio (Battalion B99), Treasure Island, 94130
dfc[dfc['zipcode_of_incident'] == 94130]['call_type'].value_counts(normalize = True)
# NOTE(review): the schema lists station_area as a string; comparing to int 20
# may match nothing if the column loaded as strings — TODO confirm dtype.
dfc[dfc['station_area'] == 20]['zipcode_of_incident'].value_counts(normalize = True)
# Place with least amount of alarms (burglaries etc) is indicative of safe place
dfc[dfc['call_type'] == 'Alarms']['zipcode_of_incident'].value_counts(normalize = True)
# Presidio again!
# Create new feature columns of hour the call was received, and the day of the week it was received
dfc['hour'] = dfc['received_timestamp'].apply(lambda x: x.hour)
# FIX: Timestamp.weekday_name was deprecated in pandas 0.23 and removed in
# 1.0; day_name() returns the same weekday string (e.g. 'Monday').
dfc['day'] = dfc['received_timestamp'].apply(lambda x: x.day_name())
dfc.head()
"""
Function assign_day_portion
Assigns categorical values to hour call was received to get a new feature representing a chunk of the 24-hr day.
Args:
x: The hour the call was received
Returns: The portion of the day, divided into Night Hours, Morning to Late Afternoon, and Evening to Midnight,
in which the call was received.
"""
def assign_day_portion(x):
if (x >= 0 and x <= 7) or (x == 24): # 0:00 to 7:00 (AM) - Night Hours
return '12 AM to 7 AM'
if x >= 8 and x <= 15: # 8 AM to 3 PM - Morning to Late Afternoon
return '8 AM to 3 PM'
if x >= 16 and x < 24: # 4 PM to 11 PM - Evening to Midnight
return '4 PM to 11 PM'
# Apply the bucketing to create the day_portion feature
dfc['day_portion'] = dfc['hour'].apply(assign_day_portion)
dfc.head()
# Which hours had the highest proportion of emergency calls in the month of January?
dfc['hour'].value_counts(normalize = True)
dfc['day_portion'].value_counts()
# Most common call types for each time range
portions = ['8 AM to 3 PM', '4 PM to 11 PM' , '12 AM to 7 AM']
for portion in portions:
    print ('Most common call type for', portion)
    print (dfc[dfc['day_portion'] == portion]['call_type'].value_counts(normalize = True)[:3])
# How about for every hour?
for x in np.arange(0, 24, 1):
    print (x, dfc[dfc['hour'] == x]['call_type'].value_counts(normalize = True)[:2])
# Medical Incidents are the most common calls at every hour of the day...
# New feature - difference between when the operator dispatches the unit and when unit reports that it is en route
# (We can check if there is a time of day in which this is more sluggish?)
# NOTE(review): .seconds wraps negative/multi-day deltas — assumes response
# always follows dispatch within a day; TODO confirm.
dfc['dispatch_to_response'] = dfc['response_timestamp'] - dfc['dispatch_timestamp']
dfc['dispatch_to_response'] = dfc['dispatch_to_response'].apply(lambda x: x.seconds/60)
dfc['response_to_onscene'] = dfc['on_scene_timestamp'] - dfc['response_timestamp']
dfc['response_to_onscene'] = dfc['response_to_onscene'].apply(lambda x: x.seconds/60)
# Quick plot of the distribution
plt.hist(dfc['dispatch_to_response'].dropna(), alpha = .5, bins = np.arange(0, 5, .1), label='dispatch_to_response')
# Is there a time in the day officers are more sluggish in responding to dispatches?
# Let's see descriptive statistics for each day portion
dfc[dfc['day_portion'] == '8 AM to 3 PM']['dispatch_to_response'].describe()
dfc[dfc['day_portion'] == '4 PM to 11 PM']['dispatch_to_response'].describe()
dfc[dfc['day_portion'] == '12 AM to 7 AM']['dispatch_to_response'].describe()
# Plot distributions of two day portions to compare them
x = dfc[dfc['day_portion'] == '4 PM to 11 PM']['dispatch_to_response'].dropna()
y = dfc[dfc['day_portion'] == '12 AM to 7 AM']['dispatch_to_response'].dropna()
# z = dfc[dfc['day_portion'] == '8 AM to 3 PM']['dispatch_to_response'].dropna()
bins = np.arange(0, 5, .1)
plt.figure(figsize=(14,8))
# density=True normalizes each histogram so areas integrate to 1, making groups comparable
plt.hist(x, bins, alpha=0.3, label='4 PM to 11 PM', density = True)
plt.hist(y, bins, alpha=0.3, label='12 AM to 7 AM', density = True)
#pyplot.hist(z, bins, alpha=0.3, label='8 AM to 3 PM', density = True)
plt.legend(loc='upper right')
plt.title('Density of Dispatch to Response Time for Day Portions')
plt.xlabel('Minutes Taken to Respond to Dispatch Alert')
plt.ylabel('Density')
plt.savefig('time_dist.png')
plt.show()
# All three - might be difficult to distinguish between them
x = dfc[dfc['day_portion'] == '4 PM to 11 PM']['dispatch_to_response'].dropna()
x2 = dfc[dfc['day_portion'] == '12 AM to 7 AM']['dispatch_to_response'].dropna()
x3 = dfc[dfc['day_portion'] == '8 AM to 3 PM']['dispatch_to_response'].dropna()
bins = np.arange(0, 5, .1)
plt.figure(figsize=(13,8))
# FIX: the `normed` kwarg was removed from matplotlib (3.1); density=True is the equivalent.
n, bins, patches = plt.hist(x.dropna(), bins, density=True, facecolor='salmon', alpha=0.2,
                            label='4 PM to 11 PM', edgecolor=(1,1,1))
n, bins, patches = plt.hist(x2.dropna(), bins, density=True, facecolor='darkseagreen', alpha=0.2,
                            label='12 AM to 7 AM', edgecolor=(1,1,1))
# FIX: this histogram was mislabeled '12 AM to 7 AM'; x3 is the 8 AM to 3 PM subset.
n, bins, patches = plt.hist(x3.dropna(), bins, density=True, facecolor='lightblue', alpha=0.2,
                            label='8 AM to 3 PM', edgecolor=(1,1,1))
# mean and standard deviation calc's
mu, sigma = np.mean(x), np.std(x)
mu2, sigma2 = np.mean(x2), np.std(x2)
mu3, sigma3 = np.mean(x3), np.std(x3)
# FIX: mlab.normpdf was removed from matplotlib; compute the normal pdf with numpy.
def _norm_pdf(b, m, s):
    # Gaussian probability density evaluated at the bin edges b.
    return np.exp(-((b - m) ** 2) / (2 * s ** 2)) / (s * np.sqrt(2 * np.pi))
# fit and overlay a 'best fit' line by approximating normal distribution
y = _norm_pdf(bins, mu, sigma)
plt.plot(bins, y, 'r--')
y2 = _norm_pdf(bins, mu2, sigma2)
plt.plot(bins, y2, 'g--')
# y3 = _norm_pdf(bins, mu3, sigma3)
# plt.plot(bins, y3, 'b--')
plt.xlabel('Minutes Taken to Respond to Dispatch Alert', fontsize=17)
plt.ylabel('Probability', fontsize=18)
plt.title('Distribution of Minutes Taken to Respond to Dispatch Alert', fontsize=28)
red_patch = mpatches.Patch(color='salmon', label='4 PM to 11 PM')
green_patch = mpatches.Patch(color='darkseagreen', label='12 AM to 7 AM')
blue = mpatches.Patch(color='lightblue', label='8 AM to 3 PM')
plt.legend(handles=[red_patch, green_patch, blue], prop={'size': 16})
# tweak spacing to prevent clipping of ylabel and set axis ranges to include all values
plt.subplots_adjust(left=0.1)
#pylab.ylim([0,y.max()])
# pylab.xlim([x.min(),x.max()])
plt.show()
# Let's try just two that seem to have the most difference- late night and late evening, with new colors
x = dfc[dfc['day_portion'] == '4 PM to 11 PM']['dispatch_to_response'].dropna()
x2 = dfc[dfc['day_portion'] == '12 AM to 7 AM']['dispatch_to_response'].dropna()
bins = np.arange(0, 6, .08)
plt.figure(figsize=(16,8))
n, bins, patches = plt.hist(x.dropna(), bins, density = True, color='lightblue', alpha=0.3,
                            label='4 PM to 11 PM', edgecolor=(1,1,1))
n, bins, patches = plt.hist(x2.dropna(), bins, density = True, color='darkseagreen', alpha=0.3,
                            label='12 AM to 7 AM', edgecolor=(1,1,1))
# mean and standard deviation calc's
mu, sigma = np.mean(x), np.std(x)
mu2, sigma2 = np.mean(x2), np.std(x2)
# FIX: mlab.normpdf was removed from matplotlib; compute the normal pdf with numpy.
def _norm_pdf(b, m, s):
    # Gaussian probability density evaluated at the bin edges b.
    return np.exp(-((b - m) ** 2) / (2 * s ** 2)) / (s * np.sqrt(2 * np.pi))
# add a 'best fit' line
y = _norm_pdf(bins, mu, sigma)
plt.plot(bins, y, 'b--')
y2 = _norm_pdf(bins, mu2, sigma2)
plt.plot(bins, y2, 'g--')
plt.xlabel('Minutes Taken to Respond to Dispatch Alert', fontsize=17)
plt.ylabel('Density', fontsize=18)
plt.title('Distribution of Minutes Taken to Respond to Dispatch Alert', fontsize=28)
lightblue = mpatches.Patch(color='lightblue', label='4 PM to 11 PM')
green_patch = mpatches.Patch(color='darkseagreen', label='12 AM to 7 AM')
plt.legend(handles=[lightblue, green_patch], prop={'size': 16})
# tweak spacing to prevent clipping of ylabel and set axis ranges to include all values
plt.subplots_adjust(left=0.1)
#pylab.ylim([0,y.max()])
# pylab.xlim([x.min(),x.max()])
plt.savefig('time_dist.png')
plt.show()
Police are quickest during late evening hours (4 PM - 11 PM), in the middle in the early morning to late afternoon (8 AM to 3 PM), and slowest at late night hours between 12 AM and 7 AM.
# Distributions by day portion for response time with box and whisker plots
# Sort descending so the 10 largest values (extreme outliers) can be sliced off
temp = dfc.sort_values('dispatch_to_response', ascending = False)
temp = temp.iloc[10:] # drop extreme outliers (hurts to do, but boxplot is invisible with them!)
sns.set(rc={'figure.figsize':(14,10)})
sns.boxplot(x="day_portion", y="dispatch_to_response", data=temp)
plt.title('Box Plot of Dispatch to Response Time for Day Portions', fontsize = 20)
plt.xlabel('Day Portion', fontsize = 16)
plt.ylabel('Minutes Taken to Respond to Dispatch Alert', fontsize = 16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
#plt.savefig('assets/box.png')
# Which hour are emergency responders the slowest to reach on scene?
get_group_averages(dfc, 'hour', 'received_to_onscene')
get_sorted_bar(get_group_averages(dfc, 'hour', 'dispatch_to_response'), 'Average Dispatch to Response Time by Hour in Timestamp')
# Plot median response time for each hour, bars ordered by descending median
zips = get_group_median(dfc, 'hour', 'dispatch_to_response')
X = [x[0] for x in zips]
sns.set(rc={'figure.figsize':(26,12)})
# estimator=median makes seaborn aggregate each hour by its median; order=X keeps the sorted order
ax = sns.barplot(x="hour", y="dispatch_to_response", data=dfc, estimator=median, palette="summer",
order = X)
plt.title('Median Time to Respond to Dispatch Alert by Hour Call is Received', fontsize = 30)
plt.xlabel('Hour Call is Received (24-hr Time)', fontsize = 30)
plt.ylabel('Median Dispatch to Response Time', fontsize = 30)
plt.xticks(fontsize=30)
plt.yticks(fontsize=30)
plt.savefig('assets/hour_response.png')
# Quick check of distribution
plt.hist(dfc['response_to_onscene'].dropna(), alpha = .5, bins = np.arange(0, 5, .1), label='dispatch_to_response')
get_group_averages(dfc, 'day_portion', 'response_to_onscene')
# Plot all three distributions to check for trends
x = dfc[dfc['day_portion'] == '4 PM to 11 PM']['response_to_onscene'].dropna()
x2 = dfc[dfc['day_portion'] == '12 AM to 7 AM']['response_to_onscene'].dropna()
x3 = dfc[dfc['day_portion'] == '8 AM to 3 PM']['response_to_onscene'].dropna()
bins = np.arange(0, 5, .1)
plt.figure(figsize=(13,8))
# FIX: the `normed` kwarg was removed from matplotlib (3.1); density=True is the equivalent.
n, bins, patches = plt.hist(x.dropna(), bins, density=True, facecolor='salmon', alpha=0.2,
                            label='4 PM to 11 PM', edgecolor=(1,1,1))
n, bins, patches = plt.hist(x2.dropna(), bins, density=True, facecolor='darkseagreen', alpha=0.2,
                            label='12 AM to 7 AM', edgecolor=(1,1,1))
# FIX: this histogram was mislabeled '12 AM to 7 AM'; x3 is the 8 AM to 3 PM subset.
n, bins, patches = plt.hist(x3.dropna(), bins, density=True, facecolor='lightblue', alpha=0.2,
                            label='8 AM to 3 PM', edgecolor=(1,1,1))
# mean and standard deviation calc's
mu, sigma = np.mean(x), np.std(x)
mu2, sigma2 = np.mean(x2), np.std(x2)
mu3, sigma3 = np.mean(x3), np.std(x3)
plt.xlabel('Minutes Taken to Respond to Dispatch Alert', fontsize=17)
plt.ylabel('Probability', fontsize=18)
plt.title('Distribution of Minutes Taken to Respond to Dispatch Alert', fontsize=28)
red_patch = mpatches.Patch(color='salmon', label='4 PM to 11 PM')
green_patch = mpatches.Patch(color='darkseagreen', label='12 AM to 7 AM')
blue = mpatches.Patch(color='lightblue', label='8 AM to 3 PM')
plt.legend(handles=[red_patch, green_patch, blue], prop={'size': 16})
# tweak spacing to prevent clipping of ylabel and set axis ranges to include all values
plt.subplots_adjust(left=0.1)
#pylab.ylim([0,y.max()])
# pylab.xlim([x.min(),x.max()])
plt.show()
def value_count(df):
    """Horizontal bar chart of the 20 most common call_type_group values in df."""
    counts = df['call_type_group'].value_counts()
    counts[:20].plot(kind='barh', figsize=[10,6])
    plt.xlabel('Frequency', fontsize = 12)
    plt.title('Call Type Frequency')
    plt.ylabel('Call Type', fontsize = 12)
# Call-type distribution for the high-income downtown zipcode 94105
value_count(dfc[dfc.zipcode_of_incident == 94105])
# Let's use some outside data!
# These are python dictionaries generated by the script scrape_incomes.py
# They provide median income and population measurements for each zipcode in San Francisco
income_dict = {94127: 95313, 94105: 88976, 94123: 84710, 94130: 80959, 94131: 76044, 94114: 75727, 94129: 73571, 94116: 66627, 94117: 63983, 94121: 61776, 94118: 61609, 94107: 61362, 94122: 60733, 94112: 57629, 94111: 56569, 94132: 55000, 94115: 54879, 94134: 54342, 94110: 53795, 94109: 43444, 94133: 40990, 94124: 37146, 94108: 31542, 94103: 31131, 94102: 22351, 94104: 14609}
population_dict = {94127: 20624, 94105: 2058, 94123: 22903, 94130: 1453, 94131: 27897, 94114: 30574, 94129: 2228, 94116: 42958, 94117: 38738, 94121: 42473, 94118: 38939, 94107: 17368, 94122: 55492, 94112: 73104, 94111: 3335, 94132: 26291, 94115: 33115, 94134: 40134, 94110: 74633, 94109: 56322, 94133: 26827, 94124: 33170, 94108: 13716, 94103: 23016, 94102: 28991, 94104: 374}
# Function to read values from dictionary and put into right place in our dataframe
def read_dict(d, x):
    """Return d[x] when present, else NaN so pandas treats it as missing."""
    # dict.get with a NaN default replaces the manual membership check.
    return d.get(x, np.nan)
# Create income and population columns by mapping each incident's zipcode through the scraped tables
dfc['avg_income'] = dfc['zipcode_of_incident'].apply(lambda x: read_dict(income_dict, x))
dfc['population'] = dfc['zipcode_of_incident'].apply(lambda x: read_dict(population_dict, x))
dfc.head()
# Quick check of distribution
dfc['population'].hist()
dfc['avg_income'].hist()
dfc['avg_income'].describe()
# Assign lower, lower-middle, upper-middle, upper to income quartiles categorical variable for analysis by group
def assign_income_band(x):
    """Bucket a zipcode's average income into a $25k-wide band label.

    Args:
        x: average income for the incident's zipcode; may be NaN when the
           zipcode was absent from the scraped income table.

    Returns:
        A band label string, or NaN for missing income.
    """
    # BUG FIX: NaN fails every `<=` comparison, so missing incomes used to
    # fall through and be mislabeled as the top band. Keep them missing so
    # value_counts/groupby exclude them.
    if pd.isna(x):
        return np.nan
    if x <= 25000:
        return '0 - 25,000'
    if x <= 50000:
        return '25,000 - 50,000'
    if x <= 75000:
        return '50,000 - 75,000'
    return '75,000 - 100,000'
# Map each incident's zipcode income into its band
dfc['zip_income_band'] = dfc['avg_income'].apply(assign_income_band)
dfc.head()
# Check frequency proportion by income band
dfc['zip_income_band'].value_counts(normalize = True)
# Middle class seems to have most emergencies, *but* as seen in the distribution above, they're also the most common
# Barplot of income band frequency proportions
dfc['zip_income_band'].value_counts(normalize = True).plot(kind = 'barh')
# Check means and medians of response time per income band
print (get_group_averages(dfc, 'zip_income_band', 'received_to_onscene'))
print (get_group_median(dfc, 'zip_income_band', 'received_to_onscene'))
Yes, if we roughly equate emergency frequencies in January to likelihood.
Let's write an algorithm that goes through our DataFrame and pulls the proportional frequency for various call types (Medical incident, fire, etc.) for a given set of parameters, namely a zipcode or station area and a time of day or portion of day. If we get too specific, it will be hard to find enough instances to generalize, so we can keep it to these two parameters for now. For example, there may only be one emergency that occured on a Wednesday at 3 PM in zipcode 94127, so being too specific won't let us interpret our results as a likelihood.
The frequency in January is probably only roughly indicative of a likelihood given an address and time, but we do have 10,000 calls, so it's worth a shot.
The python dictionary generated from this can be used in Javascript to create a small functional part on our website which outputs the top most frequent (and thereby "likely") emergency types given an zipcode and time input by the user.
We need to do this without being too specific - zipcode and station areas have enough different values to be useful (box would be impractical because there is too many of them, whereas there's too few battalions). Thus these provide us with a balanced granularity in terms of location. As for time, we can try using both day portions (day divided into 7 hour periods) or hours directly.
# This is how we can subset the dataframe given a location and time
dfc[(dfc.zipcode_of_incident == 94129) & (dfc.day_portion == '4 PM to 11 PM')].head()
# This is how we can get frequencies given parameters
dfc[(dfc.zipcode_of_incident == 94124) & (dfc.day_portion == '12 AM to 7 AM')]['call_type'].value_counts(normalize = True)
# Most likely dispatches, given an address (zipcode) and a time
# Filters DataFrame with desired parameters and gets frequency proportions for type of emergency.
# For each zipcode, d maps to a 3-element list (one entry per day portion),
# each a string ranking the up-to-3 most frequent call types with their percentages.
d = {}
for zipcode in set(dfc['zipcode_of_incident']):
    times = []
    for day_portion in ['8 AM to 3 PM', '4 PM to 11 PM', '12 AM to 7 AM']:
        value_counts = dfc[(dfc.zipcode_of_incident == zipcode) & (dfc.day_portion == day_portion)]['call_type'].value_counts(normalize = True)
        # FIX: use .iloc for positional access (Series[pos] integer lookup is
        # deprecated/removed in newer pandas) and cap at the available number
        # of call types — the original raised IndexError on empty slices.
        ranked = []
        for i in range(min(3, len(value_counts))):
            ranked.append(str(i + 1) + '. ' + value_counts.index[i] + ' - ' + str(round(value_counts.iloc[i] * 100, 0)) + '%;')
        times.append(' '.join(ranked))
    d[zipcode] = times
print (d)
# Test for zipcode 94105 at Midnight (index 0 is the '8 AM to 3 PM' slot; see list order above)
print (d[94105][0])
# Top 2 call types within each income band
for x in set(dfc['zip_income_band']):
    print (x, '\n', dfc[dfc['zip_income_band'] == x]['call_type'].value_counts(normalize = True)[:2])
# Medical Incidents are the most common calls at every income band
# Frequency proportion for medical incidents by income quartile
dfc[dfc.call_type == 'Medical Incident'].zip_income_band.value_counts(normalize = True)
dfc[dfc.call_type == 'Alarms'].zip_income_band.value_counts(normalize = True)
# Which times of the day have the most emergencies?
dfc['day_portion'].value_counts()
# Distribution of received_to_onscene times- extreme outliers!
# FIX: sns.distplot was deprecated (seaborn 0.11) and later removed; histplot
# with a KDE overlay on the density scale is the documented replacement.
ax = sns.histplot(dfc['received_to_onscene'].dropna(), kde=True, stat="density")
# Barplot response time to each battalion
values = get_group_averages(dfc, 'battalion', 'received_to_onscene')
X = [x[0] for x in values]
sns.set(rc={'figure.figsize':(26,12)})
ax = sns.barplot(x="battalion", y="received_to_onscene", data=dfc, estimator=mean, palette="coolwarm_r",
order = X)
plt.title('Mean Received to On Scene Time by Battalion', fontsize = 22)
plt.xlabel('Battalion of Incident', fontsize = 22)
plt.ylabel('Mean Time in Minutes Needed to Arrive On-Scene After Call', fontsize = 22)
plt.xticks(fontsize=18)
#plt.savefig('assets/mean_battalion_response.png')
values = get_group_median(dfc, 'battalion', 'received_to_onscene')
X = [x[0] for x in values]
sns.set(rc={'figure.figsize':(26,12)})
ax = sns.barplot(x="battalion", y="received_to_onscene", data=dfc, estimator=median, palette="coolwarm_r",
order = X)
plt.title('Median Received to On Scene Time by Battalion', fontsize = 22)
plt.xlabel('Battalion of Incident', fontsize = 22)
plt.ylabel('Median Time in Minutes Needed to Arrive On-Scene After Call', fontsize = 22)
plt.xticks(fontsize=18)
#plt.savefig('assets/median_battalion_response.png')
values = get_group_averages(dfc, 'station_area', 'received_to_onscene')
X = [x[0] for x in values]
sns.set(rc={'figure.figsize':(26,12)})
ax = sns.barplot(x="station_area", y="received_to_onscene", data=dfc, estimator=mean, palette="GnBu_r",
order = X)
plt.title('Mean Received to On Scene Time by Station Area', fontsize = 22)
plt.xlabel('Station Area of Incident', fontsize = 22)
plt.ylabel('Mean Time in Minutes Needed to Arrive On-Scene After Call', fontsize = 22)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
#plt.savefig('assets/mean_sa_response.png')
# Median received-to-onscene time by station area, bars ranked by that median.
sa_medians = get_group_median(dfc, 'station_area', 'received_to_onscene')
X = [item[0] for item in sa_medians]
sns.set(rc={'figure.figsize': (26, 12)})
ax = sns.barplot(x="station_area", y="received_to_onscene", data=dfc,
                 estimator=median, palette="GnBu_r", order=X)
ax.set_title('Median Received to On Scene Time by Station Area', fontsize=22)
ax.set_xlabel('Station Area of Incident', fontsize=22)
ax.set_ylabel('Median Time in Minutes Needed to Arrive On-Scene After Call', fontsize=22)
ax.tick_params(labelsize=18)
#plt.savefig('assets/median_sa_response.png')
# Mean response time by zipcode, bars ranked by that mean.
zip_means = get_group_averages(dfc, 'zipcode_of_incident', 'received_to_onscene')
X = [item[0] for item in zip_means]
sns.set(rc={'figure.figsize': (26, 12)})
ax = sns.barplot(x="zipcode_of_incident", y="received_to_onscene", data=dfc,
                 estimator=mean, palette="Spectral", order=X)
ax.set_title('Mean Response Time by Zipcode', fontsize=30)
ax.set_xlabel('Zipcode of Incident', fontsize=28)
ax.set_ylabel('Mean Time in Minutes to Arrive On-Scene After Call', fontsize=26)
ax.tick_params(labelsize=18)
#plt.savefig('assets/mean_zipcode_response.png')
# Median response time by zipcode, bars ranked by that median.
zip_medians = get_group_median(dfc, 'zipcode_of_incident', 'received_to_onscene')
X = [item[0] for item in zip_medians]
sns.set(rc={'figure.figsize': (26, 12)})
ax = sns.barplot(x="zipcode_of_incident", y="received_to_onscene", data=dfc,
                 estimator=median, palette="winter", order=X)
ax.set_title('Median Response Time by Zipcode', fontsize=30)
ax.set_xlabel('Zipcode of Incident', fontsize=28)
ax.set_ylabel('Median Time in Minutes to Arrive On-Scene After Call', fontsize=26)
ax.tick_params(labelsize=18)
#plt.savefig('assets/median_zipcode_response.png')
# Mean received-to-onscene time by fire prevention district, ranked by that mean.
fpd_means = get_group_averages(dfc, 'fire_prevention_district', 'received_to_onscene')
X = [item[0] for item in fpd_means]
sns.set(rc={'figure.figsize': (26, 12)})
ax = sns.barplot(x="fire_prevention_district", y="received_to_onscene", data=dfc,
                 estimator=mean, palette="winter", order=X)
ax.set_title('Mean Received to On Scene Time by Fire Prevention District', fontsize=22)
ax.set_xlabel('Fire Prevention District of Incident', fontsize=22)
ax.set_ylabel('Mean Time in Minutes Needed to Arrive On-Scene After Call', fontsize=22)
ax.tick_params(axis='x', labelsize=18)
#plt.savefig('assets/mean_fpd_response.png')
# Median received-to-onscene time by fire prevention district, ranked by that median.
fpd_medians = get_group_median(dfc, 'fire_prevention_district', 'received_to_onscene')
X = [item[0] for item in fpd_medians]
sns.set(rc={'figure.figsize': (26, 12)})
ax = sns.barplot(x="fire_prevention_district", y="received_to_onscene", data=dfc,
                 estimator=median, palette="winter", order=X)
ax.set_title('Median Received to On Scene Time by Fire Prevention District', fontsize=22)
ax.set_xlabel('Fire Prevention District of Incident', fontsize=22)
ax.set_ylabel('Median Time in Minutes Needed to Arrive On-Scene After Call', fontsize=22)
ax.tick_params(axis='x', labelsize=18)
#plt.savefig('assets/median_fpd_response.png')
# Mean received-to-onscene time by supervisor district, ranked by that mean.
sd_means = get_group_averages(dfc, 'supervisor_district', 'received_to_onscene')
X = [item[0] for item in sd_means]
sns.set(rc={'figure.figsize': (26, 12)})
ax = sns.barplot(x="supervisor_district", y="received_to_onscene", data=dfc,
                 estimator=mean, palette="rocket", order=X)
ax.set_title('Mean Received to On Scene Time by Supervisor District', fontsize=22)
ax.set_xlabel('Supervisor District of Incident', fontsize=22)
ax.set_ylabel('Mean Time in Minutes Needed to Arrive On-Scene After Call', fontsize=22)
ax.tick_params(axis='x', labelsize=18)
#plt.savefig('assets/mean_sd_response.png')
# Median received-to-onscene time by supervisor district, ranked by that median.
sd_medians = get_group_median(dfc, 'supervisor_district', 'received_to_onscene')
X = [item[0] for item in sd_medians]
sns.set(rc={'figure.figsize': (26, 12)})
ax = sns.barplot(x="supervisor_district", y="received_to_onscene", data=dfc,
                 estimator=median, palette="rocket", order=X)
ax.set_title('Median Received to On Scene Time by Supervisor District', fontsize=22)
ax.set_xlabel('Supervisor District of Incident', fontsize=22)
ax.set_ylabel('Median Time in Minutes Needed to Arrive On-Scene After Call', fontsize=22)
ax.tick_params(axis='x', labelsize=18)
#plt.savefig('assets/median_sd_response.png')
# Median received-to-onscene time by city, ranked by that median.
city_medians = get_group_median(dfc, 'city', 'received_to_onscene')
X = [item[0] for item in city_medians]
sns.set(rc={'figure.figsize': (26, 12)})
ax = sns.barplot(x="city", y="received_to_onscene", data=dfc,
                 estimator=median, palette="rocket", order=X)
ax.set_title('Median Received to On Scene Time by City', fontsize=22)
ax.set_xlabel('City of Incident', fontsize=22)
ax.set_ylabel('Median Time in Minutes Needed to Arrive On-Scene After Call', fontsize=22)
ax.tick_params(axis='x', labelsize=18)
#plt.savefig('assets/city_response.png')
# Median received-to-onscene time by income quartile.
# FIX: the bar order was previously computed with get_group_averages (means)
# while the bars themselves plot medians (estimator=median) and the labels say
# "Median" — rank by the median so the ordering matches the displayed statistic,
# consistent with every other median plot above.
incomes = get_group_median(dfc, 'zip_income_band', 'received_to_onscene')
X = [item[0] for item in incomes]
sns.set(rc={'figure.figsize': (15, 10)})
ax = sns.barplot(x="zip_income_band", y="received_to_onscene", data=dfc,
                 estimator=median, palette="Greens_r", order=X)
plt.title('Median Time Needed to Arrive On-Scene by Income Quartile', fontsize=20)
plt.xlabel('Income Quartile', fontsize=18)
plt.ylabel('Median Minutes Needed to Arrive On-Scene After Call', fontsize=18)
plt.xticks(fontsize=20)
#plt.savefig('assets/incomebands.png')
# Median dispatch-to-response time by income quartile, ranked by that median.
income_medians = get_group_median(dfc, 'zip_income_band', 'dispatch_to_response')
X = [item[0] for item in income_medians]
sns.set(rc={'figure.figsize': (12, 8)})
ax = sns.barplot(x="zip_income_band", y="dispatch_to_response", data=dfc,
                 estimator=median, palette="Blues_d", order=X)
ax.set_title('Median Time Needed to Respond to Dispatches by Income Quartile', fontsize=20)
ax.set_xlabel('Income Quartile', fontsize=18)
ax.set_ylabel('Minutes passed before en route after notified of dispatch', fontsize=16)
ax.tick_params(axis='x', labelsize=14)
#plt.savefig('assets/incomebands2.png')
# Emergency frequency by zipcode, most frequent first.
# FIX: the bar order was computed from the raw df while the counts are plotted
# from dfc — count the same frame we plot so the ordering always matches.
zip_counts = Counter(dfc['zipcode_of_incident'])
X = sorted(zip_counts, key=zip_counts.get, reverse=True)
sns.set(rc={'figure.figsize': (26, 12)})
sns.countplot(x="zipcode_of_incident", data=dfc, palette="GnBu_r", order=X)
plt.title('Emergency Frequency by Zipcode', fontsize=30)
plt.xlabel('Zipcode of Incident', fontsize=20)
plt.ylabel('Count', fontsize=20)
plt.xticks(fontsize=17)
plt.yticks(fontsize=18)
#plt.savefig('assets/zip_counts.png')
# Call-type frequency, most common type first.
# FIX: the category order was computed from the raw df while the counts are
# plotted from dfc — count the same frame we plot so the ordering always matches.
type_counts = Counter(dfc['call_type'])
X = sorted(type_counts, key=type_counts.get, reverse=True)
sns.set(rc={'figure.figsize': (10, 6)})
sns.countplot(y="call_type", data=dfc, palette="Set3", order=X)
plt.title('Call Type Frequency', fontsize=20)
plt.xlabel('Count', fontsize=16)
plt.ylabel('Type of Emergency', fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
#plt.savefig('assets/ctt_counts.png')
# How busy is each hour of the day?  Count calls per received-hour, busiest first.
hour_counts = Counter(dfc['hour'])
ranked_hours = sorted(hour_counts, key=hour_counts.get, reverse=True)
sns.set(rc={'figure.figsize': (14, 8)})
ax = sns.countplot(x="hour", data=dfc, palette="Blues_r", order=ranked_hours)
ax.set_title('Hour of Call Received Frequency', fontsize=20)
ax.set_xlabel('Hour (24 hr-time) Call is Received', fontsize=16)
ax.set_ylabel('Count', fontsize=16)
ax.tick_params(labelsize=16)
#plt.savefig('assets/hour_counts.png')
# Sort descending by dispatch_to_response and drop the ten largest values:
# extreme outliers squash the box plot into invisibility.
temp = dfc.sort_values('dispatch_to_response', ascending=False).iloc[10:]
# Distribution of response time by Income Quartile
sns.set(rc={'figure.figsize': (14, 8)})
quartile_order = ['75,000 - 100,000', '50,000 - 75,000', '25,000 - 50,000', '0 - 25,000']
sns.boxplot(x="zip_income_band", y="dispatch_to_response", data=temp, order=quartile_order)
plt.title('Box Plot of Dispatch to Response Time for Income Quartiles', fontsize=20)
plt.xlabel('Income Quartile', fontsize=16)
plt.ylabel('Minutes Taken to Respond to Dispatch Alert', fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('assets/box_income.png')
# Emergency Frequency by Station Area
# FIX: the bar order was computed from the raw df while the counts are plotted
# from dfc — count the same frame we plot so the ordering always matches.
sa_counts = Counter(dfc['station_area'])
X = sorted(sa_counts, key=sa_counts.get, reverse=True)
sns.set(rc={'figure.figsize': (18, 8)})
sns.countplot(x="station_area", data=dfc, palette="GnBu_r", order=X)
plt.title('Emergency Frequency by Station Area', fontsize=20)
plt.xlabel('Station Area of Incident', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.xticks(fontsize=11)
plt.yticks(fontsize=16)
#plt.savefig('assets/sa.png')
# Outcome (final disposition) counts for incidents in zipcode 94103.
dfc[dfc['zipcode_of_incident'] == 94103]['call_final_disposition'].value_counts()
# Which zipcodes have the lowest amount of transportation incidents (vehicle fires, collisions, train accidents)
# Idiom: .isin replaces the chained `|` equality comparisons.
transport_call_types = ['Vehicle Fire', 'Traffic Collision', 'Train / Rail Incident']
dfc[dfc['call_type'].isin(transport_call_types)]['zipcode_of_incident'].value_counts()
Now let's try using the specific hour instead of the day portion, and station areas instead of zipcodes.
# Most likely dispatches, given an address (zipcode) and a specific hour
def _top_k_summary(value_counts, k=3):
    # Format up to the top-k entries of a normalized value_counts Series as a
    # ranked string, e.g. "1. Medical Incident - 80.0%; 2. Alarms - 10.0%;".
    parts = []
    for rank, (label, frac) in enumerate(value_counts.iloc[:k].items(), start=1):
        parts.append('%d. %s - %s%%;' % (rank, label, round(frac * 100, 0)))
    return ' '.join(parts)

d = {}
for zipcode in set(dfc['zipcode_of_incident']):
    times = []
    for hour in range(24):
        # Normalized call-type frequencies for this zipcode/hour slice.
        value_counts = dfc[(dfc.zipcode_of_incident == zipcode)
                           & (dfc.hour == hour)]['call_type'].value_counts(normalize=True)
        # FIX: the original wrapped deprecated positional indexing
        # (value_counts[0]) in a bare `except:` to detect empty slices;
        # test for emptiness explicitly and use .iloc for positional access.
        if value_counts.empty:
            times.append('Unknown')
        else:
            times.append(_top_k_summary(value_counts))
    d[zipcode] = times
print(d)
# Most likely dispatches, given a station area and a portion of the day.
# (FIX: the original comment said "zipcode" and "time"; the loop actually
# groups by station_area and day_portion.)
def _top_k_summary(value_counts, k=3):
    # Format up to the top-k entries of a normalized value_counts Series as a
    # ranked string, e.g. "1. Medical Incident - 80.0%; 2. Alarms - 10.0%;".
    parts = []
    for rank, (label, frac) in enumerate(value_counts.iloc[:k].items(), start=1):
        parts.append('%d. %s - %s%%;' % (rank, label, round(frac * 100, 0)))
    return ' '.join(parts)

d = {}
for sa in set(dfc['station_area']):
    times = []
    for day_portion in ['8 AM to 3 PM', '4 PM to 11 PM', '12 AM to 7 AM']:
        # Normalized call-type frequencies for this station-area/day-portion slice.
        value_counts = dfc[(dfc.station_area == sa)
                           & (dfc.day_portion == day_portion)]['call_type'].value_counts(normalize=True)
        # FIX: replaced bare `except:` + positional Series[int] indexing with an
        # explicit emptiness check and .iloc-based access.
        if value_counts.empty:
            times.append('Unknown')
        else:
            times.append(_top_k_summary(value_counts))
    d[sa] = times
print(d)
# Let's calculate which areas are experiencing the greatest increase in dispatch calls
# Tag each row with its day-of-year so two sample days can be compared.
dfc['dayofyear'] = dfc['received_timestamp'].apply(lambda ts: ts.dayofyear)
set(dfc['dayofyear'])
df_13 = dfc[dfc['dayofyear'] == 13]
df_24 = dfc[dfc['dayofyear'] == 24]
df_13['zipcode_of_incident'].value_counts()
df_24['zipcode_of_incident'].value_counts()
# Normalized call-type mix for station area 3.
dfc[dfc['station_area'] == 3]['call_type'].value_counts(normalize = True)
# Normalized call-type mix for zipcode 94102.
dfc[dfc['zipcode_of_incident'] == 94102]['call_type'].value_counts(normalize = True)
# NOTE(review): duplicate of the query directly above — likely a repeated notebook cell.
dfc[dfc['zipcode_of_incident'] == 94102]['call_type'].value_counts(normalize = True)
# Baseline call-type mix across the whole dataset, for comparison.
dfc['call_type'].value_counts(normalize = True)
# Share of all structure fires occurring in each zipcode.
dfc[dfc['call_type'] == 'Structure Fire']['zipcode_of_incident'].value_counts(normalize = True)
# Normalized call-type mix for zipcode 94110.
dfc[dfc['zipcode_of_incident'] == 94110]['call_type'].value_counts(normalize = True)
# Raw call-type counts across the whole dataset.
dfc['call_type'].value_counts()
# Share of all gas-leak calls occurring in each zipcode.
dfc[dfc['call_type'] == 'Gas Leak (Natural and LP Gases)']['zipcode_of_incident'].value_counts(normalize = True)
# Most likely dispatches, given an address (zipcode) and a specific hour
# NOTE(review): this cell repeats the zipcode/hour summary computed earlier.
def _top_k_summary(value_counts, k=3):
    # Format up to the top-k entries of a normalized value_counts Series as a
    # ranked string, e.g. "1. Medical Incident - 80.0%; 2. Alarms - 10.0%;".
    parts = []
    for rank, (label, frac) in enumerate(value_counts.iloc[:k].items(), start=1):
        parts.append('%d. %s - %s%%;' % (rank, label, round(frac * 100, 0)))
    return ' '.join(parts)

d = {}
for zipcode in set(dfc['zipcode_of_incident']):
    times = []
    for hour in range(24):
        value_counts = dfc[(dfc.zipcode_of_incident == zipcode)
                           & (dfc.hour == hour)]['call_type'].value_counts(normalize=True)
        # FIX: replaced bare `except:` + positional Series[int] indexing with an
        # explicit emptiness check and .iloc-based access.
        if value_counts.empty:
            times.append('Unknown')
        else:
            times.append(_top_k_summary(value_counts))
    d[zipcode] = times
print(d)
# Raw zipcode counts for structure fires.
dfc[dfc['call_type'] == 'Structure Fire']['zipcode_of_incident'].value_counts()
# Most likely dispatches, given an address (zipcode) and a specific hour, with UNIT TYPE instead.
def _top_k_summary(value_counts, k=3):
    # Format up to the top-k entries of a normalized value_counts Series as a
    # ranked string, e.g. "1. MEDIC - 80.0%; 2. ENGINE - 10.0%;".
    parts = []
    for rank, (label, frac) in enumerate(value_counts.iloc[:k].items(), start=1):
        parts.append('%d. %s - %s%%;' % (rank, label, round(frac * 100, 0)))
    return ' '.join(parts)

d = {}
for zipcode in set(dfc['zipcode_of_incident']):
    times = []
    for hour in range(24):
        # Normalized unit-type frequencies for this zipcode/hour slice.
        value_counts = dfc[(dfc.zipcode_of_incident == zipcode)
                           & (dfc.hour == hour)]['unit_type'].value_counts(normalize=True)
        # FIX: replaced bare `except:` + positional Series[int] indexing with an
        # explicit emptiness check and .iloc-based access.
        if value_counts.empty:
            times.append('Unknown')
        else:
            times.append(_top_k_summary(value_counts))
    d[zipcode] = times
print(d)