Baltimore Crime Visualization

Ayokiitan Akala

In [1]:
import folium
import requests
import pandas
from functools import reduce
import numpy as np
from folium.plugins import HeatMap
import random

# Pull data from online csv file
arrest_table = pandas.read_csv("http://www.hcbravo.org/IntroDataSci/misc/BPD_Arrests.csv")

# Tidy up the data to make EDA and visualization easier.
# The contents of the "sex" and "race" columns are exchanged; the labels keep
# their positions. The right-hand side is evaluated in full before either
# assignment happens, so no temporary columns are needed.
arrest_table["sex"], arrest_table["race"] = (
    arrest_table["race"].copy(),
    arrest_table["sex"].copy(),
)

# Keep only rows that have a location; .copy() makes the result an independent
# frame so the column assignments below do not write into a slice.
arrest_table = arrest_table[arrest_table["Location 1"].notna()].copy()

# "Location 1" is split on the comma into two string columns (0 and 1).
# expand=True replaces iterating over the .str accessor, and regex=False makes
# the parentheses literal characters regardless of the pandas version.
location_parts = arrest_table["Location 1"].str.split(",", expand=True)
arrest_table["lat"] = location_parts[0].str.replace("(", "", regex=False).astype(float)
arrest_table["long"] = location_parts[1].str.replace(")", "", regex=False).astype(float)

arrest_table.head()
Out[1]:
arrest age sex race arrestDate arrestTime arrestLocation incidentOffense incidentLocation charge chargeDescription district post neighborhood Location 1 lat long
1 11127013.0 37 M B 01/01/2011 00:01 2000 Wilkens Ave 79-Other Wilkens Av & S Payson St 1 1425 Reckless Endangerment || Hand Gun Violation SOUTHERN 934.0 Carrollton Ridge (39.2814026274, -76.6483635135) 39.281403 -76.648364
2 11126887.0 46 M B 01/01/2011 00:01 2800 Mayfield Ave Unknown Offense NaN NaN Unknown Charge NORTHEASTERN 415.0 Belair-Edison (39.3227699160, -76.5735750473) 39.322770 -76.573575
3 11126873.0 50 M B 01/01/2011 00:04 2100 Ashburton St 79-Other 2100 Ashburton St 1 1106 Reg Firearm:Illegal Possession || Hgv WESTERN 735.0 Panway/Braddish Avenue (39.3117196723, -76.6623546313) 39.311720 -76.662355
4 11126968.0 33 M B 01/01/2011 00:05 4000 Wilsby Ave Unknown Offense 1700 Aliceanna St NaN Unknown Charge NORTHERN 525.0 Pen Lucy (39.3382885254, -76.6045667070) 39.338289 -76.604567
5 11127041.0 41 M B 01/01/2011 00:05 2900 Spellman Rd 81-Recovered Property 2900 Spelman Rd 1 1425 Reckless Endangerment || Handgun Violation SOUTHERN 924.0 Cherry Hill (39.2449886230, -76.6273582432) 39.244989 -76.627358
In [2]:
# Count arrests per district. Reading the "district" column by label avoids
# building a Series per row and does not depend on the column's position.
# Keys are inserted in order of first appearance, which fixes the order of
# `districts` below.
m = {}
for district in arrest_table["district"]:
    m[district] = m.get(district, 0) + 1

# take a random sample of 500 arrests for the heat map
random_sample = arrest_table.sample(n=500)

# Scale the counts to per-district sample sizes:
#   1. truncate each count to whole thousands,
#   2. turn each into its share of `sample_size`, truncated to an int
#      (so the shares sum to at most `sample_size`).
sample_size = 150
m = {k: v // 1000 for k, v in m.items()}
count = sum(m.values())
m = {k: int((v / count) * sample_size) for k, v in m.items()}

# District names with the last-seen key removed.
# NOTE(review): presumably the last key is a placeholder/missing district;
# confirm against the data.
d_names = list(m.keys())
d_names.pop()

# One dataframe per key of `m`, in the same order as the keys.
districts = [arrest_table.loc[arrest_table['district'] == k] for k in m]

# preview the dataframe of the first district encountered
districts[0].head()
Out[2]:
arrest age sex race arrestDate arrestTime arrestLocation incidentOffense incidentLocation charge chargeDescription district post neighborhood Location 1 lat long
1 11127013.0 37 M B 01/01/2011 00:01 2000 Wilkens Ave 79-Other Wilkens Av & S Payson St 1 1425 Reckless Endangerment || Hand Gun Violation SOUTHERN 934.0 Carrollton Ridge (39.2814026274, -76.6483635135) 39.281403 -76.648364
5 11127041.0 41 M B 01/01/2011 00:05 2900 Spellman Rd 81-Recovered Property 2900 Spelman Rd 1 1425 Reckless Endangerment || Handgun Violation SOUTHERN 924.0 Cherry Hill (39.2449886230, -76.6273582432) 39.244989 -76.627358
30 11126920.0 32 M W 01/01/2011 01:23 1100 S Charles St 79-Other 1100 S Charles St 2 2220 Trespass: Private Property || Trespassing SOUTHERN 942.0 Federal Hill (39.2763729980, -76.6142946733) 39.276373 -76.614295
59 11127217.0 44 F B 01/01/2011 15:45 2200 Annapolis Rd 4E-Common Assault 2200 Annapolis Rd 1 1415 Asslt-Sec Degree || Assault SOUTHERN 922.0 Westport (39.2638662578, -76.6335084536) 39.263866 -76.633508
60 11127123.0 43 M B 01/01/2011 15:45 2200 Annapolis Rd 4E-Common Assault 2200 Annapolis Rd 1 4200 Alc. Bev./Intox:Endanger || Assault SOUTHERN 922.0 Westport (39.2638662578, -76.6335084536) 39.263866 -76.633508
In [3]:
# Draw each district's quota of marker rows from that district's own frame.
# (Sampling from the whole table would ignore the per-district quota's purpose.)
# A non-zero quota in `m` implies the district had at least 1000 rows, so the
# request never exceeds the frame size. The last frame is skipped, as before.
sample = []
for district_df in districts[:-1]:
    name = district_df['district'].iloc[0]
    sample.append(district_df.sample(n=m[name]))


# Tally offenses per district for the marker popups:
# crime_stats[district][offense] -> number of arrests.
# Columns are read by label rather than by position in the row.
crime_stats = {}
for district_df in districts:
    pairs = zip(district_df['district'], district_df['incidentOffense'])
    for name, offense in pairs:
        offense_counts = crime_stats.setdefault(name, {})
        offense_counts[offense] = offense_counts.get(offense, 0) + 1

# Select the top 5 offenses per district and build an HTML line for each:
# "<offense>: <count> <b>(<percent>%)</b>", percent truncated to an int.
# Only the five highest-ranked offenses are formatted.
top_five = {}
for name, offense_counts in crime_stats.items():
    total_dist = sum(offense_counts.values())
    ranked = sorted(offense_counts, key=offense_counts.get, reverse=True)[:5]
    top_five[name] = [
        "{}: {} <b>({}%)</b>".format(
            offense,
            offense_counts[offense],
            int(offense_counts[offense] / total_dist * 100),
        )
        for offense in ranked
    ]
In [4]:
# NOTE(review): the 'stamentoner' tile set may not be bundled with newer
# folium releases; confirm against the installed version.
map_osm = folium.Map(location=[39.29, -76.61], zoom_start=12, tiles='stamentoner')
sex_legend = {'M': 'black', 'F': "#EBD7FA"}
unknown_sex_color = 'gray'  # fill used when `sex` is missing or not in the legend


# Gender overlay: one circle per sampled arrest, filled according to sex, with
# a popup listing age, sex, race and arrest date.
popup_template = "<b>Age:</b>  %s</br><b>Sex:</b> %s</br><b>Race:</b> %s</br><b>Arrest Date:</b> %s"
for s in sample:
    # Read the needed columns by label instead of by position in the row.
    rows = zip(s['lat'], s['long'], s['age'], s['sex'], s['race'], s['arrestDate'])
    for lat, lon, age, sex, race, arrest_date in rows:
        # No `icon=` here: the circle is styled through its path options.
        folium.CircleMarker(
            location=[lat, lon],
            radius=8,
            popup=popup_template % (age, sex, race, arrest_date),
            color='black',
            fill=True,
            fill_color=sex_legend.get(sex, unknown_sex_color),
            fill_opacity=.65,
            weight=.9).add_to(map_osm)

# District overlay: one pin per district (the last frame is skipped), placed at
# the mean latitude/longitude of its arrests, with the top-five list as popup.
colors = {0:'blue', 1:'red', 2:'green', 3:'purple', 4:'darkpurple', 5:'black', 6:'lightgray', 7:'orange', 8:'lightgreen'}

for d, district_df in enumerate(districts[:-1]):
    name = district_df['district'].iloc[0]
    description = "<b>" + name + "</b></br>" + "</br>".join(top_five[name])
    coords = [np.average(district_df['lat']), np.average(district_df['long'])]
    folium.Marker(coords,
                  popup=description,
                  icon=folium.Icon(
                      color=colors[d])).add_to(map_osm)


# Heatmap overlay: [lat, long] pairs of the 500-row random sample.
data = random_sample[['lat', 'long']].values.tolist()

HeatMap(data).add_to(map_osm)
map_osm
Out[4]:

Analysis:

I realized very early on, through brute force and multiple Piazza posts, that I could not blindly use all of the data made available to us in the arrest_table dataframe; it was way too much for my computer to process and run computations on. To fix this bottleneck I employed several techniques that we discussed in Professor Amol's and Professor John's lectures. The solution I adopted pulls random samples of crime data from the 9 different districts found in Baltimore City. I amortized the cost of the computations by iterating over the whole dataset once, counting the number of occurrences of each of the 9 districts, and then scaling the counts down by a factor of 1000, functionally truncating them by 3 places (in essence, if I had a count of 7615 I would do the integer division $7615//1000$ to yield 7). Now that I had smaller, more manageable numbers to work with, I scaled them again by my sample size of 150 so that the sum over all the districts is roughly 150. After this I make 9 distinct dataframes containing random instances of each district present in the original dataframe; I only take n of them from the original dataframe, where n is the number I computed after scaling by my sample size. Now we have more usable data, which makes visualization much easier.

For the Folium map I incorporated 3 features into the map interface: a heat map to visualize where most of the crime happens; district markers, which are placed by taking the average of all the latitude and longitude coordinates in a district and which also carry stats on that district; and random instances of crime that I grab from my 9 distinct random dataframes. Starting with the heat map: I made this by looking at multiple examples of this plugin for the folium map. I first gathered a large enough sample from the original dataframe, in my case 500 rows. More than 500 would make my heatmap too dense, yield a lot of red zones (high crime activity), and make the computation time too high; fewer than 500 would yield a lot of discontinuous blue clusters and would not give enough information. I then integrated this data by overlaying it on our original map. I had to change the default tile that the folium map provides because its colors overpower the heat map and render it useless. The tile I use is called “stamentoner”, which basically makes the map black and white and lets any color overlay stand out quite nicely; I also like it because it looks better when you try to identify smaller details in the map. The way I overlaid the district data was by taking the average of all the coordinates. This method worked quite nicely, as it preserved the general cardinal locations of the different districts (the districts are labeled according to their locations). I then mark the districts I computed with the native marker and color them differently to make the interface look as if it discretizes the zones. I also added information gathered from our dataframe that tallies the top 5 offenses of that specific district and computes their percentages; glancing over our data, it seems that the Baltimore area's top crime offense is Narcotics, with the police code 87.
It was interesting to make this because the only text that the marker popup takes is HTML. Finally, the Gender overlay just discretizes our data from the 9 different dataframes into crimes by males (black marker) and females (white marker). I also give more insight into the individual by providing personal, non-identifying information: the race, the age, and the arrest date.