Fatal Traffic Accidents in the US using BigQuery

Feb 19th, 2018

In [1]:
import bq_helper as bq
us_traffic = bq.BigQueryHelper(active_project="bigquery-public-data",
                                   dataset_name="nhtsa_traffic_fatalities")

How many fatal traffic accidents were reported in each state?

In [2]:
query = """SELECT state_name AS State,
                  COUNT(consecutive_number) AS Accidents
            FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2016`
            GROUP BY state_name
            ORDER BY accidents DESC
            """
In [3]:
state_accidents = us_traffic.query_to_pandas(query)
state_accidents.head()
Out[3]:
State Accidents
0 Texas 3407
1 California 3357
2 Florida 2933
3 Georgia 1422
4 North Carolina 1348
In [4]:
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import iplot, init_notebook_mode
In [5]:
plt.style.use('ggplot')
In [6]:
states_dict = {
         'Alaska':'AK',
         'Alabama':'AL',
         'Arkansas':'AR',
         'Arizona':'AZ',
         'California':'CA',
         'Colorado':'CO',
         'Connecticut':'CT',
         'District of Columbia':'DC',
         'Delaware':'DE',
         'Florida':'FL',
         'Georgia':'GA',
         'Hawaii':'HI',
         'Iowa':'IA',
         'Idaho':'ID',
         'Illinois':'IL',
         'Indiana':'IN',
         'Kansas':'KS',
         'Kentucky':'KY',
         'Louisiana':'LA',
         'Massachusetts':'MA',
         'Maryland':'MD',
         'Maine':'ME',
         'Michigan':'MI',
         'Minnesota':'MN',
         'Missouri':'MO',
         'Mississippi':'MS',
         'Montana':'MT',
         'North Carolina':'NC',
         'North Dakota':'ND',
         'Nebraska':'NE',
         'New Hampshire':'NH',
         'New Jersey':'NJ',
         'New Mexico':'NM',
         'Nevada':'NV',
         'New York':'NY',
         'Ohio':'OH',
         'Oklahoma':'OK',
         'Oregon':'OR',
         'Pennsylvania':'PA',
         'Puerto Rico':'PR',
         'Rhode Island':'RI',
         'South Carolina':'SC',
         'South Dakota':'SD',
         'Tennessee':'TN',
         'Texas':'TX',
         'Utah':'UT',
         'Virginia':'VA',
         'Vermont':'VT',
         'Washington':'WA',
         'Wisconsin':'WI',
         'West Virginia':'WV',
         'Wyoming':'WY'
};
In [7]:
state_accidents['State'].replace(states_dict,inplace = True);
In [8]:
init_notebook_mode()
# plotly code for choropleth map
scale = [[0, 'rgb(220, 255, 178)'],[1, 'rgb(81, 127, 25)']]
data = [ dict(
        type = 'choropleth',
        colorscale = scale,
        autocolorscale = False,
        showscale = False,
        locations = state_accidents['State'],
        z = state_accidents['Accidents'],
        locationmode = 'USA-states',
        marker = dict(
            line = dict (
                color = 'rgb(255, 255, 255)',
                width = 2
            ) ),
        ) ]
layout = dict(
        title = 'Fatal Traffic Accidents in United States (2016)',
        geo = dict(
            scope = 'usa',
            projection = dict( type='albers usa' ),
            showlakes = True,
            lakecolor = 'rgb(255, 255, 255)',
            countrycolor = 'rgb(255, 255, 255)')        
             )
 
figure = dict(data=data, layout=layout)
iplot(figure)

What day of the week do most of the accidents occur?

In [9]:
query2 = """SELECT EXTRACT(DAYOFWEEK FROM timestamp_of_crash) AS Day_of_week,
                  CASE 
                    WHEN EXTRACT(DAYOFWEEK FROM timestamp_of_crash) = 1 THEN 'Sunday'
                    WHEN EXTRACT(DAYOFWEEK FROM timestamp_of_crash) = 2 THEN 'Monday'
                    WHEN EXTRACT(DAYOFWEEK FROM timestamp_of_crash) = 3 THEN 'Tuesday'
                    WHEN EXTRACT(DAYOFWEEK FROM timestamp_of_crash) = 4 THEN 'Wednesday'
                    WHEN EXTRACT(DAYOFWEEK FROM timestamp_of_crash) = 5 THEN 'Thursday'
                    WHEN EXTRACT(DAYOFWEEK FROM timestamp_of_crash) = 6 THEN 'Friday'
                    WHEN EXTRACT(DAYOFWEEK FROM timestamp_of_crash) = 7 THEN 'Saturday'
                  END Weekday,
                  COUNT(consecutive_number) AS Accidents
           FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2016`
           GROUP BY Day_of_week,Weekday
           ORDER BY Accidents DESC
"""
In [10]:
day_of_week_accidents = us_traffic.query_to_pandas(query2)
day_of_week_accidents
Out[10]:
Day_of_week Weekday Accidents
0 7 Saturday 6104
1 6 Friday 5352
2 1 Sunday 5303
3 5 Thursday 4662
4 2 Monday 4501
5 4 Wednesday 4388
6 3 Tuesday 4129
In [11]:
day_of_week_accidents.sort_values('Day_of_week').plot.bar('Weekday','Accidents')
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb9712984a8>

What times in the day is the worst for these traffic accidents?

In [12]:
query3 = """SELECT COUNT(consecutive_number) AS Accidents,
                   EXTRACT(HOUR FROM timestamp_of_crash) AS Hour
           FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2016`
           GROUP BY Hour
           ORDER BY Accidents DESC
"""
In [13]:
hour_accidents = us_traffic.query_to_pandas(query3)
hour_accidents
Out[13]:
Accidents Hour
0 1984 18
1 1919 20
2 1913 17
3 1902 19
4 1808 16
5 1801 21
6 1731 15
7 1650 22
8 1612 0
9 1567 14
10 1514 13
11 1483 23
12 1353 2
13 1314 12
14 1287 6
15 1264 1
16 1228 11
17 1142 5
18 1139 7
19 1021 3
20 1016 10
21 989 9
22 961 8
23 841 4
In [14]:
hour_accidents.sort_values('Hour').plot.bar('Hour','Accidents')
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb971298828>