Fatal Traffic Accidents in the US using BigQuery¶

Feb 19th, 2018

In [1]:

import bq_helper as bq
us_traffic = bq.BigQueryHelper(active_project="bigquery-public-data",
                                   dataset_name="nhtsa_traffic_fatalities")

How many fatal traffic accidents were reported in each state?¶

In [2]:

query = """SELECT state_name AS State,
                  COUNT(consecutive_number) AS Accidents
            FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2016`
            GROUP BY state_name
            ORDER BY accidents DESC
            """

In [3]:

state_accidents = us_traffic.query_to_pandas(query)
state_accidents.head()

Out[3]:

	State	Accidents
0	Texas	3407
1	California	3357
2	Florida	2933
3	Georgia	1422
4	North Carolina	1348

In [4]:

import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import iplot, init_notebook_mode

In [5]:

plt.style.use('ggplot')

In [6]:

states_dict = {
         'Alaska':'AK',
         'Alabama':'AL',
         'Arkansas':'AR',
         'Arizona':'AZ',
         'California':'CA',
         'Colorado':'CO',
         'Connecticut':'CT',
         'District of Columbia':'DC',
         'Delaware':'DE',
         'Florida':'FL',
         'Georgia':'GA',
         'Hawaii':'HI',
         'Iowa':'IA',
         'Idaho':'ID',
         'Illinois':'IL',
         'Indiana':'IN',
         'Kansas':'KS',
         'Kentucky':'KY',
         'Louisiana':'LA',
         'Massachusetts':'MA',
         'Maryland':'MD',
         'Maine':'ME',
         'Michigan':'MI',
         'Minnesota':'MN',
         'Missouri':'MO',
         'Mississippi':'MS',
         'Montana':'MT',
         'North Carolina':'NC',
         'North Dakota':'ND',
         'Nebraska':'NE',
         'New Hampshire':'NH',
         'New Jersey':'NJ',
         'New Mexico':'NM',
         'Nevada':'NV',
         'New York':'NY',
         'Ohio':'OH',
         'Oklahoma':'OK',
         'Oregon':'OR',
         'Pennsylvania':'PA',
         'Puerto Rico':'PR',
         'Rhode Island':'RI',
         'South Carolina':'SC',
         'South Dakota':'SD',
         'Tennessee':'TN',
         'Texas':'TX',
         'Utah':'UT',
         'Virginia':'VA',
         'Vermont':'VT',
         'Washington':'WA',
         'Wisconsin':'WI',
         'West Virginia':'WV',
         'Wyoming':'WY'
};

In [7]:

state_accidents['State'].replace(states_dict,inplace = True);

In [8]:

init_notebook_mode()
# plotly code for choropleth map
scale = [[0, 'rgb(220, 255, 178)'],[1, 'rgb(81, 127, 25)']]
data = [ dict(
        type = 'choropleth',
        colorscale = scale,
        autocolorscale = False,
        showscale = False,
        locations = state_accidents['State'],
        z = state_accidents['Accidents'],
        locationmode = 'USA-states',
        marker = dict(
            line = dict (
                color = 'rgb(255, 255, 255)',
                width = 2
            ) ),
        ) ]
layout = dict(
        title = 'Fatal Traffic Accidents in United States (2016)',
        geo = dict(
            scope = 'usa',
            projection = dict( type='albers usa' ),
            showlakes = True,
            lakecolor = 'rgb(255, 255, 255)',
            countrycolor = 'rgb(255, 255, 255)')        
             )
 
figure = dict(data=data, layout=layout)
iplot(figure)

What day of the week do most of the accidents occur?¶

In [9]:

query2 = """SELECT EXTRACT(DAYOFWEEK FROM timestamp_of_crash) AS Day_of_week,
                  CASE 
                    WHEN EXTRACT(DAYOFWEEK FROM timestamp_of_crash) = 1 THEN 'Sunday'
                    WHEN EXTRACT(DAYOFWEEK FROM timestamp_of_crash) = 2 THEN 'Monday'
                    WHEN EXTRACT(DAYOFWEEK FROM timestamp_of_crash) = 3 THEN 'Tuesday'
                    WHEN EXTRACT(DAYOFWEEK FROM timestamp_of_crash) = 4 THEN 'Wednesday'
                    WHEN EXTRACT(DAYOFWEEK FROM timestamp_of_crash) = 5 THEN 'Thursday'
                    WHEN EXTRACT(DAYOFWEEK FROM timestamp_of_crash) = 6 THEN 'Friday'
                    WHEN EXTRACT(DAYOFWEEK FROM timestamp_of_crash) = 7 THEN 'Saturday'
                  END Weekday,
                  COUNT(consecutive_number) AS Accidents
           FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2016`
           GROUP BY Day_of_week,Weekday
           ORDER BY Accidents DESC
"""

In [10]:

day_of_week_accidents = us_traffic.query_to_pandas(query2)
day_of_week_accidents

Out[10]:

	Day_of_week	Weekday	Accidents
0	7	Saturday	6104
1	6	Friday	5352
2	1	Sunday	5303
3	5	Thursday	4662
4	2	Monday	4501
5	4	Wednesday	4388
6	3	Tuesday	4129

In [11]:

day_of_week_accidents.sort_values('Day_of_week').plot.bar('Weekday','Accidents')

Out[11]:

<matplotlib.axes._subplots.AxesSubplot at 0x7fb9712984a8>

What times in the day is the worst for these traffic accidents?¶

In [12]:

query3 = """SELECT COUNT(consecutive_number) AS Accidents,
                   EXTRACT(HOUR FROM timestamp_of_crash) AS Hour
           FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2016`
           GROUP BY Hour
           ORDER BY Accidents DESC
"""

In [13]:

hour_accidents = us_traffic.query_to_pandas(query3)
hour_accidents

Out[13]:

	Accidents	Hour
0	1984	18
1	1919	20
2	1913	17
3	1902	19
4	1808	16
5	1801	21
6	1731	15
7	1650	22
8	1612	0
9	1567	14
10	1514	13
11	1483	23
12	1353	2
13	1314	12
14	1287	6
15	1264	1
16	1228	11
17	1142	5
18	1139	7
19	1021	3
20	1016	10
21	989	9
22	961	8
23	841	4

In [14]:

hour_accidents.sort_values('Hour').plot.bar('Hour','Accidents')

Out[14]:

<matplotlib.axes._subplots.AxesSubplot at 0x7fb971298828>