Load packages

In [1]:
import pandas as pd #importing pandas package
import numpy as np #importing numpy package

Read CSV

In [2]:
data = pd.read_csv("https://data.smartcolumbusos.com/api/v1/dataset/dd66bd77-7fa2-4b57-9a21-8d1606430d5c/download?_format=csv")

Exploration

In [3]:
data.shape #shape of the whole dataframe (row, column)
Out[3]:
(36, 5)
In [4]:
data
Out[4]:
age 11 categories deaths population sex sort
0 55-64 0 0 Unknown 66
1 25-34 221 749931 Female 8
2 < 1 0 70647 Male 26
3 65-74 6 576294 Female 16
4 Unk 0 0 Female 22
5 1-4 0 273310 Female 2
6 < 1 0 0 Unknown 52
7 85+ 0 0 Unknown 72
8 45-54 133 784260 Female 12
9 65-74 34 504736 Male 42
10 Unk 0 0 Male 48
11 75-84 2 233349 Male 44
12 35-44 0 0 Unknown 62
13 35-44 455 685852 Male 36
14 < 1 0 67249 Female 0
15 65-74 0 0 Unknown 68
16 75-84 1 318103 Female 18
17 1-4 1 286717 Male 28
18 5-14 0 0 Unknown 56
19 55-64 88 827746 Female 14
20 25-34 0 0 Unknown 60
21 15-24 152 786133 Male 32
22 45-54 0 0 Unknown 64
23 35-44 174 694268 Female 10
24 25-34 539 756979 Male 34
25 75-84 0 0 Unknown 70
26 5-14 0 740345 Male 30
27 1-4 0 0 Unknown 54
28 45-54 283 763428 Male 38
29 85+ 0 85885 Male 46
30 Unk 0 0 Unknown 74
31 15-24 0 0 Unknown 58
32 85+ 0 168262 Female 20
33 5-14 0 710133 Female 4
34 15-24 70 753123 Female 6
35 55-64 198 777623 Male 40
In [5]:
data.columns
Out[5]:
Index(['age 11 categories', 'deaths', 'population', 'sex', 'sort'], dtype='object')
In [6]:
data.dtypes
Out[6]:
age 11 categories    object
deaths                int64
population            int64
sex                  object
sort                  int64
dtype: object
In [7]:
data.describe()
Out[7]:
deaths population sort
count 36.000000 36.000000 36.00000
mean 65.472222 322621.472222 37.00000
std 130.523449 338515.069211 22.64004
min 0.000000 0.000000 0.00000
25% 0.000000 0.000000 17.50000
50% 0.000000 200805.500000 37.00000
75% 74.500000 717686.000000 56.50000
max 539.000000 827746.000000 74.00000
In [8]:
data.head() #showing first 5
Out[8]:
age 11 categories deaths population sex sort
0 55-64 0 0 Unknown 66
1 25-34 221 749931 Female 8
2 < 1 0 70647 Male 26
3 65-74 6 576294 Female 16
4 Unk 0 0 Female 22
In [9]:
data[:5] #showing from start to 5
Out[9]:
age 11 categories deaths population sex sort
0 55-64 0 0 Unknown 66
1 25-34 221 749931 Female 8
2 < 1 0 70647 Male 26
3 65-74 6 576294 Female 16
4 Unk 0 0 Female 22
In [10]:
data.tail() #showing last 5
Out[10]:
age 11 categories deaths population sex sort
31 15-24 0 0 Unknown 58
32 85+ 0 168262 Female 20
33 5-14 0 710133 Female 4
34 15-24 70 753123 Female 6
35 55-64 198 777623 Male 40
In [11]:
data[-5:] #showing from last 5 to the end
Out[11]:
age 11 categories deaths population sex sort
31 15-24 0 0 Unknown 58
32 85+ 0 168262 Female 20
33 5-14 0 710133 Female 4
34 15-24 70 753123 Female 6
35 55-64 198 777623 Male 40
In [12]:
data.loc[12] #loc is as way to identify information on the 4th row in the data
Out[12]:
age 11 categories      35-44
deaths                     0
population                 0
sex                  Unknown
sort                      62
Name: 12, dtype: object
In [13]:
data.loc[12, 'deaths'] # what is the value for 'deaths' in the 4th row
Out[13]:
0

What is the population size of the 5th observation?

Let's look at sex

In [14]:
data["sex"].unique()
Out[14]:
array(['Unknown', 'Female', 'Male'], dtype=object)
In [15]:
data['sex'].value_counts()
Out[15]:
Unknown    12
Male       12
Female     12
Name: sex, dtype: int64
In [16]:
encode_sex = pd.get_dummies(data["sex"], prefix="encode") #get_dummies does this automatically, prefix comes before each new column name
In [17]:
data = pd.concat([data, encode_sex], axis = 1) #concat concatenates the columns, axis 1 is horizontal, axis 0 is vertical concatenation
In [18]:
data.head()
Out[18]:
age 11 categories deaths population sex sort encode_Female encode_Male encode_Unknown
0 55-64 0 0 Unknown 66 0 0 1
1 25-34 221 749931 Female 8 1 0 0
2 < 1 0 70647 Male 26 0 1 0
3 65-74 6 576294 Female 16 1 0 0
4 Unk 0 0 Female 22 1 0 0
In [19]:
data.loc[data['sex']=="Unknown", "Missing"] = "yes"
data.loc[data['sex']!="Unknown", "Missing"] = "no"
In [20]:
data.head()
Out[20]:
age 11 categories deaths population sex sort encode_Female encode_Male encode_Unknown Missing
0 55-64 0 0 Unknown 66 0 0 1 yes
1 25-34 221 749931 Female 8 1 0 0 no
2 < 1 0 70647 Male 26 0 1 0 no
3 65-74 6 576294 Female 16 1 0 0 no
4 Unk 0 0 Female 22 1 0 0 no

Clean Age

In [21]:
data.loc[(data['age 11 categories'] == "< 1") | (data['age 11 categories'] == "1-4") | (data['age 11 categories'] == "5-14"), "age"] = "children"
In [22]:
otherAge = ('15-24', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+')
data.loc[data['age 11 categories'].isin(otherAge), "age"] = "old"
In [23]:
data[['age', 'age 11 categories']]
Out[23]:
age age 11 categories
0 old 55-64
1 old 25-34
2 children < 1
3 old 65-74
4 NaN Unk
5 children 1-4
6 children < 1
7 old 85+
8 old 45-54
9 old 65-74
10 NaN Unk
11 old 75-84
12 old 35-44
13 old 35-44
14 children < 1
15 old 65-74
16 old 75-84
17 children 1-4
18 children 5-14
19 old 55-64
20 old 25-34
21 old 15-24
22 old 45-54
23 old 35-44
24 old 25-34
25 old 75-84
26 children 5-14
27 children 1-4
28 old 45-54
29 old 85+
30 NaN Unk
31 old 15-24
32 old 85+
33 children 5-14
34 old 15-24
35 old 55-64

Excercise

  • Create a new variable called "big_population" for population larger than the 75% percentile
  • use "value_counts()" to get a frequency table of "big_population"
  • create new variable called "age_groups" with three values: "child", "adolescent", and "adult"
  • make a crosstab of "age 11 categories" and "age_groups" to check

Crosstabs

In [24]:
data['deaths'].describe() #describe gives descriptive statistics
Out[24]:
count     36.000000
mean      65.472222
std      130.523449
min        0.000000
25%        0.000000
50%        0.000000
75%       74.500000
max      539.000000
Name: deaths, dtype: float64
In [25]:
pd.crosstab(data['age'], data['sex'])
Out[25]:
sex Female Male Unknown
age
children 3 3 3
old 8 8 8

Visualization

In [26]:
import seaborn as sns # importing package 'seaborn'
import matplotlib.pyplot as plt # importing package matplotlib.pyplot
%matplotlib inline  
#to show graph in notebook
In [27]:
sns.distplot(data["population"], bins=3) #plot a distribution plot for population
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1ef03f60>
In [28]:
sns.countplot(data=data, x="age", order=("children", "old")) #plot countplot for dayofweek
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1efc2b38>
In [29]:
sns.countplot(data=data, x="deaths")
Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1f0be048>
In [30]:
sns.countplot(data=data, x="deaths", hue='sex')
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1f1a1320>
In [31]:
sns.pointplot(data=data, x="age", y="deaths", order=("children", "old"))
Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1f1ee550>
In [32]:
sns.pointplot(data=data, x="age", y="deaths", hue ="sex", order=("children", "old"))
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a21450208>

Excercise

  • create different plots with "age_groups"

Out to CSV

In [33]:
data.columns #look at all column names
Out[33]:
Index(['age 11 categories', 'deaths', 'population', 'sex', 'sort',
       'encode_Female', 'encode_Male', 'encode_Unknown', 'Missing', 'age'],
      dtype='object')
In [34]:
data.to_csv("output.csv") #export out to csv file