Load packages

In [1]:
import pandas as pd #importing pandas package
import numpy as np #importing numpy package

Read CSV

In [2]:
data = pd.read_csv("https://data.smartcolumbusos.com/api/v1/dataset/dd66bd77-7fa2-4b57-9a21-8d1606430d5c/download?_format=csv")

Exploration

In [3]:
data.shape #shape of the whole dataframe (row, column)
Out[3]:
(36, 5)
In [4]:
data
Out[4]:
age 11 categoriesdeathspopulationsexsort
055-6400Unknown66
125-34221749931Female8
2< 1070647Male26
365-746576294Female16
4Unk00Female22
51-40273310Female2
6< 100Unknown52
785+00Unknown72
845-54133784260Female12
965-7434504736Male42
10Unk00Male48
1175-842233349Male44
1235-4400Unknown62
1335-44455685852Male36
14< 1067249Female0
1565-7400Unknown68
1675-841318103Female18
171-41286717Male28
185-1400Unknown56
1955-6488827746Female14
2025-3400Unknown60
2115-24152786133Male32
2245-5400Unknown64
2335-44174694268Female10
2425-34539756979Male34
2575-8400Unknown70
265-140740345Male30
271-400Unknown54
2845-54283763428Male38
2985+085885Male46
30Unk00Unknown74
3115-2400Unknown58
3285+0168262Female20
335-140710133Female4
3415-2470753123Female6
3555-64198777623Male40
In [5]:
data.columns
Out[5]:
Index(['age 11 categories', 'deaths', 'population', 'sex', 'sort'], dtype='object')
In [6]:
data.dtypes
Out[6]:
age 11 categories    object
deaths                int64
population            int64
sex                  object
sort                  int64
dtype: object
In [7]:
data.describe()
Out[7]:
deathspopulationsort
count36.00000036.00000036.00000
mean65.472222322621.47222237.00000
std130.523449338515.06921122.64004
min0.0000000.0000000.00000
25%0.0000000.00000017.50000
50%0.000000200805.50000037.00000
75%74.500000717686.00000056.50000
max539.000000827746.00000074.00000
In [8]:
data.head() #showing first 5
Out[8]:
age 11 categoriesdeathspopulationsexsort
055-6400Unknown66
125-34221749931Female8
2< 1070647Male26
365-746576294Female16
4Unk00Female22
In [9]:
data[:5] #showing from start to 5
Out[9]:
age 11 categoriesdeathspopulationsexsort
055-6400Unknown66
125-34221749931Female8
2< 1070647Male26
365-746576294Female16
4Unk00Female22
In [10]:
data.tail() #showing last 5
Out[10]:
age 11 categoriesdeathspopulationsexsort
3115-2400Unknown58
3285+0168262Female20
335-140710133Female4
3415-2470753123Female6
3555-64198777623Male40
In [11]:
data[-5:] #showing from last 5 to the end
Out[11]:
age 11 categoriesdeathspopulationsexsort
3115-2400Unknown58
3285+0168262Female20
335-140710133Female4
3415-2470753123Female6
3555-64198777623Male40
In [12]:
data.loc[12] #loc is as way to identify information on the 4th row in the data
Out[12]:
age 11 categories      35-44
deaths                     0
population                 0
sex                  Unknown
sort                      62
Name: 12, dtype: object
In [13]:
data.loc[12, 'deaths'] # what is the value for 'deaths' in the 4th row
Out[13]:
0

What is the population size of the 5th observation?

Let's look at sex

In [14]:
data["sex"].unique()
Out[14]:
array(['Unknown', 'Female', 'Male'], dtype=object)
In [15]:
data['sex'].value_counts()
Out[15]:
Unknown    12
Male       12
Female     12
Name: sex, dtype: int64
In [16]:
encode_sex = pd.get_dummies(data["sex"], prefix="encode") #get_dummies does this automatically, prefix comes before each new column name
In [17]:
data = pd.concat([data, encode_sex], axis = 1) #concat concatenates the columns, axis 1 is horizontal, axis 0 is vertical concatenation
In [18]:
data.head()
Out[18]:
age 11 categoriesdeathspopulationsexsortencode_Femaleencode_Maleencode_Unknown
055-6400Unknown66001
125-34221749931Female8100
2< 1070647Male26010
365-746576294Female16100
4Unk00Female22100
In [19]:
data.loc[data['sex']=="Unknown", "Missing"] = "yes"
data.loc[data['sex']!="Unknown", "Missing"] = "no"
In [20]:
data.head()
Out[20]:
age 11 categoriesdeathspopulationsexsortencode_Femaleencode_Maleencode_UnknownMissing
055-6400Unknown66001yes
125-34221749931Female8100no
2< 1070647Male26010no
365-746576294Female16100no
4Unk00Female22100no

Clean Age

In [21]:
data.loc[(data['age 11 categories'] == "< 1") | (data['age 11 categories'] == "1-4") | (data['age 11 categories'] == "5-14"), "age"] = "children"
In [22]:
otherAge = ('15-24', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+')
data.loc[data['age 11 categories'].isin(otherAge), "age"] = "old"
In [23]:
data[['age', 'age 11 categories']]
Out[23]:
ageage 11 categories
0old55-64
1old25-34
2children< 1
3old65-74
4NaNUnk
5children1-4
6children< 1
7old85+
8old45-54
9old65-74
10NaNUnk
11old75-84
12old35-44
13old35-44
14children< 1
15old65-74
16old75-84
17children1-4
18children5-14
19old55-64
20old25-34
21old15-24
22old45-54
23old35-44
24old25-34
25old75-84
26children5-14
27children1-4
28old45-54
29old85+
30NaNUnk
31old15-24
32old85+
33children5-14
34old15-24
35old55-64

Excercise

  • Create a new variable called "big_population" for population larger than the 75% percentile
  • use "value_counts()" to get a frequency table of "big_population"
  • create new variable called "age_groups" with three values: "child", "adolescent", and "adult"
  • make a crosstab of "age 11 categories" and "age_groups" to check

Crosstabs

In [24]:
data['deaths'].describe() #describe gives descriptive statistics
Out[24]:
count     36.000000
mean      65.472222
std      130.523449
min        0.000000
25%        0.000000
50%        0.000000
75%       74.500000
max      539.000000
Name: deaths, dtype: float64
In [25]:
pd.crosstab(data['age'], data['sex'])
Out[25]:
sexFemaleMaleUnknown
age
children333
old888

Visualization

In [26]:
import seaborn as sns # importing package 'seaborn'
import matplotlib.pyplot as plt # importing package matplotlib.pyplot
%matplotlib inline  
#to show graph in notebook
In [27]:
sns.distplot(data["population"], bins=3) #plot a distribution plot for population
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1ef03f60>
In [28]:
sns.countplot(data=data, x="age", order=("children", "old")) #plot countplot for dayofweek
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1efc2b38>
In [29]:
sns.countplot(data=data, x="deaths")
Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1f0be048>
In [30]:
sns.countplot(data=data, x="deaths", hue='sex')
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1f1a1320>
In [31]:
sns.pointplot(data=data, x="age", y="deaths", order=("children", "old"))
Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1f1ee550>
In [32]:
sns.pointplot(data=data, x="age", y="deaths", hue ="sex", order=("children", "old"))
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a21450208>

Excercise

  • create different plots with "age_groups"

Out to CSV

In [33]:
data.columns #look at all column names
Out[33]:
Index(['age 11 categories', 'deaths', 'population', 'sex', 'sort',
       'encode_Female', 'encode_Male', 'encode_Unknown', 'Missing', 'age'],
      dtype='object')
In [34]:
data.to_csv("output.csv") #export out to csv file