Load packages¶

import pandas as pd #importing pandas package
import numpy as np #importing numpy package

Read CSV¶

data = pd.read_csv("https://data.smartcolumbusos.com/api/v1/dataset/dd66bd77-7fa2-4b57-9a21-8d1606430d5c/download?_format=csv")

Exploration¶

data.shape #shape of the whole dataframe (row, column)

(36, 5)

data

data.columns

Index(['age 11 categories', 'deaths', 'population', 'sex', 'sort'], dtype='object')

data.dtypes

age 11 categories    object
deaths                int64
population            int64
sex                  object
sort                  int64
dtype: object

data.describe()

data.head() #showing first 5

data[:5] #showing from start to 5

data.tail() #showing last 5

data[-5:] #showing from last 5 to the end

data.loc[12] #loc is as way to identify information on the 4th row in the data

age 11 categories      35-44
deaths                     0
population                 0
sex                  Unknown
sort                      62
Name: 12, dtype: object

data.loc[12, 'deaths'] # what is the value for 'deaths' in the 4th row

0

What is the population size of the 5th observation?¶

Let's look at sex¶

data["sex"].unique()

array(['Unknown', 'Female', 'Male'], dtype=object)

data['sex'].value_counts()

Unknown    12
Male       12
Female     12
Name: sex, dtype: int64

encode_sex = pd.get_dummies(data["sex"], prefix="encode") #get_dummies does this automatically, prefix comes before each new column name

data = pd.concat([data, encode_sex], axis = 1) #concat concatenates the columns, axis 1 is horizontal, axis 0 is vertical concatenation

data.head()

data.loc[data['sex']=="Unknown", "Missing"] = "yes"
data.loc[data['sex']!="Unknown", "Missing"] = "no"

data.head()

Clean Age¶

data.loc[(data['age 11 categories'] == "< 1") | (data['age 11 categories'] == "1-4") | (data['age 11 categories'] == "5-14"), "age"] = "children"

otherAge = ('15-24', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+')
data.loc[data['age 11 categories'].isin(otherAge), "age"] = "old"

data[['age', 'age 11 categories']]

Excercise¶

Create a new variable called "big_population" for population larger than the 75% percentile
use "value_counts()" to get a frequency table of "big_population"
create new variable called "age_groups" with three values: "child", "adolescent", and "adult"
make a crosstab of "age 11 categories" and "age_groups" to check

Crosstabs¶

data['deaths'].describe() #describe gives descriptive statistics

count     36.000000
mean      65.472222
std      130.523449
min        0.000000
25%        0.000000
50%        0.000000
75%       74.500000
max      539.000000
Name: deaths, dtype: float64

pd.crosstab(data['age'], data['sex'])

Visualization¶

import seaborn as sns # importing package 'seaborn'
import matplotlib.pyplot as plt # importing package matplotlib.pyplot
%matplotlib inline  
#to show graph in notebook

sns.distplot(data["population"], bins=3) #plot a distribution plot for population

<matplotlib.axes._subplots.AxesSubplot at 0x1a1ef03f60>

sns.countplot(data=data, x="age", order=("children", "old")) #plot countplot for dayofweek

<matplotlib.axes._subplots.AxesSubplot at 0x1a1efc2b38>

sns.countplot(data=data, x="deaths")

<matplotlib.axes._subplots.AxesSubplot at 0x1a1f0be048>

sns.countplot(data=data, x="deaths", hue='sex')

<matplotlib.axes._subplots.AxesSubplot at 0x1a1f1a1320>

sns.pointplot(data=data, x="age", y="deaths", order=("children", "old"))

<matplotlib.axes._subplots.AxesSubplot at 0x1a1f1ee550>

sns.pointplot(data=data, x="age", y="deaths", hue ="sex", order=("children", "old"))

<matplotlib.axes._subplots.AxesSubplot at 0x1a21450208>

Excercise¶

create different plots with "age_groups"

Out to CSV¶

data.columns #look at all column names

Index(['age 11 categories', 'deaths', 'population', 'sex', 'sort',
       'encode_Female', 'encode_Male', 'encode_Unknown', 'Missing', 'age'],
      dtype='object')

data.to_csv("output.csv") #export out to csv file

	age 11 categories	deaths	population	sex	sort
0	55-64	0	0	Unknown	66
1	25-34	221	749931	Female	8
2	< 1	0	70647	Male	26
3	65-74	6	576294	Female	16
4	Unk	0	0	Female	22
5	1-4	0	273310	Female	2
6	< 1	0	0	Unknown	52
7	85+	0	0	Unknown	72
8	45-54	133	784260	Female	12
9	65-74	34	504736	Male	42
10	Unk	0	0	Male	48
11	75-84	2	233349	Male	44
12	35-44	0	0	Unknown	62
13	35-44	455	685852	Male	36
14	< 1	0	67249	Female	0
15	65-74	0	0	Unknown	68
16	75-84	1	318103	Female	18
17	1-4	1	286717	Male	28
18	5-14	0	0	Unknown	56
19	55-64	88	827746	Female	14
20	25-34	0	0	Unknown	60
21	15-24	152	786133	Male	32
22	45-54	0	0	Unknown	64
23	35-44	174	694268	Female	10
24	25-34	539	756979	Male	34
25	75-84	0	0	Unknown	70
26	5-14	0	740345	Male	30
27	1-4	0	0	Unknown	54
28	45-54	283	763428	Male	38
29	85+	0	85885	Male	46
30	Unk	0	0	Unknown	74
31	15-24	0	0	Unknown	58
32	85+	0	168262	Female	20
33	5-14	0	710133	Female	4
34	15-24	70	753123	Female	6
35	55-64	198	777623	Male	40

	deaths	population	sort
count	36.000000	36.000000	36.00000
mean	65.472222	322621.472222	37.00000
std	130.523449	338515.069211	22.64004
min	0.000000	0.000000	0.00000
25%	0.000000	0.000000	17.50000
50%	0.000000	200805.500000	37.00000
75%	74.500000	717686.000000	56.50000
max	539.000000	827746.000000	74.00000

	age	age 11 categories
0	old	55-64
1	old	25-34
2	children	< 1
3	old	65-74
4	NaN	Unk
5	children	1-4
6	children	< 1
7	old	85+
8	old	45-54
9	old	65-74
10	NaN	Unk
11	old	75-84
12	old	35-44
13	old	35-44
14	children	< 1
15	old	65-74
16	old	75-84
17	children	1-4
18	children	5-14
19	old	55-64
20	old	25-34
21	old	15-24
22	old	45-54
23	old	35-44
24	old	25-34
25	old	75-84
26	children	5-14
27	children	1-4
28	old	45-54
29	old	85+
30	NaN	Unk
31	old	15-24
32	old	85+
33	children	5-14
34	old	15-24
35	old	55-64