Pandas Land
Introduction
This is Panda exercises:
Data Frames
In this section, I explore how to create data frames from different ways:
- list
- dictionary
- Json
- csv file
In addition to that basic Data Frame (DF) manipulations:
import pandas as pd
cols = { 'name': ['Ted', 'Mak', 'Nina','Leo']
,'age' : [50, 20, 33, 25] }
l_students= [{'name':'Tailor', 'grade':'10','math':60}
,{'name':'Lora', 'grade':'09','math':80}
,{'name':'Joe', 'grade':'11','math':56.90}
,{'name':'Tailor', 'grade':'11','math':68.98} ]
studemtDF = pd.DataFrame(l_students)
# read from the json
import json
json_students = json.dumps(l_students)
# [
# {
# "grade": "10",
# "name": "Tailor",
# "math": 60
# },
# {
# "grade": "09",
# "name": "Lora",
# "math": 80
# },
# {
# "grade": "11",
# "name": "Joe",
# "math": 56.9
# },
# {
# "grade": "11",
# "name": "Tailor",
# "math": 68.98
# }
# ]
json_std_df = pd.read_json(json_students)
# grade math name
# 0 10 60.00 Tailor
# 1 9 80.00 Lora
# 2 11 56.90 Joe
# 3 11 68.98 Tailor
json_cols = json.dumps(cols)
# {
# "age": [
# 50,
# 20,
# 33,
# 25
# ],
# "name": [
# "Ted",
# "Mak",
# "Nina",
# "Leo"
# ]
# }
json_cols_df = pd.read_json(json_cols)
# age name
# 0 50 Ted
# 1 20 Mak
# 2 33 Nina
# 3 25 Leo
peopleDF = pd.DataFrame(cols)
# age name
# 0 50 Ted
# 1 20 Mak
# 2 33 Nina
# 3 25 Leo
peopleDF.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 4 entries, 0 to 3
# Data columns (total 2 columns):
# age 4 non-null int64
# name 4 non-null object
# dtypes: int64(1), object(1)
# memory usage: 136.0+ bytes
peopleDF.columns
#Index([u'age', u'name'], dtype='object')
peopleDF.age
# 0 50
# 1 20
# 2 33
# 3 25
# Name: age, dtype: int64
#or
peopleDF['age']
#describe
peopleDF.describe()
# age
# count 4.000000
# mean 32.000000
# std 13.140269
# min 20.000000
# 25% 23.750000
# 50% 29.000000
# 75% 37.250000
# max 50.000000
#create new column to existing df
peopleDF['city'] = pd.Series(['Sydney','Canberra', 'Melbourne', 'Darwin'])
# age name city
# 0 50 Ted Sydney
# 1 20 Mak Canberra
# 2 33 Nina Melbourne
# 3 25 Leo Darwin
peopleDF['eat'] = pd.Series(['rice','bread','bread','meat'],index=[3,1,0,6])
# age name city eat
# 0 50 Ted Sydney bread
# 1 20 Mak Canberra bread
# 2 33 Nina Melbourne NaN
# 3 25 Leo Darwin rice
#broadcasting
peopleDF['country'] = 'Australia'
# age name city eat country
# 0 50 Ted Sydney bread Australia
# 1 20 Mak Canberra bread Australia
# 2 33 Nina Melbourne NaN Australia
# 3 25 Leo Darwin rice Australia
#slice DataFrame
counDF = peopleDF[['name', 'country']]
# name country
# 0 Ted Australia
# 1 Mak Australia
# 2 Nina Australia
# 3 Leo Australia
del counDF['name']
# country
# 0 Australia
# 1 Australia
# 2 Australia
# 3 Australia
#pop out the age, this mutate the peopleDF
peopleDF.pop('age')
# name city eat country
# 0 Ted Sydney bread Australia
# 1 Mak Canberra bread Australia
# 2 Nina Melbourne NaN Australia
# 3 Leo Darwin rice Australia
Selection and Projection
import pandas as pd
cols = { 'name': ['Ted', 'Mak', 'Nina','Leo']
,'age' : [50, 20, 33, 25] }
df = pd.DataFrame(cols)
df.name == 'Nina'
# 0 False
# 1 False
# 2 True
# 3 False
# Name: name, dtype: bool
#masking
df[df.name == 'Nina']
# age name
# 2 33 Nina
df.name.str.contains('e')
# 0 True
# 1 False
# 2 False
# 3 True
# Name: name, dtype: bool
df[(df.name.str.contains('e')) & (df.age < 30)]
# age name
# 3 25 Leo
# df[(df.name.str.contains('e')) and (df.age < 30)] will not work because python 'and'.
#from numexpr (pip install numexpr if not available. My case it was by default)
df.query('age > 25')
# age name
# 0 50 Ted
# 2 33 Nina
# and <==> &
df.query('age > 25 and name == "Ted"')
# age name
# 0 50 Ted
# or <==> |
df.query('age > 25 | name == "Leo"')
# age name
# 0 50 Ted
# 2 33 Nina
# 3 25 Leo
# how not
~(df.name == 'Nina')
# 0 True
# 1 True
# 2 False
# 3 True
# Name: name, dtype: bool
(df.name != 'Nina') # same as above
df[~(df.name == 'Nina')]
# age name
# 0 50 Ted
# 1 20 Mak
# 3 25 Leo
df.query('not name == "Nina"') #same as above
#get last two rows
df.tail(2)
# age name
# 2 33 Nina
# 3 25 Leo
# get first two rows
df.head(2)
# age name
# 0 50 Ted
# 1 20 Mak
# or
df[:2] #slicing
#position based: get first and last
df.iloc[[0,3]]
# age name
# 0 50 Ted
# 3 25 Leo
#projection
df[['name', 'age']]
# name age
# 0 Ted 50
# 1 Mak 20
# 2 Nina 33
# 3 Leo 25
#position based projection
df.iloc[:2, :1] # two records with second column
# age
# 0 50
# 1 20
df.loc[:2, ['name']]
# name
# 0 Ted
# 1 Mak
# 2 Nina
Merge
import pandas as pd
cols_age = { 'name': ['Ted', 'Mak', 'Nina','Leo',"Phips"]
,'age' : [50, 20, 33, 25,40] }
df_age = pd.DataFrame(cols_age)
cols_marks = { 'name': ['Ted', 'Mak', 'Nina','Leo','Rino']
,'marks' : [78.50, 80.90, 33.89, 25, 67] }
df_marks = pd.DataFrame(cols_marks)
pd.merge(df_age,df_marks, on='name', how='inner')
# age name marks
# 0 50 Ted 78.50
# 1 20 Mak 80.90
# 2 33 Nina 33.89
# 3 25 Leo 25.00
pd.merge(df_marks,df_age, on='name')
# marks name age
# 0 78.50 Ted 50
# 1 80.90 Mak 20
# 2 33.89 Nina 33
# 3 25.00 Leo 25
pd.merge(df_age,df_marks, on='name', how='left')
# age name marks
# 0 50 Ted 78.50
# 1 20 Mak 80.90
# 2 33 Nina 33.89
# 3 25 Leo 25.00
# 4 40 Phips NaN
pd.merge(df_age,df_marks, on='name', how='right')
# age name marks
# 0 50.0 Ted 78.50
# 1 20.0 Mak 80.90
# 2 33.0 Nina 33.89
# 3 25.0 Leo 25.00
# 4 NaN Rino 67.00
pd.merge(df_age,df_marks, on='name', how='outer')
# age name marks
# 0 50.0 Ted 78.50
# 1 20.0 Mak 80.90
# 2 33.0 Nina 33.89
# 3 25.0 Leo 25.00
# 4 40.0 Phips NaN
# 5 NaN Rino 67.00
#copy df
df_fname = df_marks.copy(deep=True)
df_fname = df_fname.rename(columns={'name':'fname'})
pd.merge(df_fname,df_age, left_on='fname', right_on='name')
# marks fname age name
# 0 78.50 Ted 50 Ted
# 1 80.90 Mak 20 Mak
# 2 33.89 Nina 33 Nina
# 3 25.00 Leo 25 Leo
Let’s do more with the merge operation similar to database join
import numpy as np
import pandas as pd
df_emp = pd.DataFrame({
'name':['Alex','Clara','Dany','John','Philip','Marko'],
'age':[45,35,23,39,43,21],
'comp_id':['001','002','003','004','001','004']
})
df_comp = pd.DataFrame({
'comp_id':['001','002','003','004'],
'state':['ACT','ACT','NSW','NSW'],
'name':['DHS','IMMG','ABC','Ticketek']
})
#concat
pd.concat([df_emp,df_comp])
# age comp_id name state
# 0 45.0 001 Alex NaN
# 1 35.0 002 Clara NaN
# 2 23.0 003 Dany NaN
# 3 39.0 004 John NaN
# 4 43.0 001 Philip NaN
# 5 21.0 004 Marko NaN
# 0 NaN 001 DHS ACT
# 1 NaN 002 IMMG ACT
# 2 NaN 003 ABC NSW
# 3 NaN 004 Ticketek NSW
pd.concat([df_emp,df_comp],verify_integrity=True)
#error because index overlapping
#therefore ingnore the index if necessary
pd.concat([df_emp,df_comp],ignore_index=True)
pd.concat([df_emp,df_comp],join='inner')
# comp_id name
# 0 001 Alex
# 1 002 Clara
# 2 003 Dany
# 3 004 John
# 4 001 Philip
# 5 004 Marko
# 0 001 DHS
# 1 002 IMMG
# 2 003 ABC
# 3 004 Ticketek
pd.concat([df_emp,df_comp], axis=1)
# age comp_id name comp_id name state
# 0 45 001 Alex 001 DHS ACT
# 1 35 002 Clara 002 IMMG ACT
# 2 23 003 Dany 003 ABC NSW
# 3 39 004 John 004 Ticketek NSW
# 4 43 001 Philip NaN NaN NaN
# 5 21 004 Marko NaN NaN NaN
pd.concat([df_emp,df_comp], axis=1, join='inner')
# age comp_id name comp_id name state
# 0 45 001 Alex 001 DHS ACT
# 1 35 002 Clara 002 IMMG ACT
# 2 23 003 Dany 003 ABC NSW
# 3 39 004 John 004 Ticketek NSW
df_emp.merge(df_comp,how='inner', indicator=True,
left_on='comp_id', right_on='comp_id', suffixes=('_emp', '_comp'))
# age comp_id name_emp name_comp state _merge
# 0 45 001 Alex DHS ACT both
# 1 43 001 Philip DHS ACT both
# 2 35 002 Clara IMMG ACT both
# 3 23 003 Dany ABC NSW both
# 4 39 004 John Ticketek NSW both
# 5 21 004 Marko Ticketek NSW both
df_emp.join(df_comp, lsuffix='_emp', rsuffix='_comp', on='comp_id', how='left')
Operaions to Data Frames
- Adding, renaming and delete column
- changes to categorical type
- axes and sort operations
- group by operation
- append and concat of two DFs
- iterate over columns and rows
import pandas as pd
'''
National Assessment Program – Literacy and Numeracy (NAPLAN) - 2010 data
URL: https://data.gov.au/dataset/education-national-assessment-program-literacy-and-numeracy-nsw
'''
df = pd.read_csv('./education/gender-Table 1.csv'
, names=['Year','Gender','Reading', 'Numeracy']
, skiprows=[0], usecols=[0,1,2,3]
)
# Year Gender Reading Numeracy
# 0 Year 3 Male 94.0 94.5
# 1 Year 3 Female 96.6 95.6
# 2 Year 5 Male 91.3 94.4
# 3 Year 5 Female 94.8 94.9
# 4 Year 7 Male 93.7 94.7
# 5 Year 7 Female 96.4 95.1
# 6 Year 9 Male 88.9 93.4
# 7 Year 9 Female 93.6 92.8
df.Gender.value_counts()
# Male 4
# Female 4
# Name: Gender, dtype: int64
#change to the categorical type
df['Gender'] = df.Gender.astype('category')
df.shape
#rows and cols : (8, 4)
df.axes
# [RangeIndex(start=0, stop=8, step=1),
# Index([u'Year', u'Gender', u'Reading', u'Numeracy'], dtype='object')]
df.axes[0]
#RangeIndex(start=0, stop=8, step=1)
#or
df.index
df.axes[0] is df.index
#True
df.axes[1]
#Index([u'Year', u'Gender', u'Reading', u'Numeracy'], dtype='object')
df.sort_index(axis=1)
df.index.unique()
df.index.duplicated().any()
df.index.duplicated().all()
df.columns
df.columns.duplicated()
df.describe()
# Reading Numeracy
# count 8.000000 8.000000
# mean 93.662500 94.425000
# std 2.557866 0.913001
# min 88.900000 92.800000
# 25% 93.025000 94.150000
# 50% 93.850000 94.600000
# 75% 95.200000 94.950000
# max 96.600000 95.600000
df.describe(include='all')
# Year Gender Reading Numeracy
# count 8 8 8.000000 8.000000
# unique 4 2 NaN NaN
# top Year 3 Male NaN NaN
# freq 2 4 NaN NaN
# mean NaN NaN 93.662500 94.425000
# std NaN NaN 2.557866 0.913001
# min NaN NaN 88.900000 92.800000
# 25% NaN NaN 93.025000 94.150000
# 50% NaN NaN 93.850000 94.600000
# 75% NaN NaN 95.200000 94.950000
# max NaN NaN 96.600000 95.600000
df.groupby('Gender').sum()
# Reading Numeracy
# Gender
# Female 381.4 378.4
# Male 367.9 377.0
%matplotlib inline
ax = df['Reading'].plot(kind='box')
df_au = pd.read_csv('./education/australia-Table 1.csv'
, names=['Year','Cat','Reading', 'Numeracy']
, skiprows=[0], usecols=[0,1,2,3]
)
df_ind = pd.read_csv('./education/indigenous-Table 1.csv'
, names=['Year','Cat','Reading', 'Numeracy']
, skiprows=[0], usecols=[0,1,2,3]
)
df_au.append(df_ind)
# Year Cat Reading Numeracy
# 0 Year 3 Australia 93.9 94.3
# 1 Year 3 NSW 95.3 95.0
# 2 Year 5 Australia 91.3 93.7
# ...
# 0 Year 3 Aboriginal 85.5 83.9
# 1 Year 3 non-Aboriginal 95.7 95.6
# 2 Year 5 Aboriginal 77.7 80.9
# ...
# same above
pd.concat([df_au,df_ind])
#reset the index
df_nsw=pd.concat([df_au,df_ind],ignore_index=True)
df_nsw.iloc[1]
# Year Year 3
# Cat NSW
# Reading 95.3
# Numeracy 95
# Name: 1, dtype: object
#update existing
df_nsw.iloc[1] = ['Year 3','NSW',99,98]
#to add new
len(df)
#8
#add a record
df.loc[8] = ['Year 3','NSW',99,98]
# Year Gender Reading Numeracy
# 0 Year 3 Male 94.0 94.5
# 1 Year 3 Female 96.6 95.6
# 2 Year 5 Male 91.3 94.4
# 3 Year 5 Female 94.8 94.9
# 4 Year 7 Male 93.7 94.7
# 5 Year 7 Female 96.4 95.1
# 6 Year 9 Male 88.9 93.4
# 7 Year 9 Female 93.6 92.8
# 8 Year 3 NaN 99.0 98.0
#add a new column. You will get an error if the list is in different dimension
df['given'] =[2,3,4,9,7,5,6,2,4]
#create a column with one default value
df['given2'] = 2
#or iterate such as
df.loc[2:5]
df.loc[:,'given3'] = '3'
#calculated column
df['avg']=(df['Reading']+df['Numeracy'])/2
#based on function
def tot(cols):
read, nume = cols
return read + nume
#this will create new series which can be added to the df
d = df[['Reading', 'Numeracy']].apply(tot,axis=1)
#filter the df
df[['Year','Reading']]
#but to mutate the df and delete the column
df.pop('given')
#same as above but not return the deleted column as above
del df['given2']
#rename the column
df.rename(columns={'given3': 'given'})
#rename the index
df.sort_index(ascending=False)
df.sort_values('Gender', ascending=False)
# Year Gender Reading Numeracy given3 avg
# 6 Year 9 Male 88.9 93.4 3 91.15
# 4 Year 7 Male 93.7 94.7 3 94.20
# 2 Year 5 Male 91.3 94.4 3 92.85
# 0 Year 3 Male 94.0 94.5 3 94.25
# 7 Year 9 Female 93.6 92.8 3 93.20
# 5 Year 7 Female 96.4 95.1 3 95.75
# 3 Year 5 Female 94.8 94.9 3 94.85
# 1 Year 3 Female 96.6 95.6 3 96.10
# 8 Year 3 NaN 99.0 98.0 3 98.50
df['given'] = [5,3,4,6,3,4,5,1,2]
df.sort_values(['given', 'Gender'], ascending=[False, False])
#column sort
df.sort_index(axis=1)
# Gender Numeracy Reading Year
# 0 Male 94.5 94.0 Year 3
# 1 Female 95.6 96.6 Year 3
# 2 Male 94.4 91.3 Year 5
# 3 Female 94.9 94.8 Year 5
# 4 Male 94.7 93.7 Year 7
# 5 Female 95.1 96.4 Year 7
# 6 Male 93.4 88.9 Year 9
# 7 Female 92.8 93.6 Year 9
#if you iterate the data frame
list(df)
#['Year', 'Gender', 'Reading', 'Numeracy']
df.iteritems #iterate over column
# for example
for name, col in df.iteritems():
print (name, col)
break
# ('Year', 0 Year 3
# 1 Year 3
# 2 Year 5
# 3 Year 5
# 4 Year 7
# 5 Year 7
# 6 Year 9
# 7 Year 9
# Name: Year, dtype: object)
#to iterate over rows
for tup in df.itertuples():
print(tup)
# Pandas(Index=0, Year='Year 3', Gender='Male', Reading=94.0, Numeracy=94.5)
# Pandas(Index=1, Year='Year 3', Gender='Female', Reading=96.599999999999994, Numeracy=95.599999999999994)
# Pandas(Index=2, Year='Year 5', Gender='Male', Reading=91.299999999999997, Numeracy=94.400000000000006)
# Pandas(Index=3, Year='Year 5', Gender='Female', Reading=94.799999999999997, Numeracy=94.900000000000006)
# Pandas(Index=4, Year='Year 7', Gender='Male', Reading=93.700000000000003, Numeracy=94.700000000000003)
# Pandas(Index=5, Year='Year 7', Gender='Female', Reading=96.400000000000006, Numeracy=95.099999999999994)
# Pandas(Index=6, Year='Year 9', Gender='Male', Reading=88.900000000000006, Numeracy=93.400000000000006)
# Pandas(Index=7, Year='Year 9', Gender='Female', Reading=93.599999999999994, Numeracy=92.799999999999997)
for row in df.iterrows():
print(row)
# (0, Year Year 3
# Gender Male
# Reading 94
# Numeracy 94.5
# Name: 0, dtype: object)
# (1, Year Year 3
# Gender Female
# Reading 96.6
# Numeracy 95.6
# Name: 1, dtype: object)
# ...
#set index to other column
import string
df_au['num']= [x for x in string.lowercase[0:8]]
df_au =df_au.set_index('num')
# Year Cat Reading Numeracy
# num
# a Year 3 Australia 93.9 94.3
# b Year 3 NSW 95.3 95.0
# c Year 5 Australia 91.3 93.7
# d Year 5 NSW 93.0 94.7
# e Year 7 Australia 94.9 95.1
# f Year 7 NSW 95.0 94.9
# g Year 9 Australia 90.8 93.1
# h Year 9 NSW 91.2 93.1
#set the cell value: this will mutate the DF
df_au.set_value('d','Reading',99.89)
# Year Cat Reading Numeracy
# num
# ...
# c Year 5 Australia 91.30 93.7
# d Year 5 NSW 99.89 94.7
# e Year 7 Australia 94.90 95.1
# ...
#set cell using loc
df_au.loc['b','Reading'] = 96.0
# Year Cat Reading Numeracy
# num
# a Year 3 Australia 93.90 94.3
# b Year 3 NSW 96.00 95.0
# c Year 5 Australia 91.30 93.7
# ...
df_au.loc['i','Reading'] = 96.0 #be careful not to add to the wrong index
# Year Cat Reading Numeracy
# num
# ...
# g Year 9 Australia 90.80 93.1
# h Year 9 NSW 91.20 93.1
# Year 5 NaN NaN 90.00 NaN
# i NaN NaN 96.00 NaN
#set cell using iloc above cell
df_au.iloc[1,2] = 90.0
# Year Cat Reading Numeracy
# num
# a Year 3 Australia 93.90 94.3
# b Year 3 NSW 90.00 95.0
# c Year 5 Australia 91.30 93.7
# ...
#be careful not to add to the wrong place
df_au.iloc[8,2] = 90.0
# Year Cat Reading Numeracy
# num
# ...
# g Year 9 Australia 90.80 93.1
# h Year 9 NSW 91.20 93.1
# Year 5 NaN NaN 90.00 NaN
Written with StackEdit.
Comments
Post a Comment
commented your blog